Example #1
def data_check(file_name):
    '''
    This piece of code performs the following operations:
    1. Iterates through each datafile and checks whether new data
       was uploaded. There are situations where the webpage rejects
       the requests made by the scraper, which leads to missing
       data if not tracked correctly.
    2. Sends a notification to Slack.
    '''

    with open(
            '/Users/nikhilsawal/OneDrive/investment_portfolio/datafiles/{}'.
            format(file_name), 'rb') as inputfile:
        now = datetime.now()
        now = now.strftime('%Y-%m-%d %H')
        all_dates = []
        for item in json_lines.reader(inputfile):
            date_val = datetime.strptime(item['datetime'], '%Y-%m-%d %H:%M:%S')
            all_dates.append(date_val.strftime('%Y-%m-%d %H'))

        if now in all_dates:

            unicode = "\u2705"
            status = "Success"
            description = 'Data added to {}'.format(file_name)

        else:

            unicode = "\u274C"
            status = "Fail"
            description = 'Data NOT added to {}'.format(file_name)

    # Send Slack notifications
    hf.slack_msg("""
    ```
    datafile: {},
    status: {},
    description: {}
    ```
    """.format(unicode + file_name, status, description))
Example #2
def get_debug_for_cluster(repo, graph, cluster_uri):

    did = repo
    if graph:
        did = repo + '-' + re.sub('[^0-9a-zA-Z]+', '-', graph)

    # get debug file for repo/graph if hasn't been loaded
    if did not in debugs:
        debugs[did] = []
        debug_file = 'debug/' + did + '.jl'
        if os.path.isfile(debug_file):
            with open(debug_file, 'r') as f:
                for line in json_lines.reader(f):
                    debugs[did].append(line)
        else:
            return None

    entity_uri = cluster_uri.replace('-cluster', '')
    for debug in debugs[did]:
        if entity_uri in debug['all_records']:
            return debug
    return None  # not found
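A usage sketch, assuming the module-level debugs cache is a dict as the function expects; the repo name and cluster URI below are made up:

debugs = {}  # module-level cache that get_debug_for_cluster fills lazily

# Hypothetical lookup; returns None if no debug record mentions the entity
record = get_debug_for_cluster('myrepo', None, 'http://example.org/entity-42-cluster')
if record is None:
    print('no debug record found')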
Example #3
def _load_annotationsQA_R(annotations_jsonpath, split):
    """
    Build an index out of FOIL annotations, mapping each image ID with its corresponding captions.
    """
    entries = []
    with open(annotations_jsonpath, 'rb') as f:
        for annotation in json_lines.reader(f):
            if split == 'test':
                for answer in annotation["answer_choices"]:
                    question = annotation["question"] + ["[MARK]"] + answer
                    img_id = _converId(annotation["img_id"])
                    ans_label = 0
                    anno_id = int(annotation["annot_id"].split('-')[1])
                    entries.append({
                        "question": question,
                        "answers": annotation["rationale_choices"],
                        "metadata_fn": annotation["metadata_fn"],
                        "target": ans_label,
                        "img_id": img_id,
                        "anno_id": anno_id,
                        "det_names": annotation['objects']
                    })
            else:
                det_names = ""
                question = annotation["question"] + ["[MARK]"]  + \
                               annotation["answer_choices"][annotation['answer_label']]
                ans_label = annotation["rationale_label"]
                img_id = _converId(annotation["img_id"])
                anno_id = int(annotation["annot_id"].split('-')[1])
                entries.append({
                    "question": question,
                    "answers": annotation["rationale_choices"],
                    "metadata_fn": annotation["metadata_fn"],
                    "target": ans_label,
                    "img_id": img_id,
                    "anno_id": anno_id,
                    "det_names": annotation['objects']
                })
    return entries
Example #4
def get_multinli(data_path: str,
                 prefix: str,
                 suffix: str,
                 dataset: str,
                 genres: list = None) -> dict:
    path = os.path.join(data_path, prefix + dataset + suffix)

    labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

    with open(path) as f:
        data = [item for item in json_lines.reader(f)]

    s1, s2, label = [], [], []

    for entry in data:
        if genres is None or entry['genre'] in genres:
            if entry['gold_label'] in labels:
                s1.append(entry['sentence1'])
                s2.append(entry['sentence2'])
                label.append(labels[entry['gold_label']])

    return {'s1': s1, 's2': s2, 'label': label}
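For example, loading the MultiNLI training split might look like this; the path pieces are assumptions about how the files are named on disk:

# Hypothetical call: assumes a file like data/multinli_1.0_train.jsonl
train = get_multinli('data', 'multinli_1.0_', '.jsonl', 'train',
                     genres=['fiction', 'travel'])
print(len(train['label']), 'labelled pairs')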
Example #5
def indexing():
    #Reading the jsonl file
    with open('sample-1M.jsonl', 'rb') as f:
        i = 1

        # Index the first 4,999 documents (stop once i reaches 5000)
        for item in json_lines.reader(f):
            if i >= 5000:
                break

            # json_lines already yields each line as a parsed dict,
            # so it can be passed to Elasticsearch directly
            es.index(index='news_article',
                     doc_type='articles',
                     id=i,
                     body=item)
            i += 1
Example #6
def parse_json(input_file, output_file):
    with open(output_file, encoding='utf8', mode='w',
              newline='') as features_file:
        features_writer = csv.writer(features_file,
                                     delimiter=',',
                                     quotechar='',
                                     quoting=csv.QUOTE_NONE)
        features_writer.writerow(['Tweet_ID', 'Crowd_Label'])
        with open(input_file, 'rb') as f:
            for tweet_result in json_lines.reader(f):
                # Credible tweets
                if tweet_result['results']['sentiment']['agg'] == 'definetly_credible' \
                        or tweet_result['results']['sentiment']['agg'] == 'seems_credible':
                    features_writer.writerow(
                        [tweet_result['data']['tweet_id'], 1])
                # Not credible tweets
                elif tweet_result['results']['sentiment'][
                        'agg'] == 'definitely_not_credible':
                    features_writer.writerow(
                        [tweet_result['data']['tweet_id'], 0])
                # Skipping cannot decide and none
                else:
                    continue
Example #7
def loadJsonlData(file: str) -> list:
    '''
    Reads the data as saved in a .jsonl file
    
    Args:
    ----
    file: String corresponding to the path to a .jsonl file which contains the 
          tweets as received from the TwitterAPI.

    Returns:
    -------
    tweets: A list of all the data saved in the .jsonl file.
    '''

    tweets = []
    with open(file, 'rb') as f:
        for tweet in json_lines.reader(f, broken=True):
            try:
                tweets.append(tweet)
            # Catch both decode errors; `except A or B` would only catch A,
            # and UnicodeDecodeError is a builtin, not a json_lines attribute
            except (UnicodeDecodeError, json.JSONDecodeError):
                pass

        return tweets
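A small usage sketch with a placeholder path:

# Hypothetical file name; each element of `tweets` is one parsed JSON object
tweets = loadJsonlData('data/collected_tweets.jsonl')
print('loaded', len(tweets), 'tweets')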
Example #8
def index_ngram():
    print('Indexing ngram...')
    schema = Schema(id=ID(stored=True),
                    question=NGRAM(minsize=2, maxsize=7),
                    answer=NGRAM(minsize=2, maxsize=7))
    if not os.path.exists('index_ngram'):
        os.mkdir('index_ngram')
    ix = create_in('index_ngram', schema)
    writer = ix.writer()
    with open(PATH_QUESTION_ANSWER, 'r') as f:
        for qa in json_lines.reader(f):
            # print(qa['question'])
            # print(qa['answer'])
            # print('\n')
            if not convenion.is_valid_qa(qa):
                continue
            question = convenion.customize_and_remove_stopword(qa['question'])
            answer = convenion.customize_and_remove_stopword(qa['answer'])
            writer.add_document(id=qa['id_cmt'],
                                question=question,
                                answer=answer)
        print('Commit ngram...')
        writer.commit()
Example #9
def preprocess_ARC():
    if getpass.getuser() == 'Mitch':
        # directory on my computer
        head = 'C:/Users/Mitch/PycharmProjects'
    else:
        # directory on compute
        head = '/home/kinne174/private/PythonProjects'

    difficulties = ['Easy', 'Challenge']
    partitions = ['Train', 'Dev', 'Test']

    # all_filenames = ['ARC-' + '-'.join(dp) + '.jsonl' for dp in product(difficulties, partitions)]

    for d in difficulties:
        for p in partitions:
            output = []

            ARC_filename = 'ARC/ARC-V1-Feb2018-2/ARC-{}/ARC-{}-{}.jsonl'.format(d, d, p)
            dataset_filename = os.path.join(head, ARC_filename)

            if os.path.exists(dataset_filename):
                with open(dataset_filename, 'r', encoding='utf-8') as df:
                    for ind, item in enumerate(json_lines.reader(df)):
                        # build a fresh dict per record; reusing one dict would make
                        # every entry in `output` reference the same (last) item
                        data = {}
                        data['id'] = item['id']
                        data['question'] = item['question']['stem']
                        data['choices_text'] = [choice['text'] for choice in item['question']['choices']]
                        data['choices_labels'] = [choice['label'] for choice in item['question']['choices']]
                        data['answer'] = item['answerKey']

                        output += [data]

            else:
                raise Exception("Filename {} does not exist!".format(dataset_filename))

            with open(os.path.join(head, 'hf_transformers/data/{}-{}.json'.format(d, p)), 'w') as of:
                json.dump(output, of)
Example #10
    def preprocess_jsonl(self, input_file_path, max_token_num):
        """ handles reading data from input jsonl file and writing preprocessed data into a separate file
        Preprocessed data is in the json format: np array of {"sentence1":..., "sentence2":...,
        "gold_label": 1} semi sorted

        1. Extracting sentences and the gold label from the jsonl file, removing instances
           whose gold label is "-" from the dataset
        2. Prepending each sentence with the NULL token
        3. Padding the sentences to max_token_num tokens (sentences longer than 20 tokens are skipped)
        Args:
            input_file_path: path to file where the input jsonl is
            max_token_num: the number of tokens of the sentences we are adding padding to
        Returns:
            np array of new dictionaries  {"sentence1":..., "sentence2":..., "gold_label": [1, 0, 0]} semi sorted
        """
        data_list = []
        with open(input_file_path,
                  'rb') as input_file:  # opening file in binary(rb) mode
            for item in json_lines.reader(input_file):
                if item["gold_label"] != "-":  # Removing unlabeled data
                    new_item = {}
                    # Prepending sentences with the NULL token
                    token_array1 = ('\0 ' + item["sentence1"]).split()
                    token_array2 = ('\0 ' + item["sentence2"]).split()

                    if len(token_array1) <= 20 and len(token_array2) <= 20:
                        new_item["sentence1"] = self.pad_sentence(
                            token_array1, max_token_num)
                        new_item["sentence2"] = self.pad_sentence(
                            token_array2, max_token_num)
                        new_item["gold_label"] = self.GOLD_LABELS[item[
                            "gold_label"]]  # Converting gold label to vector representation
                        data_list.append(new_item)

            random.shuffle(data_list)
            return np.array(data_list)
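pad_sentence is not shown in this snippet; a minimal sketch of what it might do, assuming right-padding with the same '\0' NULL token up to max_token_num:

    def pad_sentence(self, token_array, max_token_num):
        # Hypothetical helper: right-pad the token list with the NULL token
        # so every sentence ends up with exactly max_token_num tokens
        padded = list(token_array[:max_token_num])
        padded += ['\0'] * (max_token_num - len(padded))
        return padded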
Example #11
    def parse(self, response):

        doc = response.css('body')
        names = []  # avoid a NameError if the URL is never found in the file
        with open('C:\\Users\\Ron\\git\\docSpider\\nnames.jl', 'rb') as f:
            for item in json_lines.reader(f):
                if response.url == item['url']:
                    names = item['result']['PERSON']

        for name in names:
            print(name)
            yield {
                'name':
                name,
                'text': (doc.xpath('//*[contains(text(), "' + name +
                                   '")]/../../*/*/text()').extract()),
                'url':
                response.url
            }


#        next_page = response.xpath('.//a[contains(@class, "header")]/@href').extract_first()
#        if next_page is not None:
#            next_page = response.urljoin(next_page)
#            yield scrapy.Request(next_page, callback=self.parse)
Example #12
def hashtagnetwork(filename, giant_component=False):
    """Generate Hashtag Network from Twitter data collection.

    Parameters:
    filename: path to jsonl twitter object to transform
    giant_component (boolean): keep only largest weakly connected component 
    
    Returns:
    igraph graph object: hashtag network where a link is created
    between i to j if i and j appear in the same tweet.
    """        
    edgelist = []
    with open(filename, 'rb') as f:
        for tweet in json_lines.reader(f):
            if len(tweet["entities"]["hashtags"]) > 1:
                cohashtags = []
                for element in tweet["entities"]["hashtags"]:
                    hashtag = element["text"]
                    cohashtags.append(hashtag)
                combs = list(combinations(cohashtags,2))
                for element in combs:
                    source = element[0]
                    target = element[1]
                    edgelist.append((source, target))

    H = ig.Graph.DictList(edges=(dict(source=source, target=target, weight=1) for source, target in edgelist), 
                          vertices=None, 
                          directed=False)

    if giant_component:
        H = H.components().giant()

    H.es['weight'] = 1
    #H = H.simplify(combine_edges=dict(weight="sum"))

    return H
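A usage sketch with a placeholder file name:

# Hypothetical input file of tweets in jsonl format
H = hashtagnetwork('tweets.jsonl', giant_component=True)
print(H.summary())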
Example #13
def readData(file_name, word2vec=w2v, load_w2v=True):
    """
    Read the data. Filter out those neutral data.
    For a datum that has multiple labels, select the most frequent one as its label.

    Input: (str) file_name
    Output: (list) premise-hypothesis pairs
    """
    print "Reading data file %s..." % file_name

    ph_pairs = []
    label_cnt = {'entailment': 0, 'contradiction': 0, 'neutral': 0}
    corpus_dict = Lang('en')
    with open(file_name, 'rb') as f:
        for item in jsonl.reader(f):
            p = normalizeString(item['sentence1'])
            h = normalizeString(item['sentence2'])
            l = getLabel(item['annotator_labels'])
            label_cnt[l] += 1
            datum = phPair(p, h, l)
            if datum.label != 'neutral':
                ph_pairs.append(datum)
                corpus_dict.addSentence(p)
                corpus_dict.addSentence(h)

    print "Loading dataset completed !"
    print "Loading word2vec model..."

    glove = np.zeros((5,5))
    if load_w2v:
        glove = load_pretrained_embedding(corpus_dict, word2vec)
        print "Loading word2vec done!"

    print "Courpus used %d words" % corpus_dict.n_words
    print "Data distributions: %s" % label_cnt
    return ph_pairs, corpus_dict, glove
Example #14
import json_lines as jl
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-f', nargs='*')
args = parser.parse_args()

for arg in args.f:
    # Open the output file once per input file instead of on every line,
    # and let the with-block close both handles.
    with open(arg, 'rb') as f, open('dic.txt', 'a') as txt:
        for line in jl.reader(f):
            txt.write(line['vin'] + '\n')
Example #15
from bs4 import BeautifulSoup
import json_lines
import json

f = open("PsychologyToday_All.jl")

counter = 0
lister = []

fp = open("ParsedPsychologyToday.jl", "w")

for i in json_lines.reader(f):

    counter += 1
    print(counter)

    dicto = {}

    #url for webpage
    dicto["url"] = i["url"]

    #Group title
    soup = BeautifulSoup(i["raw_content"], "lxml")
    if len(soup.find_all(attrs={"class": "groups-section"})) > 0:
        x = BeautifulSoup(
            str(soup.find_all(attrs={"class": "groups-section"})[0]), "lxml")
        x = BeautifulSoup(str(x.find_all(attrs={"class": "group-title"})[0]),
                          "lxml")
        if x.h2 != None:
            dicto["group_name"] = x.h2.text
        elif x.h3 != None:
Example #16
try:
    os.mkdir(options.opath + "/preprocessing_data/preprocess/" +
             urlprefixdomain + "/w2p/bitextorlang")
except FileExistsError:
    pass

try:
    os.mkdir(options.opath + "/preprocessing_data/preprocess/" +
             urlprefixdomain + "/w2p/bitextorlang/" + options.lang)
except FileExistsError:
    pass

outputpath = options.opath + "/preprocessing_data/preprocess/" + urlprefixdomain + "/w2p/bitextorlang/" + options.lang

json_file = gzip.open(options.newsfile, "rb")
with lzma.open(outputpath + "/url.xz", 'w') as urlfile, lzma.open(
        outputpath + "/plain_text.xz",
        'w') as bodyfile, lzma.open(outputpath + "/date.xz", 'w') as datefile:
    for newspiece in json_lines.reader(json_file):

        body = base64.b64encode(
            str.encode(newspiece["headline"] + "\n" + newspiece["body"]))
        date = dateutil.parser.parse(newspiece["firstPublished"])
        url = options.urlprefix + newspiece["id"]

        urlfile.write(str.encode(url + "\n"))
        datefile.write(
            str.encode("%04d%02d%02d\n" % (date.year, date.month, date.day)))
        bodyfile.write(str.encode(body.decode("utf-8") + "\n"))
Example #17
def identifyClasses(pos_lemmaTagged_factFile, clusters_factFile):
    classifyVerbFileName = getFileNamePart(clusters_factFile,
                                           '.json') + "_classifyVerb.json"
    annotatedVerbFileName = getFileNamePart(clusters_factFile,
                                            '.json') + "_annotatedVerb.json"
    clustersDict = {}
    entityClusterDict = {}
    with open(clusters_factFile, 'r') as clusters_File:
        print("Reading Clusters...")
        clustersDict = json.load(clusters_File)
        pprint(clustersDict)
        for clusterNo in clustersDict:
            cluster_name = 'cluster_' + clusterNo
            for clusterItem in clustersDict[clusterNo]:
                entityClusterDict[clusterItem] = clusterNo  #cluster_name
    #pprint(entityClusterDict)

    clusterVerbDict = {}
    posLemmaVerbDict = {}
    # clusters_factFile was already read above, so only the POS/lemma file is needed here
    with open(pos_lemmaTagged_factFile, 'r') as pos_lemmaFile:
        for item in json_lines.reader(pos_lemmaFile):
            itemKeys = item.keys()
            if item['isFact']:
                pos_nn = ""
                if 'POS_NN' in itemKeys:
                    pos_nn = item['POS_NN']
                pos_nnp = ""
                if 'POS_NNP' in itemKeys:
                    pos_nnp = item['POS_NNP']
                lemma_verb = ""
                if 'Lemma_Verb' in itemKeys:
                    lemma_verb = item['Lemma_Verb']
                pos_verb = ""
                if 'POS_Verb' in itemKeys:
                    pos_verb = item['POS_Verb']
                #print(pos_nnp, pos_verb, pos_nn, lemma_verb)
                posLemmaVerbDict[pos_verb] = lemma_verb
                clusterVerb = entityClusterDict.get(pos_verb, None)
                clusterNN = entityClusterDict.get(pos_nn, None)
                clusterNNP = entityClusterDict.get(pos_nnp, None)
                #print(clusterNNP, clusterVerb, clusterNN)
                if clusterVerb:
                    relationTuple = clusterNNP + ':' + clusterNN
                    if clusterVerb in clusterVerbDict:
                        clusterVerbDict[clusterVerb].add(relationTuple)
                    else:
                        clusterVerbDict[clusterVerb] = set([relationTuple])
    print("Cluster Mapping Relation")
    pprint(clusterVerbDict)
    print()
    resultDict = {}
    for clusterVerbId in clusterVerbDict:
        #print(clusterVerbId)
        verbDetails = {}
        verbName = 'verb_' + clusterVerbId
        verbData = clustersDict[clusterVerbId]
        for mapping in clusterVerbDict[clusterVerbId]:
            clustersMapped = mapping.split(':')
            setA = 'set_A'
            setA_Index = clustersMapped[0]
            setA_Data = clustersDict[setA_Index]
            setB = 'set_B'
            setB_Index = clustersMapped[1]
            setB_Data = clustersDict[setB_Index]
            verbDetails['data'] = verbData
            verbDetails['lemma_data'] = getLemmaVerbData(
                posLemmaVerbDict, verbData)
            verbDetails[setA] = setA_Data
            verbDetails[setB] = setB_Data
            resultDict[verbName] = verbDetails
    pprint(resultDict)
    resultFilePath = "verbMapping/" + classifyVerbFileName
    with open(resultFilePath, 'w') as outFile:
        json.dump(resultDict, outFile, indent=4)
    print("verbMapping results found in {}".format(resultFilePath))

    annotateVerbs(resultDict, annotatedVerbFileName)
Example #18
def constructor_graph(f):
    g = myGraph()
    actor_list = []
    actor_name = set()
    movie_list = []
    vertices = []
    edges = []
    items = {}
    total_edge = 0
    for item in json_lines.reader(f):
        # def add_vertex_to_graph(self, name, age, gross, date, page):
        if ('actor_name' in item):
            # print(item['actor_name'], item['actor_age'])
            actor_list.append(item)
            actor_name.add(item['actor_name'])
            g.add_vertex_to_graph(item['actor_name'], item['actor_age'], None,
                                  None, item['page'], False)
            actor_detail = "Actor: " + item['actor_name'] + "\nAge: " + str(
                item['actor_age'])
            items[item['actor_name']] = len(vertices)
            vertices.append(actor_detail)
        else:
            movie_list.append(item)
            g.add_vertex_to_graph(item['name'], None, item['gross'],
                                  item['date'], item['page'], True)
            movie_detail = "Movie: " + item['name'] + "\n Total Gross: " + str(
                item['gross'])
            items[item['name']] = len(vertices)
            vertices.append(movie_detail)

    for m in movie_list:
        gross = m['gross']
        movie_name = m['name']
        i = 1
        for actor in m['actors']:
            cur_name = actor[actor.rfind("/") + 1:].replace("_", " ")
            if cur_name not in actor_name:
                continue
            edge_weight = gross * (1 + i * 0.0001)
            i += 1
            g.add_edge(cur_name, movie_name, edge_weight)
            total_edge += 1
            edges.append((items[cur_name], items[movie_name]))

    global total_actor
    global total_movie
    total_actor = len(actor_list)
    total_movie = len(movie_list)

    print(total_edge)
    graph = Graph(vertex_attrs={"label": vertices},
                  edges=edges,
                  directed=False)
    Graph.write_svg(graph,
                    fname="graph_cache.svg",
                    labels='label',
                    colors="blue",
                    vertex_size=3,
                    edge_colors=["yellow"] * 1000,
                    font_size="4")
    return g
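A usage sketch, assuming the scraped actor and movie records live in a .jl file (placeholder name):

# Hypothetical input file containing the actor/movie records
with open('movies_and_actors.jl', 'rb') as f:
    g = constructor_graph(f)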
Example #19
import json_lines
import tensorflow as tf
import gensim
import numpy as np
import jsonlines
import pickle
from random import shuffle

# data_file = "./resources/stub.jsonl"
data_file = "./resources/all.jsonl"
data = json_lines.reader(open(data_file))

vocab = dict()

max_len = 0
max_ind = 0
record = None
count = 0
tot = 0

for sample in data:
    sentence1 = sample.get("sentence1").strip(".").split()
    sentence2 = sample.get("sentence2").strip(".").split()
    tot = tot + len(sentence1) + len(sentence2)

    if len(sentence1) > max_len:
        max_len = len(sentence1)
        record = sentence1
    if len(sentence2) > max_len:
        max_len = len(sentence2)
        record = sentence2
Example #20
import json_lines
import gensim
import numpy as np

# Defining constants
node_number = 32
batch_size = 1
embedding_dim = 300
class_num = 3

# Defining file names
train_data_file_name = "./resources/snli_1.0_dev.jsonl"
f = open(train_data_file_name)
embedding_file_name = "./resources/temp.bin"

# word_dict = gensim.models.KeyedVectors.load_word2vec_format(embedding_file_name, binary=True)
file = json_lines.reader(f)

content = []
for line in file:
    content.append(line)


def read_data(lines: list):
    labels = []
    inputs = []
    for line in lines:
        sentence1 = []
        sentence2 = []
        label_text = line.get("gold_label")
        if label_text == '-':
            continue
Example #21
    # init db -> this means creating it from scratch
    if os.path.exists(master_kb_text_dir):
        os.remove(master_kb_text_dir)
    con = sq.connect(master_kb_text_dir)
    query = 'CREATE TABLE master_kb_text (idx real, authors text, document_nm text, paragraph text, url text, verse_references text)'
    con.execute(query)
    con.commit()

    # start process loop
    vectorised_master_kb = []
    k, idx = 0, 0
    for jsonl_kbs in jsonl_kbs_to_process:

        with open(jsonl_kbs, 'rb') as jsonl_kb_file:
            for json_line in json_lines.reader(jsonl_kb_file):

                # log
                k += 1
                if k % 10 == 0:
                    print(f"{datetime.datetime.now()} - {k}")

                if filter_jsonl(json_line):
                    # if True:
                    #       (1) encode the context and save numpy array
                    #       (2) save the paragraph in sqlite
                    # print("\n"+json_line['paragraph']+"\n")

                    # 1. encode context
                    vectorised_master_kb.append(
                        model.predict(json_line['paragraph'],
Example #22
    def __init__(self, jsonl_path, mode=None):
        self.mode = mode
        self.raw = []
        self.lst = []
        self.refs = []
        if mode == 'test':
            lst = json.load(open(jsonl_path, 'r'))
            for item in lst:
                context = item['context']
                dialog = []
                for utts in context:
                    p = utts.find(':')
                    dialog.append(
                        ((utts[p - 1] == 'A') * 2 - 1, utts[p + 2:-1], 0))

                if dialog[0][1][-1] == '>':
                    dialog = dialog[1:]

                if len(dialog) == 0:
                    continue

                responses = []
                for resp in item['responses']:
                    #if resp[0] == ')':
                    #    resp = resp[2:]
                    responses.append(resp)

                spk = (item['speaker'] == 'A') * 2 - 1
                dialog.append((spk, responses[0], 0))
                responses = responses[1:]
                responses = [
                    ' '.join(WordPunctTokenizer().tokenize(resp))
                    for resp in responses
                ]

                if len(responses) == 0:
                    continue

                self.raw.append(dialog)
                self.lst.append((len(self.raw) - 1, 0, len(dialog)))
                self.refs.append(responses)

            return

        from collections import Counter
        self.ct = Counter()
        self.topics = []
        with open(jsonl_path, 'r') as f:
            for idx, item in enumerate(reader(f)):
                utts = item['utts']
                self.topics.append(item['topic'])
                self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _)
                                 for speaker, sentence, _ in utts])

                lst = [(idx, start, start + wnd_sz)
                       for start in range(0, len(utts)-wnd_sz)] + \
                      [(idx, 0, end)
                       for end in range(2, min(wnd_sz+1, len(utts)))]

                #for i, start, end in lst:
                #    spk_lst = self.raw[idx][end-2][0]
                #    spk_tgt = self.raw[idx][end-1][0]
                #
                #    self.lst.append((i, start, end))
                self.lst += lst

        self.refs = [['none']] * len(self.lst)
Example #23
    def __init__(self, file_name):
        self.file = json_lines.reader(open(file_name))
Example #24
def retweetnetwork(filename,
                   giant_component=False, 
                   privacy=False,
                   aggregation=None,
                   t=0,
                   starttime=None,
                   endtime=None):
    """Generate Retweet Network from Twitter data collection.

    Parameters:
    filename: path to jsonl twitter object to transform
    giant_component (boolean): keep only largest weakly connected component 
    aggregation (str): aggregation method to use ('soft', 'hard', 'None')
    privacy: 
    t (int): threshold for hard aggregation
    
    Returns:
    igraph graph object: retweet network where a link is created
    from i to j if i retweeted j.
    """    
    with open(filename, 'rb') as f:    
        nodesdict = {}
        edgelist = []
        d3graph = {"nodes": [], "links": []}
        
        for tweet in (json_lines.reader(f)):
            if 'retweeted_status' in tweet:
                
                time = tweet["created_at"]
                time = datetime.strptime(time,'%a %b %d %X %z %Y')
                time_date = time.date()

                if starttime <= time_date <= endtime:

                    # retweeting node [source of retweet action]                
                    name = tweet["user"]["screen_name"]
                    try:
                        nodesdict[f"{name}"]["followers"] = tweet["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet["user"]["friends_count"]
                    except KeyError:
                        nodesdict[f"{name}"] = {}
                        nodesdict[f"{name}"]["followers"] = tweet["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet["user"]["friends_count"]
                    try:
                        nodesdict[f"{name}"]["tweets"].append(tweet["id_str"])
                    except KeyError:
                        nodesdict[f"{name}"]["tweets"] = []
                        nodesdict[f"{name}"]["tweets"].append(tweet["id_str"])
                    
                    # retweeted node [target of retweet action]
                    name = tweet['retweeted_status']["user"]["screen_name"]
                    try:
                        nodesdict[f"{name}"]["followers"] = tweet['retweeted_status']["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet['retweeted_status']["user"]["friends_count"]
                    except KeyError:
                        nodesdict[f"{name}"] = {}
                        nodesdict[f"{name}"]["followers"] = tweet['retweeted_status']["user"]["followers_count"]
                        nodesdict[f"{name}"]["friends"] = tweet['retweeted_status']["user"]["followers_count"]
                    try:
                        nodesdict[f"{name}"]["tweets"].append(tweet['retweeted_status']["id_str"])
                    except KeyError:
                        nodesdict[f"{name}"]["tweets"] = []
                        nodesdict[f"{name}"]["tweets"].append(tweet['retweeted_status']["id_str"])
                                    
                    # links
                    source   = tweet["user"]["screen_name"]                
                    target   = tweet['retweeted_status']['user']['screen_name']
                    tweetid  = tweet["id_str"]
                    time_str = time.isoformat(timespec='seconds')
                    edgelist.append((source, target, tweetid, time_str))
                
            
    #print("Importing to igraph...")
    # import to igraph
    G = ig.Graph.DictList(edges=(dict(source=source, target=target, tweet=tweetid,time=time, weight=1) for source, target, tweet, time in edgelist), 
                          vertices=None, 
                          directed=True)
   
    # add node metadata
    for v in G.vs:
        name = v['name']
        v['followers'] = nodesdict[name]['followers']
        v['friends'] = nodesdict[name]['friends']
        v['tweets'] = list(set(nodesdict[name]['tweets']))
    
    #print("Running giant component and aggregations...")
    
    # giant_component == False and aggregation == None
    if giant_component == False and aggregation == None:
        pass
        
    # giant_component == True and aggregation == None
    elif giant_component == True and aggregation == None:
        G = G.components(mode="weak").giant()
    
    # giant_component == False and aggregation == 'hard'
    elif giant_component == False and aggregation == 'hard':
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") <= t:
                todel.append(v.index)
        #print("Deleting vertices")
        G.delete_vertices(todel)
        #G = G.components(mode="weak").giant()
    
    # giant_component == True and aggregation == 'hard'
    elif giant_component == True and aggregation == 'hard':
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") <= t:
                todel.append(v.index)
        #print("Deleting vertices")
        G.delete_vertices(todel)
        G = G.components(mode="weak").giant()
    
    # giant_component == False and aggregation == 'soft'
    elif giant_component == False and aggregation == 'soft':
        #G = G.components(mode="weak").giant()
        todel = []
        for v in G.vs:
            if G.degree(v, mode="in") == 0 and len(set(G.neighbors(v, mode="out"))) < 2:
                todel.append(v.index)
        #print("Deleting vertices")
        G.delete_vertices(todel)
        
            
    # giant_component == True and aggregation == 'soft'
    elif giant_component == True and aggregation == 'soft':
        G = G.components(mode="weak").giant()
        todel = []  
        for v in G.vs:
            if G.degree(v, mode="in") == 0 and len(set(G.neighbors(v, mode="out"))) < 2:
                todel.append(v.index)
        G.delete_vertices(todel)
    
    return G
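One way this might be called, with placeholder file name and date window (starttime/endtime need to be datetime.date objects for the comparison above):

from datetime import date

# Hypothetical inputs
G = retweetnetwork('tweets.jsonl',
                   giant_component=True,
                   aggregation='hard',
                   t=1,
                   starttime=date(2020, 1, 1),
                   endtime=date(2020, 12, 31))
print(G.vcount(), 'nodes,', G.ecount(), 'edges')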
Example #25
def preprocess():
    ontology = {
        'domains': {},
        'intents': {},
        'binary_dialogue_act': [],
        'state': {}
    }

    def process_dialog(ori_dialog, split, dialog_id):
        domain = ori_dialog['domain']
        ontology['domains'][domain] = {
            'description': "",
            'slots': {}
        }
        dialog = {
            "dataset": dataset,
            "data_split": split,
            "dialogue_id": f'{dataset}_{dialog_id}',
            "original_id": ori_dialog['id'],
            "domains": [domain],
        }
        turns = []
        # starts with system
        for utt_idx, utt in enumerate(ori_dialog['turns'][1:]):
            turn = {
                'utt_idx': utt_idx,
                'utterance': utt,
                'dialogue_act': {
                    'categorical': [],
                    'non-categorical': [],
                    'binary': [],
                },
            }
            if utt_idx % 2 == 0:
                turn['speaker'] = 'user'
                turn['state'] = {}
                turn['state_update'] = {
                    'categorical': [],
                    'non-categorical': [],
                }
            else:
                turn['speaker'] = 'system'
            turns.append(turn)
        if turns[-1]['speaker'] == 'system':
            turns.pop()

        dialog['turns'] = turns
        return dialog

    dialog_id = 0
    data = []
    with ZipFile(os.path.join(origin_data_dir, 'metalwoz-v1.zip')) as zipfile:
        for path in zipfile.namelist():
            if path.startswith('dialogues'):
                for dialog in json_lines.reader(zipfile.open(path)):
                    data.append(process_dialog(dialog, 'train', dialog_id))
                    dialog_id += 1

    ZipFile(os.path.join(origin_data_dir, 'metalwoz-test-v1.zip')).extract('dstc8_metalwoz_heldout.zip')
    with ZipFile(os.path.join('dstc8_metalwoz_heldout.zip')) as zipfile:
        for path in zipfile.namelist():
            if path.startswith('dialogues'):
                for dialog in json_lines.reader(zipfile.open(path)):
                    data.append(process_dialog(dialog, 'test', dialog_id))
                    dialog_id += 1
    os.remove('dstc8_metalwoz_heldout.zip')

    json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w'))
    json.dump(data, open('data.json', 'w'), indent=4)
    ZipFile(os.path.join(self_dir, 'data.zip'), 'w', ZIP_DEFLATED).write('data.json')
    os.remove('data.json')
Example #26
            exit(0)

        output_file = str(sys.argv[1]) + '.jsonl'
        print('\n')  # start from next line

        with jsonlines.open(output_file, mode='w') as writer:

            print(parameters.file_open_msg.format(output_file))
            print('\n')  # start from next line

            for file_name in list_files:
                print(parameters.add_content_msg.format(file_name))
                total_count = 0

                with open(file_name, 'rb') as file:
                    for item in json_lines.reader(file):
                        total_count = total_count + 1

                print(parameters.file_len_msg.format(file_name, total_count))
                progress = 0

                with jsonlines.open(file_name) as reader:
                    for item in reader:
                        writer.write(item)
                        progress = progress + 1
                        print(parameters.file_read_prog.format(
                            progress, total_count),
                              end='\r')
                    reader.close()
                    print('\n')  # start from next line
Example #27
def get_qa_info(difficulty, subset, special='', limit=0):
    limit_bool = bool(limit)  # the built-in bool is enough; np.bool is deprecated
    if getpass.getuser() == 'Mitch':
        header = r'C:\Users\Mitch\PycharmProjects\ARC'
    else:
        header = '/home/kinne174/private/PythonProjects/ARC'

    if special == 'MOON':
        if getpass.getuser() == 'Mitch':
            MOON_filename = os.path.join(header,
                                         r'visualization\moon_questions.json')
        else:
            MOON_filename = '/home/kinne174/private/PythonProjects/JSM_2019/moon/moon_questions.json'
        MOON_allinfo = []
        MOON_document = namedtuple(
            'MOON_document', 'id question choices_text choices_labels answer')

        with open(MOON_filename, 'rb') as f:
            for dic in json_lines.reader(f):
                # only one dict for some reason
                for id, item in dic.items():
                    question = item['question']
                    choices_text = item['choices_text']
                    choices_labels = item['choices_labels']
                    answer = item['answer']

                    MOON_allinfo.append(
                        MOON_document(id, question, choices_text,
                                      choices_labels, answer))

        return MOON_allinfo

    if difficulty == 'EASY':
        if subset == 'TRAIN':
            EASY_TRAIN_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Train.jsonl'
            EASY_TRAIN_allinfo = []
            EASY_TRAIN_document = namedtuple(
                'EASY_TRAIN_document',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_TRAIN_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    EASY_TRAIN_allinfo.append(
                        EASY_TRAIN_document(id, question, choices_text,
                                            choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return EASY_TRAIN_allinfo
        elif subset == 'DEV':
            EASY_DEV_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Dev.jsonl'
            EASY_DEV_allinfo = []
            EASY_DEV_document = namedtuple(
                'EASY_DEV_document',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_DEV_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    EASY_DEV_allinfo.append(
                        EASY_DEV_document(id, question, choices_text,
                                          choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return EASY_DEV_allinfo
        else:
            EASY_TEST_filename = r'ARC-V1-Feb2018-2\ARC-Easy\ARC-Easy-Test.jsonl'
            EASY_TEST_allinfo = []
            EASY_TEST_document = namedtuple(
                'EASY_TEST_allinfo',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, EASY_TEST_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    EASY_TEST_allinfo.append(
                        EASY_TEST_document(id, question, choices_text,
                                           choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return EASY_TEST_allinfo
    else:
        if subset == 'TRAIN':
            CHALLENGE_TRAIN_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Train.jsonl'
            CHALLENGE_TRAIN_allinfo = []
            CHALLENGE_TRAIN_document = namedtuple(
                'CHALLENGE_TRAIN_allinfo',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_TRAIN_filename),
                      'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    CHALLENGE_TRAIN_allinfo.append(
                        CHALLENGE_TRAIN_document(id, question, choices_text,
                                                 choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return CHALLENGE_TRAIN_allinfo
        elif subset == 'DEV':
            CHALLENGE_DEV_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Dev.jsonl'
            CHALLENGE_DEV_allinfo = []
            CHALLENGE_DEV_document = namedtuple(
                'CHALLENGE_DEV_allinfo',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_DEV_filename), 'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    CHALLENGE_DEV_allinfo.append(
                        CHALLENGE_DEV_document(id, question, choices_text,
                                               choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return CHALLENGE_DEV_allinfo
        else:
            CHALLENGE_TEST_filename = r'ARC-V1-Feb2018-2\ARC-Challenge\ARC-Challenge-Test.jsonl'
            CHALLENGE_TEST_allinfo = []
            CHALLENGE_TEST_document = namedtuple(
                'CHALLENGE_TEST_allinfo',
                'id question choices_text choices_labels answer')

            with open(os.path.join(header, CHALLENGE_TEST_filename),
                      'rb') as f:
                for item_no, item in enumerate(json_lines.reader(f)):
                    id = item['id']
                    question = item['question']['stem']
                    choices_text = [
                        choice['text']
                        for choice in item['question']['choices']
                    ]
                    choices_labels = [
                        choice['label']
                        for choice in item['question']['choices']
                    ]
                    answer = item['answerKey']

                    CHALLENGE_TEST_allinfo.append(
                        CHALLENGE_TEST_document(id, question, choices_text,
                                                choices_labels, answer))

                    if limit_bool and item_no > limit:
                        break
            return CHALLENGE_TEST_allinfo
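A usage sketch for the loader above; the argument values are placeholders that match the branches in the function:

# Hypothetical call: load up to 100 easy training questions
easy_train = get_qa_info('EASY', 'TRAIN', limit=100)
print(len(easy_train), 'documents')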
Example #28
import json_lines
import time

with open('nguoiduatin_phapluat.json', 'rb') as f:
    for item in json_lines.reader(f):
        print(item['content'])
        time.sleep(5)


Example #29
def read_jsonl_file(jsonl_fn):
    with open(jsonl_fn, 'rb') as f:
        for item in json_lines.reader(f):
            yield item
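Because this returns a generator, the file is read lazily, one line at a time; a small sketch with a placeholder path:

# Hypothetical file name
for record in read_jsonl_file('events.jsonl'):
    print(record)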
Example #30
import pandas as pd
import json_lines as jl
from sys import argv
import logging

if len(argv) < 3:
    logging.critical(
        'Not enough parameters passed. Run script as:\npython postprocess.py [input-file] [output-directory]'
    )
    exit(1)

parcel_items = []
building_items = []

with open(argv[1], 'rb') as f:  # read from the input file passed on the command line
    for item in jl.reader(f):
        if not item:
            #pass over empty dictionary
            continue
        elif 'property_address' in item:
            pass
        elif 'use_code' in item:
            pass
        elif 'millage_rate' in item:
            pass
        elif 'owner' in item:
            pass
Example #31
    temp_word = f.read()
    temp_word = re.sub('##', '', temp_word)
    print(temp_word)
    vocab = temp_word.split()

#print(vocab)

# load the NER tagger
tagger = SequenceTagger.load('ner')

path = r'C:\Users\Luca\Desktop\Current Projects\DMT HW 3\DataSet'

new_data = []
with open(path + '\\paper_dev.jsonl',
          'rb') as f:  # opening file in binary (rb) mode
    for index, item in enumerate(json_lines.reader(f)):
        # discard all the label = Not enough info
        if item['label'] == 'NOT ENOUGH INFO':
            continue
        print('Index--->', index)
        print()
        # We are not interested in the evidence key
        del item['evidence']
        # run NER over sentence
        sentence = Sentence(copy.deepcopy(item['claim']))
        tagger.predict(sentence)

        # We are only interested in single labels for the whole claim
        sentence = sentence.to_dict(tag_type='ner')
        sentence['labels']