Example #1
    def preprocessing(self):
        filenames = os.listdir(self.prefix)
        images = os.listdir(self.img_path)

        for file in filenames:
            with open(os.path.join(self.prefix, file), 'r') as f:
                for item in jsonlines.Reader(f):
                    if item['file_name'] in images:
                        self.valid_files_num += 1
                        self.items.append(item)
                        for sentence in item['annotations']:
                            for character in sentence:
                                self.classes.add(character['text'])
                    else:
                        print("Invalid:", item['file_name'])
        self.classes_num = len(self.classes) + 1  # TODO: the +1 reserves a slot for the 'ignore' class appended below
        self.classes = list(self.classes)
        self.classes.append('ignore')
Example #2
def load_vector_map(file_path):
    entity_vector = {}
    width = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for item in jsonlines.Reader(f):
            words = ""
            vector = []
            for feature in item["features"]:
                if feature["token"] == "[CLS]" or feature["token"] == "[SEP]":
                    continue
                else:
                    words += feature["token"]
                    vector.append(feature["layers"][0]["values"])
                    width = len(feature["layers"][0]["values"])
            entity_vector[words] = np.mean(vector, axis=0)

    return entity_vector, width
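
For reference, a minimal sketch of the record shape this loader assumes; the field names are taken from the code above, while the tokens and vectors are purely illustrative:

# Hypothetical input line for load_vector_map (tokens and values are made up)
example_item = {
    "features": [
        {"token": "[CLS]", "layers": [{"values": [0.0, 0.0]}]},
        {"token": "foo", "layers": [{"values": [0.1, 0.2]}]},
        {"token": "bar", "layers": [{"values": [0.3, 0.4]}]},
        {"token": "[SEP]", "layers": [{"values": [0.0, 0.0]}]},
    ]
}
# For this record, load_vector_map would map the key "foobar" to the
# element-wise mean [0.2, 0.3] and return width 2.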
Example #3
def read_jsonl_gz(path):
    with jsonlines.Reader(gzip.open(path)) as reader:
        raw_politician_tweets = list(reader)

    tweet_df = pd.DataFrame(
        data={
            'tweet': [t['full_text'] for t in raw_politician_tweets],
            'author':
            [t['user']['screen_name'] for t in raw_politician_tweets],
            'date': [
                parse_twitter_datetime(t['created_at'])
                for t in raw_politician_tweets
            ],
            'id': [t['id'] for t in raw_politician_tweets]
        })
    tweet_df = tweet_df.set_index('id')

    return tweet_df
Example #4
def main(args):
    ## dummy line for dependency file testing
    lines = ['1', '2', '3']
    reader = jsonlines.Reader(lines)
    print('-----------requirements.txt test start----------------')
    for obj in reader:
        print(obj, "\n")
    print('------------requirements.txt test end -----------------')
    ##########################################

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum)
    callbacks = []

    #callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(
        ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
    logging.info("Starting training")
    model.fit(train_dataset,
              steps_per_epoch=(num_examples_per_epoch('train') //
                               args.batch_size),
              epochs=args.epochs,
              validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') //
                                args.batch_size),
              callbacks=callbacks)

    score = model.evaluate(eval_dataset,
                           steps=num_examples_per_epoch('eval') //
                           args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    #return save_model(model, args.model_dir)

    return save_model(model, args.model_output_dir)
Example #5
    def detect_face(self, image_list_file, image_with_landmark_list_file):
        # detect
        frames = []
        landmarks = []
        probs = []
        boxes = []
        image_infos = []
        with open(image_list_file) as fin:
            for image_info in tqdm(jsonlines.Reader(fin)):
                image_infos.append(image_info)
        for image_info in tqdm(image_infos):
            frame = cv2.imread(image_info["file_path"])
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)

            # When batch is full, detect faces and reset batch list
            if len(frames) >= self.batch_size:
                batch_boxes, batch_probs, batch_landmarks = self.mtcnn.detect(frames, landmarks=True)
                probs.extend(batch_probs)
                landmarks.extend(batch_landmarks)
                boxes.extend(batch_boxes)
                frames = []

        if len(frames):
            batch_boxes, batch_probs, batch_landmarks = self.mtcnn.detect(frames, landmarks=True)
            probs.extend(batch_probs)
            landmarks.extend(batch_landmarks)
            boxes.extend(batch_boxes)

        # save
        with jsonlines.open(image_with_landmark_list_file, 'w') as fout:
            for image_info, prob, box, landmark in zip(image_infos, probs, boxes, landmarks):
                fail_to_detect = landmark is None
                try:
                    fout.write({"file_path": image_info["file_path"],
                                "identity_name": image_info["identity_name"],
                                "identity_id": image_info["identity_id"],
                                "prob": 'none' if fail_to_detect else prob[0].tolist(),
                                "box": 'none' if fail_to_detect else box[0].tolist(),
                                "landmark": 'none' if fail_to_detect else landmark[0].tolist()
                                })
                except Exception:  # log the problematic detection instead of aborting the whole export
                    print(type(prob), prob, image_info["file_path"])
Example #6
    def execute(self, cmdopts):
        cmd = [APISH.BIN] + cmdopts
        self.log.info("APISH CALL: %s" % ' '.join(cmd))

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        try:
            c_out, c_err = proc.communicate(
                timeout=self.opts_driver['timeout'])

        except subprocess.TimeoutExpired:
            proc.kill()
            c_out, c_err = proc.communicate()

        if proc.returncode:
            raise RuntimeError('FAIL: %s' % ' '.join(cmd), c_out, c_err)

        return list(jsonlines.Reader(BytesIO(c_out)))
Example #7
    def _load_jsonlines(self, dataset):
        """
        Function to encapsulate a dataset into a jsonlines Reader object.

        Args:
            dataset (HTTPResponse): HTTPResponse encapsulating a dataset
        Returns:
            jsonlines.Reader: JSON Lines Reader object wrapping a HTTPResponse object
                              containing the full dataset

        Raises:
            TypeError: Invalid data was supplied to the function
        """
        try:
            reader = jsonlines.Reader(dataset)
        except TypeError:
            _logger.error("Invalid data supplied to function")
            raise
        return reader
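
A minimal usage sketch, assuming any readable file-like object works in place of the HTTPResponse (io.BytesIO stands in here and the data is illustrative):

import io

import jsonlines

fake_response = io.BytesIO(b'{"a": 1}\n{"a": 2}\n')  # stand-in for an HTTPResponse
reader = jsonlines.Reader(fake_response)  # the same call _load_jsonlines wraps
print([obj["a"] for obj in reader])  # [1, 2]
reader.close()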
Example #8
def deal_dict(file1, file2):
    f1 = open(file1, 'r', encoding='utf-8')
    f2 = open(file2, 'w', encoding='utf-8')
    items = jsonlines.Reader(f1)  # read the file line by line
    final = []
    for item in items:
        temp = []
        str1 = item["more"]
        for idx, ch in enumerate(str1):
            if ch == "【":
                temp.append(str1[idx + 1])
        temp = set(temp)
        new_list = [i for i in temp]
        item["more"] = new_list
        final.append(item)
    json.dump(final, f2, ensure_ascii=False, indent=1)
    # print(final)
    f1.close()
    f2.close()
Example #9
def read_webhose_dataset(dataset_file, num_docs=None):
    # webhose datasets: https://webhose.io/free-datasets/
    docs = []
    dataset_path = join('datasets', dataset_file)

    # read the zip file containing the webhose dataset
    with zipfile.ZipFile(dataset_path) as dataset_zipfile:
        # find the name of the JSON file inside the zip file
        json_filename = dataset_zipfile.namelist()[0]

        with dataset_zipfile.open(json_filename) as json_file:
            # each line in the file is a JSON document
            json_reader = jsonlines.Reader(json_file)

            # read only the initial num_docs documents
            for doc in itertools.islice(json_reader, num_docs):
                docs.append(doc['text'])

    return docs
Example #10
def parse_nodes(read_file):
    data = []
    with open(read_file, 'r', encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            if 'en' not in item['labels'] or len(item['labels']['en']) == 0:
                labels = ['None']
            else:
                labels = item['labels']['en']

            if 'en' not in item['descriptions'] or len(
                    item['descriptions']['en']) == 0:
                desc = 'None'
            else:
                desc = item['descriptions']['en'][0]

            qnode = item['qnode']
            props = item['wd_prop_vals']
            data.append([qnode, desc, labels, props])
    return data
Example #11
def save(c):
    """Save node ids for current docker context into environment context.json"""
    try:
        ec = EnvironmentContext(Path("."))
    except ValueError as exc:
        print(exc)
        sys.exit(1)

    cmd = "docker node ls --format '{{json .}}' "
    res = c.run(cmd, hide=True, warn=True)
    if res.exited != 0:
        print("WARNING: Current Docker Host is not in Swarm mode!!!")
        return
    nodes = list(jsonlines.Reader(res.stdout.splitlines()))
    nodes = {node["ID"]: node for node in nodes}
    context_json_fname = "context.json"
    with (ec.env_dir / context_json_fname).open("w", encoding="utf-8") as f:
        json.dump(nodes, f)
    print(f"{context_json_fname} updated.")
Example #12
    def read_sampling_data(self):
        """
        {"tp":"ar","cid":"diekwochen-1848-02-12-a-i0004","len":3899,"orig_lg":"lb","langdetect":[{"lang":"fr","prob":1}],"langid":[{"lang":"fr","prob":1}]}
        """
        json_reader = jsonlines.Reader(sys.stdin)
        for jdata in json_reader:
            m = re.search(
                r"^(?P<COLLECTION>.+)-(?P<YEAR>\d{4})-(?P<MONTH>\d{2})-(?P<DAY>\d{2})-(?P<EDITION>[a-z])-i(?P<CONTENTITEM>\d{4})$",
                jdata["id"],
            )
            if m:
                self.ids_per_coll_year[(m["COLLECTION"],
                                        m["YEAR"])].append(jdata["id"])
                self.id2data[jdata["id"]] = jdata
            else:
                log.error(f'NO MATCH FOR CONTENTITEM {jdata["id"]}')

        # sort each (collection, year) bucket once, after all records have been read
        for k in self.ids_per_coll_year:
            self.ids_per_coll_year[k].sort()
Example #13
    def request_export(self, method, url=None, path=None, params=None, json=None, **kwargs):
        if not self.__verified:
            self.__verified = self.check_access()

        if url and path:
            url = '{}/{}'.format(url, path)
        elif path and not url:
            url = 'https://data.mixpanel.com/api/2.0/{}'.format(path)

        if 'endpoint' in kwargs:
            endpoint = kwargs['endpoint']
            del kwargs['endpoint']
        else:
            endpoint = 'export'

        if 'headers' not in kwargs:
            kwargs['headers'] = {}

        kwargs['headers']['Accept'] = 'application/json'

        if self.__user_agent:
            kwargs['headers']['User-Agent'] = self.__user_agent

        if method == 'POST':
            kwargs['headers']['Content-Type'] = 'application/json'

        kwargs['headers']['Authorization'] = 'Basic {}'.format(
            str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8"))
        with metrics.http_request_timer(endpoint) as timer:
            response = self.perform_request(
                method=method,
                url=url,
                params=params,
                json=json,
                stream=True,
                **kwargs)
            timer.tags[metrics.Tag.http_status_code] = response.status_code

            # export endpoint returns jsonl results;
            #  other endpoints return json with array of results
            #  jsonlines reference: https://jsonlines.readthedocs.io/en/latest/
            reader = jsonlines.Reader(response.iter_lines())
            for record in reader.iter(allow_none=True, skip_empty=True):
                yield record
Example #14
def fetch_s2orc_keys_from_scirex_ids(scirex_doc_ids, data_download_commands):
    scirex_s2orc_mapping_file = os.path.join(caches_directory, "s2orc_hash_to_struct_mapping.pkl")
    if os.path.exists(scirex_s2orc_mapping_file):
        s2orc_hash_to_struct_mapping = pickle.load(open(scirex_s2orc_mapping_file, 'rb'))
    else:
        # If we don't have a cache file already, then manually match s2orc and scirex entries
        # (takes several hours and requires downloading and purging hundreds of GB of data)
        s2orc_hash_to_struct_mapping = {}
        start = time.perf_counter()
        for i, s2orc_shard_command in enumerate(data_download_commands):
            output_path = f"s2orc_downloads/{s2orc_shard_command[2]}"
            data_url = eval(s2orc_shard_command[3])
            shard_id = get_shard_id_from_path(output_path, data_type="full_text")
            #
            wget.download(data_url, out=output_path)
            #
            end = time.perf_counter()
            print(f"Took {end - start} seconds to download shard {i}\n")
            start = end
            #
            shard = gzip.open(output_path, 'rt')
            s2orc_full = jsonlines.Reader(shard)

            hits = 0
            for doc in s2orc_full:
                doc_hash = doc["_pdf_hash"]
                if doc['_pdf_hash'] in scirex_doc_ids:
                    doc_id = doc["paper_id"]
                    s2orc_hash_to_struct_mapping[doc_hash] = S2OrcEntry(shard_id, doc_id, doc_hash)
                    hits += 1
            #
            print(f"{hits} matching documents found!")

            end = time.perf_counter()
            print(f"Took {end - start} seconds to process pdf parses")
            start = end
            #
            if os.path.exists(output_path):
                os.remove(output_path)
            print(f"Deleted {output_path}")
            print("\n")
        # cache the mapping at the same path that is checked at the top of this function
        pickle.dump(s2orc_hash_to_struct_mapping, open(scirex_s2orc_mapping_file, 'wb'))
    return s2orc_hash_to_struct_mapping
Example #15
def init_data(request):

    with open(MOVIE_PATH) as f:
        for line in jsonlines.Reader(f):
            name = line['name']
            score = float(line['score']) if line['score'] else 0.0
            director = ','.join(line['director'])
            actor = ','.join(line['actor'])
            area = line['area']
            length = line['length']
            # print(length)
            length = int(length.split('分钟')[0]) if length else 0
            brief = line['brief']
            img = IMG_PATH + name + '.jpg'
            release_date = line['release_date'].split('(')[0]
            release_date = datetime.date(*map(int, release_date.split('-')))
            types = line['type']

            if not Play.objects.filter(name=name):

                play = Play(name=name,
                            play_time=release_date,
                            length=length,
                            desc=brief,
                            director=director,
                            actors=actor,
                            area=area,
                            img=img,
                            score=score)
                play.save()
                for type_name in types:
                    # print(type_name)
                    play_type = Type.objects.filter(name=type_name).first()
                    # print(play_type)
                    if play_type:
                        play.types.add(play_type.id)
                    else:
                        type_new = Type(name=type_name)
                        type_new.save()
                        play.types.add(type_new.id)
                play.save()
    return redirect(reverse('play_list'))
Example #16
def read_data(file_path, labels_path, data_num):
    # read the data
    all_data, labels_data = [], []
    with open(file_path, 'r+', encoding='utf-8') as f:
        n = 0
        for item in jsonlines.Reader(f):
            n += 1
            if n <= data_num:
                pre_text = item['ctx']
                end_text = item['ending_options']
                all_data.append([pre_text, end_text[0], end_text[1], end_text[2], end_text[3]])

    with open(labels_path) as f:
        answer = [i.strip() for i in f.readlines()]
        m = 0
        for j in answer:
            m += 1
            if m <= data_num:
                labels_data.append(int(j))
    return all_data, labels_data
Example #17
def open_dataset() -> dict:
    """
    Open a dataset file.
    :return: dict, the response content
    """
    if request.method == "GET":
        file_path = request.args.get("path", "")

        if not os.path.exists(file_path):
            return {"code": 0, "msg": "打开文件失败!", "data": {"msg": "当前路径不存在文件,请检查文件路径!"}}

        try:
            print(file_path)
            with open(file_path, "r", encoding="utf-8") as file:
                dataset = [line for line in jsonlines.Reader(file)]
                data_handler = SequenceHandler()
                data_handler.set_dataset(dataset)
                data_handler.file_path = file_path

            uid = str(uuid4())
            dataset_dict[uid] = data_handler

            feedback = {
                "code": 1,
                "msg": "打开文件成功!",
                "data": {
                    "length": len(dataset),
                    "uid": uid
                }
            }
        except Exception as error:
            feedback = {
                "code": 0,
                "msg": "打开文件失败!",
                "data": {
                    "msg": str(error)
                }
            }
    else:
        feedback = {"code": 0, "msg": "请求方法有误!", "data": {}}
    return feedback
Example #18
    def load_label(
        path: str, flavor: str, small_data: bool = False
    ) -> Tuple[dict, dict]:
        label_path = path.replace("docs", flavor)
        print(f"reading labels from {label_path}")
        labels = defaultdict(dict)
        evidences = defaultdict(list)
        label_toi = {"entailment": 0, "contradiction": 1, "neutral": 2}

        reader = jsonlines.Reader(open(label_path))
        for line in reader:
            label = line["classification"]
            idx = line["annotation_id"]
            labels[idx] = label_toi[label]
            evidences[label + "_hypothesis"] = []
            evidences[label + "_premise"] = []
            for evi in line["evidences"][0]:
                evidences[evi["docid"]].append((evi["start_token"], evi["end_token"]))
            if small_data and len(labels) > 1500:
                break
        return labels, evidences
Example #19
    def read_dev_data(self):
        with open('../hellaswag-train-dev/valid.jsonl', 'r+',
                  encoding='utf-8') as f:
            for item in jsonlines.Reader(f):
                pre_text = item['ctx']
                end_text = item['ending_options']
                self.test_text_left.append(pre_text)
                self.test_text_left.append(pre_text)
                self.test_text_left.append(pre_text)
                self.test_text_left.append(pre_text)
                self.test_text_right.append(end_text[0])
                self.test_text_right.append(end_text[1])
                self.test_text_right.append(end_text[2])
                self.test_text_right.append(end_text[3])

        with open('../hellaswag-train-dev/valid-labels.lst') as f:
            answer = [i.strip() for i in f.readlines()]
            for j in answer:
                one_line_label = [0, 0, 0, 0]
                one_line_label[int(j)] = 1
                self.test_label += one_line_label
Example #20
def main(args):
    count = 0
    image_idx = 1
    cropped_image_dir = "/home1/sxy/datasets/face_recognition/CASIA-WebFace/cropped_images"
    with open(args.image_with_landmark_list_file) as fin, jsonlines.open(args.cropped_image_list_file, 'w') as fout:
        for image_info in jsonlines.Reader(fin):
            frame = cv2.imread(image_info["file_path"])
            if image_info["landmark"] == 'none':
                count += 1
                continue
            cropped_frame = align(frame, np.array(image_info["landmark"], dtype=np.float32))

            cropped_file_path = os.path.join(cropped_image_dir, f"{image_idx}.jpg")
            cv2.imwrite(cropped_file_path, cropped_frame)
            cropped_image_info = {"cropped_file_path": cropped_file_path}
            cropped_image_info.update(image_info)
            fout.write(cropped_image_info)
            image_idx += 1
            if image_idx % 10000 == 0:
                print(f"already processing {image_idx} images")
    print(count)  # 2471
Example #21
File: utils.py Project: DiwenLu/NLP
def load_personachat(data_dir='data'):
    import os
    import subprocess
    filename = os.path.join(data_dir, 'personachat_all_sentences_train.jsonl')
    if not os.path.exists(filename):
        os.makedirs(data_dir, exist_ok=True)
        url = "https://nyu.box.com/shared/static/q4nvswb0szelivhgyx87vd1056ttqfyi.jsonl"
        args = ['wget', '-O', filename, url]
        subprocess.call(args)

        url = "https://nyu.box.com/shared/static/8krcizo8sms1m0ppy7uiwfcx4a3l5nsq.jsonl"
        args = [
            'wget', '-O',
            os.path.join(data_dir, 'personachat_all_sentences_valid.jsonl'),
            url
        ]
        subprocess.call(args)

    raw_datasets = {}
    for name in ['train', 'valid']:
        raw_datasets[name] = [
            x['tokens'] for x in jsonlines.Reader(
                open(
                    os.path.join(data_dir,
                                 'personachat_all_sentences_%s.jsonl' % name)))
        ]

    if os.path.exists(os.path.join(data_dir, 'vocab.pkl')):
        vocab = pickle.load(open(os.path.join(data_dir, 'vocab.pkl'), 'rb'))
    else:
        vocab = Dictionary(raw_datasets, include_valid=False)
        pickle.dump(vocab, open(os.path.join(data_dir, 'vocab.pkl'), 'wb'))

    tokenized_datasets = tokenize_dataset(raw_datasets, vocab)
    datasets = {
        name: SequenceDataset(ds)
        for name, ds in tokenized_datasets.items()
    }
    print("Vocab size: %d" % (len(vocab)))
    return raw_datasets, datasets, vocab
Example #22
def parseJson(lang, total=False):
    ####
    # later revision: read directly from the database instead
    ####
    dataset = []
    time_v = []
    with open(filename, 'r+', encoding='utf8') as f:
        for item in jsonlines.Reader(f):
            n = 0
            if total:
                n = float(item['repo_num'])
            else:
                for i in range(0, 10):
                    if item['n%dlang' % (i + 1)] == lang:
                        n = float(item['n%dnum' % (i + 1)])

            timestamp = datetime.datetime.strptime(item['timestamp'],
                                                   "%Y-%m-%dT%H:%M:%S")
            #
            # startday = datetime.datetime(2009,1,1,0,0)
            #
            # days=int((timestamp-startday).days)

            if n != 0:
                dataset.append([n])
                time_v.append([timestamp, n])
    # time-value pairs; later revision: load directly from the database
    time_v = np.array(time_v)
    time_v = pd.DataFrame(time_v, columns=['timestamp', 'repo_number'])
    time_v.to_csv(path + '/datas/%sdata.csv' % lang, encoding='gb18030')

    dataset = np.array(dataset)
    trainSeq = dataset[0:int(len(dataset) * 0.8)]
    testSeq = dataset[int(len(dataset) * 0.8):len(dataset)]
    del time_v
    del f
    return trainSeq, testSeq, dataset
Example #23
def read_hellaswag_dev_data(text_path, labels_path, file_name):
    all_pre_text, all_endings, all_label = [], [], []
    with open(text_path, 'r+', encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            pre_text = item['ctx']
            end_text = item['ending_options']
            all_pre_text.append(pre_text)
            all_endings.append(end_text)

    with open(labels_path) as f:
        answer = [i.strip() for i in f.readlines()]
        for j in answer:
            all_label.append(int(j))

    print(len(all_pre_text), len(all_endings), len(all_label))

    wb = Workbook()
    ws = wb.active
    for i in range(len(all_label)):
        ws.append([all_pre_text[i], all_endings[i][0], all_endings[i][1], all_endings[i][2], all_endings[i][3],
                   all_label[i]])
    wb.save('data/' + file_name)
Example #24
def virus_report_for(path_to_zipfile):
    '''
    Return an object representing the data report.
    path_to_zipfile: The relative path to the zipfile containing the virus data report
    
    old code below:
    with zipfile.ZipFile(path_to_zipfile, 'r') as zip:
        virus_report_as_dict = yaml.safe_load(zip.read('ncbi_dataset/data/data_report.yaml'))
    virus_report = virus_report_pb2.VirusReport()
    ParseDict(virus_report_as_dict, virus_report)
    return virus_report
    '''

    genomesArray = []
    with zipfile.ZipFile(path_to_zipfile, 'r') as zip:
        report_file_handle = zip.open('ncbi_dataset/data/data_report.jsonl')
        reader = jsonlines.Reader(report_file_handle)
        for json_dict in reader.iter(type=dict, skip_invalid=True):
            # json_dict is a single report - all fields should be there.
            genomesArray.append(json_dict)
    virus_report = {"genomes": genomesArray}
    return virus_report
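
A brief usage sketch, assuming an archive downloaded from NCBI Datasets (the file name below is illustrative):

report = virus_report_for("ncbi_dataset.zip")
print(len(report["genomes"]), "genome records loaded")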
Example #25
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    _flatten_dir = os.path.expanduser(flatten_dir)
    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(_flatten_dir, lang,
                                 '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = open(attr_file, 'w')

    with open(raw_file, 'r') as reader:
        for line in jsonlines.Reader(reader):
            for attr, info in line.items():
                if attr in attr_writers:
                    print(ujson.dumps(info, ensure_ascii=False),
                          file=attr_writers[attr])
    # close the per-attribute writers so buffered output is flushed to disk
    for attr_writer in attr_writers.values():
        attr_writer.close()
Example #26
    def process_file(self, input_file, top_k: int = 1000):
        data = []
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in jsonlines.Reader(f):
                data.append(line)

        for article in tqdm(data):
            question = article['question']
            passage = article['passage']

            sentences = sentence_tokenizer.tokenize(passage)

            self.get_sim(sentences, question, 1)

        self.sort(top_k)

        for article in tqdm(data):
            article['sentence_id'] = self.evidence[0][0]
            self.evidence = self.evidence[1:]

        assert 'sentence_id' in data[0]
        return data
Example #27
    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
        with open(file, 'rb') as fh:
            self.fh = fh
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob['text']

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, (ob['meta'] if 'meta' in ob else {})
                else:
                    yield text
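
A small sketch of how a compatible test file could be produced; the file name and records are made up, and a single zstd frame is written with the one-shot compressor:

import json

import zstandard

records = [
    {"text": "hello world", "meta": {"source": "example"}},
    {"text": ["first paragraph", "second paragraph"]},  # list form exercises autojoin_paragraphs
]
payload = "".join(json.dumps(r) + "\n" for r in records).encode("utf-8")
with open("sample.jsonl.zst", "wb") as fh:
    fh.write(zstandard.ZstdCompressor().compress(payload))

Reading "sample.jsonl.zst" with the method above would then yield "hello world" and, with autojoin_paragraphs enabled, the two paragraphs joined by a blank line.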
Example #28
def process_jsonlines_hotpotqa(filename):
    """
    This is process_jsonlines method for intro-only processed_wikipedia file.
    The item example:
    {"id": "45668011", "url": "https://en.wikipedia.org/wiki?curid=45668011", "title": "Flouch Roundabout",
     "text": ["Flouch Roundabout is a roundabout near Penistone, South Yorkshire, England, where the A628 meets the A616."],
     "charoffset": [[[0, 6],...]]
     "text_with_links" : ["Flouch Roundabout is a roundabout near <a href=\"Penistone\">Penistone</a>,
     <a href=\"South%20Yorkshire\">South Yorkshire</a>, England, where the <a href=\"A628%20road\">A628</a>
     meets the <a href=\"A616%20road\">A616</a>."],
        "charoffset_with_links": [[[0, 6], ... [213, 214]]]}
    """
    # item should be nested list
    extracted_items = []
    # with jsonlines.open(filename) as reader:
    with bzopen(filename, "r") as bzfin:
        for obj in jsonlines.Reader(bzfin):
            wiki_id = obj["id"]
            title = obj["title"]
            title_id = make_wiki_id(title, 0)
            plain_text = "\t".join(obj["text"])
            text_with_links = "\t".join(obj["text_with_links"])

            hyper_linked_titles = find_hyper_linked_titles(text_with_links)
            if len(hyper_linked_titles) > 0:
                hyper_linked_titles_text = "\t".join(hyper_linked_titles)
            else:
                hyper_linked_titles_text = ""
            extracted_items.append({
                "wiki_id": wiki_id,
                "title": title_id,
                "plain_text": plain_text,
                "hyper_linked_titles": hyper_linked_titles_text,
                "original_title": title
            })

    return extracted_items
Example #29
def filter_datasets(paths, out_path: str, require_fields=[]):
    logger.debug(f"Requiring fields {require_fields}")
    total_lines = 0
    examples = []
    for path in paths:
        full_path = pathlib.Path(path).resolve()
        f = gzip.open(full_path, "rb") if path.endswith(".jsonl.gz") else full_path.open("r")
        reader = jsonlines.Reader(f)

        logger.debug(f"Loading {full_path}")
        for json_dict in tqdm.tqdm(reader, desc=full_path.name):
            total_lines += 1
            # Check all required fields are present
            if any([field not in json_dict or not json_dict[field] for field in require_fields]):
                continue

            # We need the identifier (method name) as a label. Filter invalid identifiers
            if "identifier" in require_fields and _valid_identifier_regex.match(json_dict["identifier"]) == None:
                print(f"WARN: Invalid identifier {require_fields['identifier']}, skipping record")
                continue

            examples.append(json_dict)
            if total_lines % 100000 == 0:
                logger.debug(f"Filtered jsonl to {len(examples)}/{total_lines}")
        f.close()

        logger.debug(f"DONE: Filtered jsonl to {len(examples)}/{total_lines}")

    # TODO: Subsample

    # Write output
    full_out_path = pathlib.Path(out_path).resolve()
    f = gzip.open(full_out_path, "wb") if out_path.endswith(".jsonl.gz") else full_out_path.open("w")
    writer = jsonlines.Writer(f)
    logger.debug(f"Writing output to {full_out_path}...")
    writer.write_all(examples)
    logger.debug(f"DONE writing")
    f.close()
Example #30
def read_data(f_in, f_out):
    out_file = open(os.path.join(root, f_out), 'w+', encoding='utf-8')
    label2id = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
    writer = csv.writer(out_file)
    writer.writerow(('label', 'sentence1', 'sentence2'))
    with open(os.path.join(root, f_in), 'r+', encoding='utf-8') as f:
        count = 0
        for item in jsonlines.Reader(f):
            label = item['gold_label']
            if label in label2id.keys():
                label = label2id[label]
            else:
                continue
            sen1 = item['sentence1']
            sen1 = process(sen1.lower())
            sen2 = item['sentence2']
            sen2 = process(sen2.lower())
            writer.writerow((label, sen1, sen2))
            count += 1
            print(count)
            if count == 150000:
                break
    out_file.close()