예제 #1
0
def read_conll03_en(path: str):
    """Load the train/dev/test splits stored under ``<path>/conll03-en``.

    Each of ``train.txt``, ``dev.txt`` and ``test.txt`` is parsed with
    ``read_ner_file`` using a single space as the separator. For every
    returned record, the space-split tokens of ``record["text"]`` are paired
    with the entries of ``record["ner_label"]`` (``zip`` stops at the shorter
    of the two), and each resulting sequence is passed through
    ``preprocess_sequence``.

    Args:
        path: Directory that contains the ``conll03-en`` folder.

    Returns:
        A ``TaggedSeqsDataSet`` built with the keyword arguments ``train``,
        ``dev`` and ``test``, each holding that split's preprocessed sequences.
    """
    corpus_dir = os.path.join(path, "conll03-en")
    splits = {}
    for split in ("train", "dev", "test"):
        records = read_ner_file(os.path.join(corpus_dir, split + ".txt"), sep=" ")
        # Pair every record first, then preprocess, so the helpers are
        # invoked in the same order as a two-pass build.
        paired = [
            list(zip(record["text"].split(" "), record["ner_label"]))
            for record in records
        ]
        splits[split] = [preprocess_sequence(seq) for seq in paired]

    return TaggedSeqsDataSet(**splits)
예제 #2
0
 def file_to_dicts(self, file: str) -> [dict]:
     """Parse ``file`` with ``read_ner_file`` and return its result.

     The call forwards ``file`` as ``filename`` and this instance's
     ``delimiter`` attribute as ``sep``.

     Args:
         file: Path of the file to read.

     Returns:
         Whatever ``read_ner_file`` returns for that file (annotated as a
         list of dicts).
     """
     return read_ner_file(filename=file, sep=self.delimiter)
예제 #3
0
파일: processor.py 프로젝트: theintz/FARM
 def _file_to_dicts(self, file: str) -> [dict]:
     """Parse ``file`` with ``read_ner_file`` and return its result.

     Only ``filename`` is passed, so ``read_ner_file`` uses its own defaults
     for every other argument.

     Args:
         file: Path of the file to read.

     Returns:
         Whatever ``read_ner_file`` returns for that file (annotated as a
         list of dicts).
     """
     return read_ner_file(filename=file)