def read_conll03_en(path: str):
    """Load the ``conll03-en`` train/dev/test splits found under *path*.

    For each of ``train.txt``, ``dev.txt`` and ``test.txt`` inside
    ``<path>/conll03-en``, the file is parsed with ``read_ner_file`` using a
    single space as separator. Every parsed record contributes one sequence of
    ``(token, tag)`` pairs, which is then passed through
    ``preprocess_sequence``.

    Args:
        path: Directory that contains the ``conll03-en`` folder.

    Returns:
        A ``TaggedSeqsDataSet`` built with one keyword argument per split
        (``train``, ``dev``, ``test``), each holding the preprocessed
        sequences of that split.
    """
    corpus_dir = os.path.join(path, "conll03-en")
    splits = {}
    for filename in ("train.txt", "dev.txt", "test.txt"):
        records = read_ner_file(os.path.join(corpus_dir, filename), sep=" ")
        # Tokens come from splitting the record's "text" on single spaces;
        # zip pairs them with "ner_label" and stops at the shorter of the two.
        paired = [
            list(zip(record["text"].split(" "), record["ner_label"]))
            for record in records
        ]
        # The split name is the file name up to its first dot.
        split = filename.split(".")[0]
        splits[split] = [preprocess_sequence(seq) for seq in paired]
    return TaggedSeqsDataSet(**splits)
def file_to_dicts(self, file: str) -> [dict]:
    """Parse *file* with ``read_ner_file`` and return its result.

    The separator handed to ``read_ner_file`` is this object's ``delimiter``
    attribute.

    Args:
        file: Path of the file to read.

    Returns:
        Whatever ``read_ner_file`` produces for the file (annotated as a list
        of dicts).
    """
    return read_ner_file(filename=file, sep=self.delimiter)
def _file_to_dicts(self, file: str) -> [dict]:
    """Parse *file* with ``read_ner_file`` and return its result.

    No separator is passed, so ``read_ner_file`` uses its own default.

    Args:
        file: Path of the file to read.

    Returns:
        Whatever ``read_ner_file`` produces for the file (annotated as a list
        of dicts).
    """
    return read_ner_file(filename=file)