def __init__(self, encoding_type: str = 'bioes'):
    """
    :param str encoding_type: supports the 'bio' and 'bioes' tagging schemes
    """
    super().__init__()
    self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1])
    assert encoding_type in ('bio', 'bioes')
    self._tag_converters = [iob2]
    if encoding_type == 'bioes':
        self._tag_converters.append(iob2bioes)
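# Illustrative sketch (not part of the loader above): assuming the iob2 /
# iob2bioes helpers used in __init__ perform the standard IOB->BIO and
# BIO->BIOES conversions, the converter chain works roughly like this:
example_tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
for _converter in [iob2, iob2bioes]:
    example_tags = _converter(example_tags)
print(example_tags)  # expected: ['B-PER', 'E-PER', 'O', 'S-LOC']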
def __init__(self, task: str = 'ner', encoding_type: str = 'bioes'):
    """
    Loads English corpora in the CoNLL-2003 format; information about the
    dataset can be found at https://www.clips.uantwerpen.be/conll2003/ner/.
    When task is 'pos', the target of the returned DataSet is taken from the
    2nd column; when task is 'chunk', from the 3rd column; when task is 'ner',
    from the 4th column. All "-DOCSTART- -X- O O" lines are skipped, so the
    number of samples will be lower than reported in many papers; since
    "-DOCSTART- -X- O O" only marks document boundaries and should not be
    predicted, lines starting with -DOCSTART- are ignored.
    For the ner and chunk tasks the loaded target follows encoding_type; for
    the pos task the pos column is used as-is.

    :param task: the labelling task to load. One of ner, pos, chunk
    """
    assert task in ('ner', 'pos', 'chunk')
    index = {'ner': 3, 'pos': 1, 'chunk': 2}[task]
    self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
    self._tag_converters = []
    if task in ('ner', 'chunk'):
        self._tag_converters = [iob2]
        if encoding_type == 'bioes':
            self._tag_converters.append(iob2bioes)
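# Illustrative sketch (not part of the loader above): a CoNLL-2003 data line
# has the form "word POS chunk NER", so the per-task `index` chosen in
# __init__ picks out the matching column.
sample_columns = "U.N. NNP I-NP I-ORG".split()
for task_name, column in {'pos': 1, 'chunk': 2, 'ner': 3}.items():
    print(task_name, '->', sample_columns[column])
# pos -> NNP, chunk -> I-NP, ner -> I-ORG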
def prepare_ptb(args):
    datas = {}
    datas["pos"] = (ConllLoader(headers=["words", "pos"],
                                indexes=[0, 1]).load(args.pos).datasets)
    chunk_data = (ConllLoader(headers=["words", "chunk"],
                              indexes=[0, 2]).load(args.chunk).datasets)
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = (ConllLoader(headers=["words", "ner"],
                                indexes=[0, 3]).load(args.ner).datasets)

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()),
                                field_name="words",
                                new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()),
                                        field_name=task_name,
                                        new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(
            Task(idx, task_name, data["train"], data["dev"], data["test"]))
    vocabs["words"] = src_vocab
    return task_lst, vocabs
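# Hypothetical usage sketch for prepare_ptb (the attribute names follow the
# function body above; the file paths are placeholders, not real data):
from argparse import Namespace

example_args = Namespace(pos='data/ptb_pos.conll',
                         chunk='data/conll2000_chunk.conll',
                         ner='data/conll2003_ner.conll')
task_lst, vocabs = prepare_ptb(example_args)
print(len(task_lst), sorted(vocabs.keys()))  # 3 tasks; vocabs for pos/chunk/ner/words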
def load(self, path: str) -> DataSet:
    """
    Reads the data at the given file path. The returned DataSet contains the
    following fields:
        raw_words: List[str]
        target: List[str]

    :param path:
    :return:
    """
    dataset = ConllLoader(headers=['raw_words', 'target'],
                          indexes=[3, 10]).load(path)

    def convert_to_bio(tags):
        bio_tags = []
        flag = None
        for tag in tags:
            label = tag.strip("()*")
            if '(' in tag:
                bio_label = 'B-' + label
                flag = label
            elif flag:
                bio_label = 'I-' + flag
            else:
                bio_label = 'O'
            if ')' in tag:
                flag = None
            bio_tags.append(bio_label)
        return self.encoding_method(bio_tags)

    def convert_word(words):
        converted_words = []
        for word in words:
            word = word.replace('/.', '.')  # some trailing periods appear as /.
            if not word.startswith('-'):
                converted_words.append(word)
                continue
            # these bracket symbols were escaped in the corpus; convert them back
            tfrs = {
                '-LRB-': '(',
                '-RRB-': ')',
                '-LSB-': '[',
                '-RSB-': ']',
                '-LCB-': '{',
                '-RCB-': '}'
            }
            if word in tfrs:
                converted_words.append(tfrs[word])
            else:
                converted_words.append(word)
        return converted_words

    dataset.apply_field(convert_word,
                        field_name='raw_words',
                        new_field_name='raw_words')
    dataset.apply_field(convert_to_bio,
                        field_name='target',
                        new_field_name='target')
    return dataset
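# Illustrative trace of the nested convert_to_bio helper above (comments only,
# since it is not reachable from module level): OntoNotes-style NE brackets in
# column 10 are expanded into BIO tags before self.encoding_method is applied,
# e.g. ['(PERSON*', '*)', '*', '(GPE)'] -> ['B-PERSON', 'I-PERSON', 'O', 'B-GPE'].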
class Conll2003DataLoader(DataSetLoader):
    def __init__(self, task: str = 'ner', encoding_type: str = 'bioes'):
        """
        Loads English corpora in the CoNLL-2003 format; information about the
        dataset can be found at https://www.clips.uantwerpen.be/conll2003/ner/.
        When task is 'pos', the target of the returned DataSet is taken from
        the 2nd column; when task is 'chunk', from the 3rd column; when task is
        'ner', from the 4th column. All "-DOCSTART- -X- O O" lines are skipped,
        so the number of samples will be lower than reported in many papers;
        since "-DOCSTART- -X- O O" only marks document boundaries and should
        not be predicted, lines starting with -DOCSTART- are ignored.
        For the ner and chunk tasks the loaded target follows encoding_type;
        for the pos task the pos column is used as-is.

        :param task: the labelling task to load. One of ner, pos, chunk
        """
        assert task in ('ner', 'pos', 'chunk')
        index = {'ner': 3, 'pos': 1, 'chunk': 2}[task]
        self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
        self._tag_converters = []
        if task in ('ner', 'chunk'):
            self._tag_converters = [iob2]
            if encoding_type == 'bioes':
                self._tag_converters.append(iob2bioes)

    def load(self, path: str):
        dataset = self._loader.load(path)

        def convert_tag_schema(tags):
            for converter in self._tag_converters:
                tags = converter(tags)
            return tags

        if self._tag_converters:
            dataset.apply_field(convert_tag_schema,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)
        return dataset

    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        Reads and processes the data. Lines starting with '-DOCSTART-' are ignored.

        :param paths:
        :param word_vocab_opt: initialization options for the vocabulary
        :param lower: whether to lowercase all letters.
        :return:
        """
        # read the data
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocabulary
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocabulary
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
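# Hypothetical usage sketch (file paths are placeholders): load CoNLL-2003 NER
# data with the loader defined above and inspect the resulting vocabularies.
loader = Conll2003DataLoader(task='ner', encoding_type='bioes')
conll_data = loader.process({'train': 'conll2003/train.txt',
                             'dev': 'conll2003/dev.txt',
                             'test': 'conll2003/test.txt'},
                            lower=True)
print(len(conll_data.vocabs[Const.INPUT]), len(conll_data.vocabs[Const.TARGET]))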
def process_from_file(self, paths):
    paths = check_loader_paths(paths)
    loader = ConllLoader(headers=['raw_chars', 'target'])
    data_bundle = loader.load(paths)
    return self.process(data_bundle)
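# Hypothetical call sketch (the path is a placeholder): check_loader_paths
# accepts either a directory containing train/dev/test files or an explicit
# {'train': ..., 'dev': ..., 'test': ...} dict, so the method above can be
# called with either form, e.g. pipe.process_from_file('data/cn_ner/').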
def __init__(self):
    self._loader = ConllLoader(
        headers=['words', 'pos_tags', 'heads', 'labels'],
        indexes=[1, 3, 6, 7])
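# Column-mapping sketch (comments only): in a CoNLL-X line such as
# "3  授予  _  VV  VV  _  0  root  _  _", indexes [1, 3, 6, 7] select
# words='授予', pos_tags='VV', heads='0' and labels='root'.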
class CTBxJointPipe(Pipe):
    """
    The data folder should have the following structure:
        -train.conllx
        -dev.conllx
        -test.conllx
    Each file has content like the following (sentences are separated by blank lines):

        1  费孝通  _  NR  NR  _  3  nsubjpass  _  _
        2  被      _  SB  SB  _  3  pass       _  _
        3  授予    _  VV  VV  _  0  root       _  _
        4  麦格赛赛 _  NR  NR  _  5  nn         _  _
        5  奖      _  NN  NN  _  3  dobj       _  _

        1  新华社  _  NR  NR  _  7  dep  _  _
        2  马尼拉  _  NR  NR  _  7  dep  _  _
        3  8月     _  NT  NT  _  7  dep  _  _
        4  31日    _  NT  NT  _  7  dep  _  _
        ...
    """

    def __init__(self):
        self._loader = ConllLoader(
            headers=['words', 'pos_tags', 'heads', 'labels'],
            indexes=[1, 3, 6, 7])

    def load(self, path: str):
        """
        Reads the file at the given path into a DataSet with the following fields:
            words: list[str]
            pos_tags: list[str]
            heads: list[int]
            labels: list[str]

        :param path:
        :return:
        """
        dataset = self._loader._load(path)
        dataset.heads.int()
        return dataset

    def process_from_file(self, paths):
        """
        :param paths:
        :return: DataBundle whose DataSets contain the following fields:
            chars:
            bigrams:
            trigrams:
            pre_chars:
            pre_bigrams:
            pre_trigrams:
            seg_targets:
            seg_masks:
            seq_lens:
            char_labels:
            char_heads:
            gold_word_pairs:
            pun_masks:
            gold_label_word_pairs:
        """
        paths = check_loader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])),
                          new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars',
                          new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars',
                          new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count) == 0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs,
                              new_field_name='gold_word_pairs',
                              ignore_type=True)
            global add_label_word_pairs
            add_label_word_pairs = partial(add_label_word_pairs,
                                           label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs,
                              new_field_name='gold_label_word_pairs',
                              ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(
            data.datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[
                data.get_dataset('dev'), data.get_dataset('test')
            ])
        bigram_vocab = Vocabulary(min_freq=3).from_dataset(
            data.datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'), data.get_dataset('test')
            ])
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(
            data.datasets['train'],
            field_name='trigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'), data.get_dataset('test')
            ])

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name,
                                              no_create_entry_dataset=list(
                                                  data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                               [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens',
                              'char_labels', 'char_heads', 'pre_chars',
                              'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets',
                               'seg_masks', 'char_labels', 'char_heads',
                               'pun_masks', 'gold_label_word_pairs')

        return data
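# Hypothetical usage sketch (the directory is a placeholder): it is expected
# to contain train.conllx / dev.conllx / test.conllx as described in the
# class docstring above.
ctb_pipe = CTBxJointPipe()
ctb_data = ctb_pipe.process_from_file('data/ctb_joint/')
print({name: len(ds) for name, ds in ctb_data.datasets.items()})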
class ChineseNERLoader(DataSetLoader):
    """
    Reads Chinese named-entity recognition datasets, including PeopleDaily,
    MSRA-NER and Weibo. The data can be found at
    https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER
    Make sure the input data has the following format: two columns, the first
    being the character and the second its label, with sentences separated by
    blank lines

        我 O
        们 O
        变 O
        而 O
        以 O
        书 O
        会 O
        ...
    """

    def __init__(self, encoding_type: str = 'bioes'):
        """
        :param str encoding_type: supports the 'bio' and 'bioes' tagging schemes
        """
        super().__init__()
        self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1])
        assert encoding_type in ('bio', 'bioes')
        self._tag_converters = [iob2]
        if encoding_type == 'bioes':
            self._tag_converters.append(iob2bioes)

    def load(self, path: str):
        dataset = self._loader.load(path)

        def convert_tag_schema(tags):
            for converter in self._tag_converters:
                tags = converter(tags)
            return tags

        if self._tag_converters:
            dataset.apply_field(convert_tag_schema,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)
        return dataset

    def process(self, paths, bigrams=False, trigrams=False):
        """
        :param paths:
        :param bool bigrams: whether to generate a bigram feature,
            [a, b, c, d] -> [ab, bc, cd, d<eos>]
        :param bool trigrams: whether to generate a trigram feature,
            [a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
        :return: DataBundle containing the following fields
            raw_chars: List[str]
            chars: List[int]
            seq_len: int, number of characters
            bigrams: List[int], optional
            trigrams: List[int], optional
            target: List[int]
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            if bigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2
                    for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])
                ],
                                    field_name='raw_chars',
                                    new_field_name='bigrams')
            if trigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2 + c3
                    for c1, c2, c3 in zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                          raw_chars[2:] + ['<eos>'] * 2)
                ],
                                    field_name='raw_chars',
                                    new_field_name='trigrams')
            data.datasets[name] = dataset

        char_vocab = Vocabulary().from_dataset(
            data.datasets['train'],
            field_name='raw_chars',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        char_vocab.index_dataset(*data.datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name=Const.CHAR_INPUT)
        data.vocabs[Const.CHAR_INPUT] = char_vocab

        target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(
            data.datasets['train'], field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        if bigrams:
            bigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='bigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data.datasets.values(),
                                       field_name='bigrams',
                                       new_field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            input_fields.append('bigrams')

        if trigrams:
            trigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='trigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            trigram_vocab.index_dataset(*data.datasets.values(),
                                        field_name='trigrams',
                                        new_field_name='trigrams')
            data.vocabs['trigrams'] = trigram_vocab
            input_fields.append('trigrams')

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
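# Hypothetical usage sketch (paths are placeholders): load an MSRA-NER style
# corpus with bigram features enabled and check the vocabulary sizes.
ner_loader = ChineseNERLoader(encoding_type='bioes')
ner_data = ner_loader.process({'train': 'msra/train.txt',
                               'dev': 'msra/dev.txt',
                               'test': 'msra/test.txt'},
                              bigrams=True)
print(len(ner_data.vocabs[Const.CHAR_INPUT]), len(ner_data.vocabs['bigrams']))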