def process(self, paths, **kwargs):
    data_info = DataBundle()
    for name in ['train', 'test', 'dev']:
        data_info.datasets[name] = self.load(paths[name])

    config = Config()

    vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
    vocab.build_vocab()
    word2id = vocab.word2idx
    char_dict = preprocess.get_char_dict(config.char_path)
    data_info.vocabs = vocab

    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    for name, ds in data_info.datasets.items():
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[0],
                 new_field_name='doc_np')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[1],
                 new_field_name='char_index')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[2],
                 new_field_name='seq_len')
        ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences,
                                                    is_train=name == 'train'),
                 new_field_name='speaker_ids_np')
        ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')

        ds.set_ignore_type('clusters')
        ds.set_padder('clusters', None)
        ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
        ds.set_target("clusters")

    # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
    # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)
    return data_info
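# Illustrative sketch (not part of the original pipeline): the genre id assigned above is simply the
# index of the two-letter prefix of `doc_key` in the fixed OntoNotes genre list. A tiny self-contained
# check of that mapping, assuming doc_key strings shaped like "bc/cctv/00/cctv_0000_0":
def _demo_genre_id(doc_key):
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
    return genres[doc_key[:2]]

assert _demo_genre_id("bc/cctv/00/cctv_0000_0") == 0
assert _demo_genre_id("wb/eng/00/eng_0000_0") == 6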
class VocabProcessor(Processor):
    def __init__(self, field_name):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary()

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            for ins in dataset:
                tokens = ins[self.field_name]
                self.vocab.update(tokens)

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab
class VocabProcessor(Processor):
    def __init__(self, field_name, min_freq=1, max_size=None):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab

    def get_vocab_size(self):
        return len(self.vocab)
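# Illustrative usage sketch for the VocabProcessor above, assuming fastNLP's DataSet API
# (the "words" field name is a placeholder chosen for this example):
from fastNLP import DataSet

_demo_ds = DataSet({"words": [["I", "am", "happy"], ["He", "is", "happy"]]})
_vocab_proc = VocabProcessor(field_name="words")
_vocab_proc.process(_demo_ds)          # update the counter from every instance
_demo_vocab = _vocab_proc.get_vocab()  # build_vocab() is called here
print(_vocab_proc.get_vocab_size())    # vocabulary size, including the pad/unk entries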
def process(self, data_bundle: DataBundle):
    r"""
    Further process the loaded data. The raw data contains the fields raw_key, raw_speaker,
    raw_words and raw_clusters.

    .. csv-table::
        :header: "raw_key", "raw_speaker", "raw_words", "raw_clusters"

        "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1'], []]", "[['I', 'am'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'], []]", "[['He', 'is'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "[...]", "[...]", "[...]", "[...]"

    :param data_bundle:
    :return:
    """
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.RAW_WORDS(3))
    vocab.build_vocab()
    word2id = vocab.word2idx
    data_bundle.set_vocab(vocab, Const.INPUTS(0))

    if self.config.char_path:
        char_dict = get_char_dict(self.config.char_path)
    else:
        char_set = set()
        for i, w in enumerate(word2id):
            if i < 2:
                continue
            for c in w:
                char_set.add(c)
        char_dict = collections.defaultdict(int)
        char_dict.update({c: i for i, c in enumerate(char_set)})

    for name, ds in data_bundle.datasets.items():
        # genre
        ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0))

        # speaker_ids_np
        ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences,
                                         is_train=name == 'train'),
                 new_field_name=Const.INPUTS(1))

        # sentences
        ds.rename_field(Const.RAW_WORDS(3), Const.INPUTS(2))

        # doc_np
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[0],
                 new_field_name=Const.INPUTS(3))
        # char_index
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[1],
                 new_field_name=Const.CHAR_INPUT)
        # seq_len
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[2],
                 new_field_name=Const.INPUT_LEN)

        # clusters
        ds.rename_field(Const.RAW_WORDS(2), Const.TARGET)

        ds.set_ignore_type(Const.TARGET)
        ds.set_padder(Const.TARGET, None)
        ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3),
                     Const.CHAR_INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET)

    return data_bundle
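# Illustrative standalone restatement of the char_dict fallback above: when no char_path is
# configured, a character index is built from every character of every word in word2id, skipping
# the first two entries (presumably the pad/unk tokens); unseen characters map to index 0 because
# a defaultdict(int) is used. The helper and sample words below are made up for the example.
import collections

def _demo_build_char_dict(word2id):
    char_set = set()
    for i, w in enumerate(word2id):
        if i < 2:  # skip the two special tokens
            continue
        char_set.update(w)
    char_dict = collections.defaultdict(int)
    char_dict.update({c: i for i, c in enumerate(char_set)})
    return char_dict

_demo_chars = _demo_build_char_dict(["<pad>", "<unk>", "he", "is"])
assert set(_demo_chars) == {"h", "e", "i", "s"}
assert _demo_chars["z"] == 0  # unknown characters fall back to index 0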
def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps,
            domain=False, tag=False, load_vocab=True):
    """
    :param paths: dict  path for each dataset
    :param vocab_size: int  max_size for vocab
    :param vocab_path: str  vocab path
    :param sent_max_len: int  max token number of the sentence
    :param doc_max_timesteps: int  max sentence number of the document
    :param domain: bool  build vocab for publication, use 'X' for unknown
    :param tag: bool  build vocab for tag, use 'X' for unknown
    :param load_vocab: bool  build vocab (False) or load vocab (True)
    :return: DataInfo
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab (if "train" in paths), domain (if domain=True), tag (if tag=True)
        embeddings: optional
    """

    def _pad_sent(text_wd):
        pad_text_wd = []
        for sent_wd in text_wd:
            if len(sent_wd) < sent_max_len:
                pad_num = sent_max_len - len(sent_wd)
                sent_wd.extend([WORD_PAD] * pad_num)
            else:
                sent_wd = sent_wd[:sent_max_len]
            pad_text_wd.append(sent_wd)
        return pad_text_wd

    def _token_mask(text_wd):
        token_mask_list = []
        for sent_wd in text_wd:
            token_num = len(sent_wd)
            if token_num < sent_max_len:
                mask = [1] * token_num + [0] * (sent_max_len - token_num)
            else:
                mask = [1] * sent_max_len
            token_mask_list.append(mask)
        return token_mask_list

    def _pad_label(label):
        text_len = len(label)
        if text_len < doc_max_timesteps:
            pad_label = label + [0] * (doc_max_timesteps - text_len)
        else:
            pad_label = label[:doc_max_timesteps]
        return pad_label

    def _pad_doc(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            padding = [WORD_PAD] * sent_max_len
            pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
        else:
            pad_text = text_wd[:doc_max_timesteps]
        return pad_text

    def _sent_mask(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len)
        else:
            sent_mask = [1] * doc_max_timesteps
        return sent_mask

    datasets = {}
    train_ds = None
    for key, value in paths.items():
        ds = self.load(value)
        # pad sent
        ds.apply(lambda x: _pad_sent(x["text_wd"]), new_field_name="pad_text_wd")
        ds.apply(lambda x: _token_mask(x["text_wd"]), new_field_name="pad_token_mask")
        # pad document
        ds.apply(lambda x: _pad_doc(x["pad_text_wd"]), new_field_name="pad_text")
        ds.apply(lambda x: _sent_mask(x["pad_text_wd"]), new_field_name="seq_len")
        ds.apply(lambda x: _pad_label(x["flatten_label"]), new_field_name="pad_label")
        # rename field
        ds.rename_field("pad_text", Const.INPUT)
        ds.rename_field("seq_len", Const.INPUT_LEN)
        ds.rename_field("pad_label", Const.TARGET)
        # set input and target
        ds.set_input(Const.INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET, Const.INPUT_LEN)
        datasets[key] = ds
        if "train" in key:
            train_ds = datasets[key]

    vocab_dict = {}
    if not load_vocab:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
        vocab_dict["vocab"] = vocabs
    else:
        logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
        word_list = []
        with open(vocab_path, 'r', encoding='utf8') as vocab_f:
            cnt = 2  # pad and unk
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.add_word_lst(word_list)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    if domain:
        domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
        domaindict.from_dataset(train_ds, field_name="publication")
        vocab_dict["domain"] = domaindict
    if tag:
        tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
        tagdict.from_dataset(train_ds, field_name="tag")
        vocab_dict["tag"] = tagdict

    for ds in datasets.values():
        vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT)

    return DataInfo(vocabs=vocab_dict, datasets=datasets)
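# Illustrative sketch of the padding/masking convention used by the inner helpers above
# (standalone re-implementation for clarity; "[PAD]" stands in for the WORD_PAD constant):
def _demo_pad_sent(sent_wd, sent_max_len, pad_token="[PAD]"):
    # pad a single tokenised sentence to sent_max_len, or truncate it
    if len(sent_wd) < sent_max_len:
        return sent_wd + [pad_token] * (sent_max_len - len(sent_wd))
    return sent_wd[:sent_max_len]

def _demo_token_mask(sent_wd, sent_max_len):
    # 1 for real tokens, 0 for padding positions
    token_num = min(len(sent_wd), sent_max_len)
    return [1] * token_num + [0] * (sent_max_len - token_num)

assert _demo_pad_sent(["a", "b"], 4) == ["a", "b", "[PAD]", "[PAD]"]
assert _demo_token_mask(["a", "b"], 4) == [1, 1, 0, 0]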
class VocabIndexerProcessor(Processor):
    """
    Build a Vocabulary from a DataSet and use it to index the data. The newly generated index field is
    stored under new_added_filed_name; if new_added_field_name is not provided, the original field_name
    is overwritten.
    """

    def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
                 verbose=0, is_input=True):
        """
        :param field_name: the field from which the vocabulary is built and which will be indexed
        :param new_added_filed_name: name of the generated index field; if not given, field_name is overwritten
        :param min_freq: minimum word frequency allowed in the Vocabulary
        :param max_size: maximum number of words allowed in the Vocabulary
        :param verbose: 0 to print nothing; 1 to print information
        :param bool is_input:
        """
        super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
        self.min_freq = min_freq
        self.max_size = max_size
        self.verbose = verbose
        self.is_input = is_input

    def construct_vocab(self, *datasets):
        """
        Build the vocabulary from the given DataSets.

        :param datasets: DataSet objects used to build the vocabulary
        :return:
        """
        self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
        self.vocab.build_vocab()
        if self.verbose:
            print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

    def process(self, *datasets, only_index_dataset=None):
        """
        If no Vocabulary has been built yet, build one from the DataSets in ``datasets``; otherwise reuse
        the existing Vocabulary. The resulting vocabulary is then used to index both ``datasets`` and
        ``only_index_dataset``.

        :param datasets: DataSet objects
        :param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are not used
            to build the vocabulary.
        :return:
        """
        if len(datasets) == 0 and not hasattr(self, 'vocab'):
            raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
        if not hasattr(self, 'vocab'):
            self.construct_vocab(*datasets)
        else:
            if self.verbose:
                print("Using constructed vocabulary with {} items.".format(len(self.vocab)))

        to_index_datasets = []
        if len(datasets) != 0:
            for dataset in datasets:
                assert isinstance(dataset, DataSet), \
                    "Only DataSet class is allowed, not {}.".format(type(dataset))
                to_index_datasets.append(dataset)

        if only_index_dataset is not None:
            if isinstance(only_index_dataset, list):
                for dataset in only_index_dataset:
                    assert isinstance(dataset, DataSet), \
                        "Only DataSet class is allowed, not {}.".format(type(dataset))
                    to_index_datasets.append(dataset)
            elif isinstance(only_index_dataset, DataSet):
                to_index_datasets.append(only_index_dataset)
            else:
                raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

        for dataset in to_index_datasets:
            assert isinstance(dataset, DataSet), \
                "Only DataSet class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
                          new_field_name=self.new_added_field_name, is_input=self.is_input)
        # return a single DataSet so that infer behaves consistently with the other processors
        if len(to_index_datasets) == 1:
            return to_index_datasets[0]

    def set_vocab(self, vocab):
        assert isinstance(vocab, Vocabulary), \
            "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
        self.vocab = vocab

    def delete_vocab(self):
        del self.vocab

    def get_vocab_size(self):
        return len(self.vocab)

    def set_verbose(self, verbose):
        """
        Set the verbosity of the processor.

        :param verbose: int, 0 to print nothing; 1 to print vocab information.
        :return:
        """
        self.verbose = verbose
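# Illustrative usage sketch for VocabIndexerProcessor, assuming fastNLP's DataSet API
# (the "words" field name and the toy sentences are placeholders chosen for this example):
from fastNLP import DataSet

_train = DataSet({"words": [["I", "am", "happy"], ["He", "is", "sad"]]})
_test = DataSet({"words": [["She", "is", "happy"]]})

_indexer = VocabIndexerProcessor(field_name="words", verbose=1)
# builds the vocabulary from _train and indexes it; _test is only indexed, not counted
_indexer.process(_train, only_index_dataset=_test)
# with no new_added_filed_name given, the "words" field is overwritten with index lists
print(_indexer.get_vocab_size())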
def process(self, paths: Union[str, Dict[str, str]], dataset_name: str = None,
            to_lower=False, seq_len_type: str = None, bert_tokenizer: str = None,
            cut_text: int = None, get_index=True, auto_pad_length: int = None,
            auto_pad_token: str = '<pad>', set_input: Union[list, str, bool] = True,
            set_target: Union[list, str, bool] = True,
            concat: Union[str, list, bool] = None) -> DataBundle:
    """
    :param paths: str or Dict[str, str]. If str, it is either the folder containing the datasets or the
        full path of a single file: for a folder, the dataset names and file names are looked up in
        self.paths; for a Dict, the keys are dataset names (e.g. train, dev, test) and the values are
        full file paths.
    :param str dataset_name: if paths is the full path of a single dataset file, dataset_name names that
        dataset; if not given, it defaults to train.
    :param bool to_lower: whether to lowercase the text. Defaults to False.
    :param str seq_len_type: the kind of seq_len to provide. ``seq_len``: a single number as the sentence
        length; ``mask``: a 0/1 mask matrix as the sentence length; ``bert``: segment_type_id (0 for the
        first sentence, 1 for the second) plus an attention mask (a 0/1 mask matrix). Defaults to None,
        i.e. no seq_len is provided.
    :param str bert_tokenizer: path of the folder containing the vocabulary used by the BERT tokenizer
    :param int cut_text: truncate content longer than cut_text. Defaults to None, i.e. no truncation.
    :param bool get_index: whether to convert the text to indices using the vocabulary
    :param int auto_pad_length: pad the text to this length (longer text is truncated). Defaults to no
        automatic padding.
    :param str auto_pad_token: the token used for automatic padding
    :param set_input: if True, fields whose names contain Const.INPUT are automatically set as input; if
        False, no field is set as input. If a str or List[str] is given, the corresponding fields are set
        as input and all other fields are not. Defaults to True.
    :param set_target: controls which fields are set as target, with the same semantics as set_input.
        Defaults to True.
    :param concat: whether to concatenate the two sentences. If False, they are not concatenated. If True,
        a <sep> token is inserted between them. If a list of length 4 is given, its elements are the
        markers inserted before the first sentence, after the first sentence, before the second sentence
        and after the second sentence, respectively. If the string ``bert`` is given, BERT-style
        concatenation is used, equivalent to ['[CLS]', '[SEP]', '', '[SEP]'].
    :return:
    """
    if isinstance(set_input, str):
        set_input = [set_input]
    if isinstance(set_target, str):
        set_target = [set_target]
    if isinstance(set_input, bool):
        auto_set_input = set_input
    else:
        auto_set_input = False
    if isinstance(set_target, bool):
        auto_set_target = set_target
    else:
        auto_set_target = False

    if isinstance(paths, str):
        if os.path.isdir(paths):
            path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
        else:
            path = {dataset_name if dataset_name is not None else 'train': paths}
    else:
        path = paths

    data_info = DataBundle()
    for data_name in path.keys():
        data_info.datasets[data_name] = self._load(path[data_name])

    for data_name, data_set in data_info.datasets.items():
        if auto_set_input:
            data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
        if auto_set_target:
            if Const.TARGET in data_set.get_field_names():
                data_set.set_target(Const.TARGET)

    if to_lower:
        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                           new_field_name=Const.INPUTS(0), is_input=auto_set_input)
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                           new_field_name=Const.INPUTS(1), is_input=auto_set_input)

    if bert_tokenizer is not None:
        if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
            # check whether the directory exists
        elif os.path.isdir(bert_tokenizer):
            model_dir = bert_tokenizer
        else:
            raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

        words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
        with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        words_vocab.add_word_lst(lines)
        words_vocab.build_vocab()

        tokenizer = BertTokenizer.from_pretrained(model_dir)

        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)

    if isinstance(concat, bool):
        concat = 'default' if concat else None
    if concat is not None:
        if isinstance(concat, str):
            CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                          'default': ['', '<sep>', '', '']}
            if concat.lower() in CONCAT_MAP:
                concat = CONCAT_MAP[concat]
            else:
                concat = 4 * [concat]
        assert len(concat) == 4, \
            f'Please provide a list with 4 symbols: the symbol at the beginning of the first sentence, ' \
            f'the end of the first sentence, the beginning of the second sentence, and the end of the ' \
            f'second sentence. Your input is {concat}'

        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] +
                                     [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                           new_field_name=Const.INPUT)
            data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                           new_field_name=Const.INPUT, is_input=auto_set_input)

    if seq_len_type is not None:
        if seq_len_type == 'seq_len':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'mask':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: [1] * len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'bert':
            for data_name, data_set in data_info.datasets.items():
                if Const.INPUT not in data_set.get_field_names():
                    raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                                   f'got {data_set.get_field_names()}')
                data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
                               new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input)
                data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                               new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

    if auto_pad_length is not None:
        cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length)

    if cut_text is not None:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
                    data_set.apply(lambda x: x[fields][:cut_text], new_field_name=fields,
                                   is_input=auto_set_input)

    data_set_list = [d for n, d in data_info.datasets.items()]
    assert len(data_set_list) > 0, f'There are NO data sets in data info!'

    if bert_tokenizer is None:
        words_vocab = Vocabulary(padding=auto_pad_token)
        words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                               field_name=[n for n in data_set_list[0].get_field_names()
                                                           if (Const.INPUT in n)],
                                               no_create_entry_dataset=[d for n, d in data_info.datasets.items()
                                                                        if 'train' not in n])
    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                             field_name=Const.TARGET)
    data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

    if get_index:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]],
                                   new_field_name=fields, is_input=auto_set_input)

            if Const.TARGET in data_set.get_field_names():
                data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]),
                               new_field_name=Const.TARGET,
                               is_input=auto_set_input, is_target=auto_set_target)

    if auto_pad_length is not None:
        if seq_len_type == 'seq_len':
            raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, '
                               f'so the seq_len_type cannot be `{seq_len_type}`!')
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] *
                                   (auto_pad_length - len(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)
                elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'):
                    data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)

    for data_name, data_set in data_info.datasets.items():
        if isinstance(set_input, list):
            data_set.set_input(*[inputs for inputs in set_input
                                 if inputs in data_set.get_field_names()])
        if isinstance(set_target, list):
            data_set.set_target(*[target for target in set_target
                                  if target in data_set.get_field_names()])

    return data_info
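# Illustrative sketch of the ``bert`` seq_len_type above: for a pair (sent1, sent2) concatenated as
# [CLS] sent1 [SEP] sent2 [SEP], the first generated field holds the segment ids (0 for
# [CLS]+sent1+[SEP], 1 for sent2+[SEP]) and the second is an all-ones attention mask of the same
# length. The helper below is a standalone restatement, not part of the loader.
def _demo_bert_seq_len(sent1, sent2):
    segment_ids = [0] * (len(sent1) + 2) + [1] * (len(sent2) + 1)
    attention_mask = [1] * len(segment_ids)
    return segment_ids, attention_mask

_seg, _mask = _demo_bert_seq_len(["a", "b"], ["c"])
assert _seg == [0, 0, 0, 0, 1, 1]
assert _mask == [1, 1, 1, 1, 1, 1]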