def process(self, paths, train_ds: Iterable[str] = None, src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None, src_embed_op: EmbeddingOption = None):
    input_name, target_name = 'words', 'target'
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()
    src_vocab.from_dataset(*_train_ds, field_name=input_name)
    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(*info.datasets.values(), field_name=target_name, new_field_name=target_name)
    info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

    if src_embed_op is not None:
        src_embed_op.vocab = src_vocab
        init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
        info.embeddings[input_name] = init_emb

    for name, dataset in info.datasets.items():
        dataset.set_input(input_name)
        dataset.set_target(target_name)

    return info
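
# Usage sketch (assumptions: this `process` belongs to a fastNLP-style loader whose
# `load(paths)` returns {split_name: DataSet}; `loader` and the file paths below are
# placeholders, not part of the original code).
def build_classification_bundle(loader, train_path, dev_path):
    """Fit the vocabularies on the training split only and index every split."""
    bundle = loader.process(paths={'train': train_path, 'dev': dev_path},
                            train_ds=['train'])
    words_vocab = bundle.vocabs['words']      # source vocabulary (with <unk>/<pad>)
    target_vocab = bundle.vocabs['target']    # label vocabulary (no <unk>/<pad>)
    return bundle, words_vocab, target_vocab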
def process(self, paths: Union[str, Dict[str, str]], src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None, src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        # Split each lower-cased word into characters; words are separated by an
        # empty-string token, and the trailing separator is dropped.
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}

    # Optionally split the input into character form.
    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
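
# Usage sketch (assumptions: `loader` is an instance of the class defining the `process`
# above; the split paths are placeholders).
def build_char_level_bundle(loader, train_path, dev_path):
    """Build a DataBundle and additionally derive a character-level 'chars' field."""
    bundle = loader.process(paths={'train': train_path, 'dev': dev_path},
                            char_level_op=True)
    # 'words' and 'target' are indexed in place; 'chars' still holds raw characters
    # and would need its own vocabulary before being fed to a model.
    return bundle, bundle.vocabs['words'], bundle.vocabs['target']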
def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Const.TARGET, vocabulary=None):
    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]

    for input_field_name in input_field_names:
        if vocabulary is None:
            src_vocab = Vocabulary()
            src_vocab.from_dataset(
                *[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                field_name=input_field_name,
                no_create_entry_dataset=[
                    ds for name, ds in data_bundle.iter_datasets()
                    if ('train' not in name) and ds.has_field(input_field_name)
                ])
        else:
            src_vocab = vocabulary
        src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)

    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(
            *[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
            field_name=target_field_name,
            no_create_entry_dataset=[
                ds for name, ds in data_bundle.iter_datasets()
                if ('train' not in name) and ds.has_field(target_field_name)
            ])
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                       f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                       f"data set but not in train data set!\n" \
                       f"These label(s) are {tgt_vocab._no_create_word}"
            print(warn_msg)
        tgt_vocab.index_dataset(
            *[ds for ds in data_bundle.datasets.values() if ds.has_field(target_field_name)],
            field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)

    return data_bundle
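
# Minimal sketch of driving `_indexize` with an in-memory bundle (the toy data below is
# illustrative only; the real pipes pass bundles produced by their loaders).
from fastNLP import DataSet
from fastNLP.io import DataBundle

train = DataSet({'words': [['a', 'b'], ['b', 'c']], 'target': ['pos', 'neg']})
dev = DataSet({'words': [['c', 'd']], 'target': ['pos']})
bundle = _indexize(DataBundle(datasets={'train': train, 'dev': dev}))
# 'words' and 'target' now hold indices; dev-only words are added as no-create-entry
# words, and a warning would be printed if dev contained labels unseen in train.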
def setup(self, stage="train"):
    if stage == 'train':
        data = self.hparams.data

        # Build the datasets.
        # Note: `word`, `pos`, `head` and `seq_len` are field-name constants defined elsewhere in the module.
        # indexes: the i-th columns of the CoNLL file; they depend on your file and may need modification.
        loader = ConllLoader([word, pos, head], indexes=[1, 3, 6])
        train_dataset = loader._load(data.train_file)
        val_dataset = loader._load(data.val_file)
        test_dataset = loader._load(data.test_file)

        def clean_word(words):
            def clean_number(word):
                def is_number(s):
                    try:
                        float(s)
                        return True
                    except ValueError:
                        return False
                if is_number(word):
                    return '0'
                else:
                    return word
            # import re
            # def clean_number(w):
            #     new_w = re.sub('[0-9]{1,}([,.]?[0-9]*)*', '0', w)
            #     return new_w
            return [clean_number(word) for word in words]

        def numerize(heads):
            return [int(head) for head in heads]

        train_dataset.apply_field(clean_word, word, new_field_name=word)
        val_dataset.apply_field(clean_word, word, new_field_name=word)
        test_dataset.apply_field(clean_word, word, new_field_name=word)
        test_dataset.apply_field(numerize, head, new_field_name=head)

        train_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
        val_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
        test_dataset.add_seq_len(field_name=word, new_field_name=seq_len)

        pos_vocab = Vocabulary()
        pos_vocab.from_dataset(train_dataset, field_name=pos)

        if data.wordposastoken:
            '''
            Combine the POS tag and the word into a single token. Broadly speaking, we build
            the vocabulary based on the co-occurrence of (POS, word) pairs such as (NT, 'word'),
            and then replace every unknown word with its corresponding POS tag.
            Please refer to "Dependency Grammar Induction with Neural Lexicalization and
            Big Training Data" for details.
            '''
            def combine(x):
                sent = list(zip(x[pos], x[word]))
                return [x[0] + "_" + x[1] for x in sent]

            train_dataset.apply(combine, new_field_name=word)
            val_dataset.apply(combine, new_field_name=word)
            test_dataset.apply(combine, new_field_name=word)

            word_vocab = Vocabulary(min_freq=data.min_freq)
            word_vocab.from_dataset(train_dataset, field_name=word)

            '''
            Replace unknown words with their POS tags.
            '''
            word_vocab.add_word_lst(pos_vocab.word2idx)
            word_vocab.index_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(val_dataset, field_name=word)
            word_vocab.index_dataset(test_dataset, field_name=word)

            unk = 1

            def replace(x):
                poses = x[pos]
                words = x[word]
                for i in range(len(words)):
                    # 1 stands for unk: replace the unknown word with its POS tag.
                    if words[i] == unk:
                        pos_tag_name = poses[i]
                        words[i] = word_vocab[pos_tag_name]
                return words

            train_dataset.apply(replace, new_field_name=word)
            val_dataset.apply(replace, new_field_name=word)
            test_dataset.apply(replace, new_field_name=word)

            if data.use_emb:
                if data.emb_type == 'fasttext':
                    model = FastText.load(data.embedding)
                else:
                    raise NotImplementedError
                word_vec = model.wv
                emb = np.random.rand(len(word_vocab), data.word_emb_size)
                for idx, w in word_vocab.idx2word.items():
                    if "_" in w:
                        w = w.split('_')[-1]
                    emb[idx] = word_vec[w]
                emb = torch.from_numpy(emb)
                self.pretrained_emb = emb.to(self.device).float()

            word2pos = np.zeros(shape=(len(word_vocab),))
            # Match each token in the vocabulary with its corresponding POS tag.
            for idx, w in word_vocab.idx2word.items():
                if idx == 0:
                    continue
                if idx == 1:
                    word2pos[1] = 1
                    continue
                if "_" in w:
                    pos_tag_name = w.split("_")[0]
                    word2pos[idx] = pos_vocab.word2idx[pos_tag_name]
                else:
                    word2pos[idx] = pos_vocab.word2idx[w]
            self.word2pos = torch.from_numpy(word2pos).long().to(self.device)

        # If we do not combine POS tag and word into a single token:
        else:
            # Create the vocabulary either with a fixed size or based on word frequency.
            if data.vocab_type == 'max_size':
                word_vocab = Vocabulary(max_size=data.vocab_size)
            else:
                word_vocab = Vocabulary(min_freq=data.min_freq)
            word_vocab.from_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(val_dataset, field_name=word)
            word_vocab.index_dataset(test_dataset, field_name=word)

        train_dataset.set_input(pos, word, seq_len)
        val_dataset.set_input(pos, word, seq_len)
        test_dataset.set_input(pos, word, seq_len)
        test_dataset.set_target(head)

        pos_vocab.index_dataset(train_dataset, field_name=pos)
        pos_vocab.index_dataset(val_dataset, field_name=pos)
        pos_vocab.index_dataset(test_dataset, field_name=pos)

        train_dataset_init = None

        '''
        Use an external unsupervised parser's parse result as a "pseudo gold tree"
        to initialize our model.
        '''
        if self.hparams.train.initializer == 'external':
            # The column indexes depend on your file format.
            conll_loader = ConllLoader([word, pos, head], indexes=[1, 4, 6])
            train_dataset_init = conll_loader._load(data.external_parser)
            train_dataset_init.add_seq_len(field_name=word, new_field_name=seq_len)
            train_dataset_init.apply_field(clean_word, word, new_field_name=word)
            train_dataset_init.apply_field(numerize, head, new_field_name=head)

            if not data.wordposastoken:
                word_vocab.index_dataset(train_dataset_init, field_name=word)
            else:
                train_dataset_init.apply(combine, new_field_name=word)
                word_vocab.index_dataset(train_dataset_init, field_name=word)
                train_dataset_init.apply(replace, new_field_name=word)

            pos_vocab.index_dataset(train_dataset_init, field_name=pos)

            if self.hparams.joint_training:
                import copy
                train_dataset_init_for_model2 = copy.deepcopy(train_dataset_init)

            # first-order model
            if (self.hparams.model.model_name == 'NeuralDMV') or (self.hparams.model.model_name == 'LexicalizedNDMV'):
                rule_generator = RuleGenerator1o()
            # second-order model
            elif self.hparams.model.model_name == 'SiblingNDMV':
                rule_generator = RuleGeneratorSib()
            elif self.hparams.model.model_name == 'JointFirstSecond':
                rule_generator = RuleGenerator1o()
                rule_generator_for_model2 = RuleGeneratorSib()
            else:
                raise NameError

            self.setup_init_dataset(train_dataset_init, rule_generator)
            if self.hparams.joint_training:
                self.setup_init_dataset(train_dataset_init_for_model2, rule_generator_for_model2)

        elif self.hparams.train.initializer == 'km':
            train_dataset_init = train_dataset

        self.pos_vocab = pos_vocab
        self.word_vocab = word_vocab
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.train_dataset_init = train_dataset_init
        if self.hparams.joint_training:
            self.train_dataset_init_for_model2 = train_dataset_init_for_model2

    else:
        raise NotImplementedError
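
# Usage sketch (assumptions: the `setup` above belongs to a PyTorch-Lightning-style
# DataModule; `dm` is a hypothetical, already-constructed instance of that class).
def prepare_dependency_data(dm):
    """Run setup() and report what it produced."""
    dm.setup(stage='train')
    print(f"word vocab size: {len(dm.word_vocab)}, pos vocab size: {len(dm.pos_vocab)}")
    print(f"splits: {len(dm.train_dataset)} train / {len(dm.val_dataset)} val / {len(dm.test_dataset)} test")
    return dm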