def alternate_model_score(self, beam, hyp_word, prediction_score):
    """Re-score a beam hypothesis by mixing the decoder's own score with a
    second ("alternate") model's length-normalized score for the hypothesis.

    Parameters
    ----------
    beam : object with `source_text` (token list containing a "|||" separator),
        `_bos`/`_eos` ids — assumed to come from the surrounding beam-search
        code; TODO confirm exact type.
    hyp_word : list of target-side tokens forming the current hypothesis.
    prediction_score : float, the base model's score for this hypothesis.

    Returns
    -------
    float: weighted combination of `prediction_score` and the alternate
    model's normalized score.
    """
    # Source text is everything before the "|||" separator token.
    separator_index = beam.source_text.index("|||")
    src_text = ' '.join(beam.source_text[:separator_index])
    tgt_text = ' '.join(hyp_word)
    example = {"src":src_text, "tgt":tgt_text}
    # Build the per-example extended vocabulary / copy map for this pair.
    src_ex_vocab, ex_dict = _dynamic_dict(example, self.src_field, self.tgt_field)
    src_map = ex_dict["src_map"]
    # Get the source
    src_list = self.src_to_index(beam.source_text)  # beam's source is target here
    hyp_list = self.tgt_to_index(hyp_word)
    # Wrap the hypothesis with BOS/EOS before converting to a LongTensor.
    hyp_list.insert(0, beam._bos)
    hyp_list.append(beam._eos)
    hyp_data = self.tt.LongTensor(hyp_list)
    src_lengths = self.tt.LongTensor([len(src_list) - 1])
    # Remove EOS from the src which is the src to the MMI model
    src = self.tt.LongTensor(src_list[:-1])
    # Add a batch dimension of size 1 (in-place) so shapes match _run_target.
    hyp_data.unsqueeze_(1)
    src.unsqueeze_(1)
    # Length-normalize the alternate model's score by the hypothesis length
    # (including BOS/EOS).
    score = self._run_target(src, hyp_data, src_lengths, src_map) / float(len(hyp_list))
    # NOTE(review): `score` is divided by a length here AND again in the
    # return below (`/ float(len(hyp_list)-1)`) — double normalization looks
    # unintentional; confirm which one is wanted.
    # NOTE(review): attribute is spelled `prediction_score_multiper`
    # (sic, "multiplier"?) — verify it matches the attribute set in __init__.
    return self.prediction_score_multiper * prediction_score + self.alternate_model_score_multiplier * score / float(len(hyp_list)-1)
def _maybe_add_dynamic_dict(self, example, fields):
    """Attach copy-mechanism fields to `example` when the dataset uses them.

    `_dynamic_dict` is invoked only when both 'src_map' and 'alignment'
    are declared in `fields`; otherwise the example is returned untouched.
    """
    wants_copy_fields = 'src_map' in fields and 'alignment' in fields
    if not wants_copy_fields:
        return example
    return _dynamic_dict(
        example,
        fields['src'].base_field,
        fields['tgt'].base_field,
    )
def __init__(self, fields, readers, data, dirs, sort_key, filter_pred=None, tgt_type=None):
    """Build a keyphrase dataset: read raw examples, attach copy-mechanism
    fields (`src_map`/`alignment`) to every example, and hand the resulting
    `Example`s to the torchtext-style parent constructor.

    Parameters are the standard reader/field plumbing; `tgt_type` controls
    how multiple target sequences are arranged (set by the callers noted
    below), and `filter_pred` is forwarded to the parent class.
    """
    # this is set at line 594 in inputter.py and line 303 in translator.py
    self.tgt_type = tgt_type
    # concatenate multiple tgt sequences with <sep> or keep them separate as a list of seqs (2D tensor)
    self.concat_tgt = False
    # will be specified before training, one of [one2one, original, random, verbatim]
    self.sort_key = sort_key
    # build src_map/alignment no matter field is available
    can_copy = True
    # One lazy iterator per reader; zipped below so each example is the
    # union of the per-reader dicts.
    read_iters = [r.read(dat[1], dat[0], dir_) for r, dat, dir_ in zip(readers, data, dirs)]
    # self.src_vocabs is used in collapse_copy_scores and Translator.py
    self.src_vocabs = []
    examples = []
    for ex_dict in starmap(_join_dicts, zip(*read_iters)):
        if can_copy:
            src_field = fields['src']
            tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            # NOTE(review): return order here is (vocab, dict); the sibling
            # builder unpacks (dict, vocab, stemmed_vocab) — confirm which
            # `_dynamic_dict` signature this file actually uses.
            src_ex_vocab, ex_dict = _dynamic_dict(ex_dict, src_field.base_field, tgt_field.base_field)
            self.src_vocabs.append(src_ex_vocab)
        # Keep only fields the example actually has, in the (name, field)
        # list-of-pairs shape Example.fromdict expects.
        ex_fields = {k: [(k, v)] for k, v in fields.items() if k in ex_dict}
        ex = Example.fromdict(ex_dict, ex_fields)
        examples.append(ex)
    # fields needs to have only keys that examples have as attrs
    # NOTE(review): `ex_fields` is read after the loop, so an empty example
    # stream raises NameError here; also this relies on the LAST example
    # having every field — TODO confirm both are acceptable upstream.
    fields = []
    for _, nf_list in ex_fields.items():
        assert len(nf_list) == 1
        fields.append(nf_list[0])
    super(KeyphraseDataset, self).__init__(examples, fields, filter_pred)
def build_dynamic_dict_and_masks_parallel(read_iters, fields, boseos_added, alignment_loss, alignment_targets, multi_process=False):
    """Run `_dynamic_dict` over every joined example dict, optionally with a
    worker pool, collecting the processed dicts and per-example source vocabs.

    Returns (ex_dicts, src_vocabs, stemmed_src_vocabs).

    NOTE(review): the two branches disagree — the multi-process branch
    unpacks `_dynamic_dict` as (vocab, example), never fills
    `stemmed_src_vocabs`, and does not forward the alignment kwargs, while
    the single-process branch unpacks (dict, vocab, stemmed_vocab) and does.
    Confirm `_dynamic_dict`'s actual signature; the multi-process path looks
    stale.
    """
    src_vocabs = []
    stemmed_src_vocabs = []
    ex_dicts = []
    if multi_process:
        # Bind the field arguments once so the pool only pickles example dicts.
        partial_fn = partial(_dynamic_dict, src_field=fields['src'].base_field, tgt_field=fields['tgt'].base_field, boseos_added=boseos_added)
        with Pool(processes=4) as pool:
            # imap keeps results in input order while streaming them lazily.
            for src_ex_vocab, example in tqdm(
                    pool.imap(partial_fn, starmap(_join_dicts, zip(*read_iters))),
                    desc='Preparing src and tgt w/ multi-processing (tokenizing and field tokens)'):
                src_vocabs.append(src_ex_vocab)
                ex_dicts.append(example)
        """
        print('Processing news examples w/ multiple processing (building dynamic_dict)')
        start_time = time.clock()
        pool = Pool()
        processed_list = pool.map(partial(_dynamic_dict, src_field=fields['src'].base_field, tgt_field=fields['tgt'].base_field), starmap(_join_dicts, zip(*read_iters)))
        end_time = time.clock()
        src_vocabs = [i[0] for i in processed_list]
        ex_dicts = [i[1] for i in processed_list]
        print("Process finished, elapsed time=%.4f, speed=%.2f it/s" % (end_time-start_time, len(processed_list)/(end_time-start_time)))
        """
    else:
        for ex_dict in tqdm(
                starmap(_join_dicts, zip(*read_iters)),
                desc='Processing news examples w/ single processing (building dynamic_dict)'):
            # Fields may or may not be wrapped (multi-field vs. plain Field).
            if hasattr(fields['src'], 'base_field'):
                src_field = fields['src'].base_field
                tgt_field = fields['tgt'].base_field
            else:
                src_field = fields['src']
                tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            ex_dict, src_ex_vocab, stemmed_src_ex_vocab = _dynamic_dict(ex_dict, src_field, tgt_field, boseos_added=boseos_added, alignment_loss=alignment_loss, alignment_targets=alignment_targets)
            src_vocabs.append(src_ex_vocab)
            stemmed_src_vocabs.append(stemmed_src_ex_vocab)
            ex_dicts.append(ex_dict)
    return ex_dicts, src_vocabs, stemmed_src_vocabs