Example #1
 def alternate_model_score(self, beam, hyp_word, prediction_score):
     # The stored source text contains both sides separated by "|||":
     # tokens before the separator are the source, the current hypothesis
     # is the target for the alternate model.
     separator_index = beam.source_text.index("|||")
     src_text = ' '.join(beam.source_text[:separator_index])
     tgt_text = ' '.join(hyp_word)
     example = {"src": src_text, "tgt": tgt_text}

     # Build the per-example extended vocabulary and the copy-attention src_map.
     src_ex_vocab, ex_dict = _dynamic_dict(example, self.src_field, self.tgt_field)
     src_map = ex_dict["src_map"]

     # Index the tokens for the alternate model (the beam's source acts as
     # the target here) and wrap the hypothesis in BOS/EOS.
     src_list = self.src_to_index(beam.source_text)
     hyp_list = self.tgt_to_index(hyp_word)
     hyp_list.insert(0, beam._bos)
     hyp_list.append(beam._eos)
     hyp_data = self.tt.LongTensor(hyp_list)
     src_lengths = self.tt.LongTensor([len(src_list) - 1])

     # Remove EOS from the src fed to the MMI model and add a batch dimension.
     src = self.tt.LongTensor(src_list[:-1])
     hyp_data.unsqueeze_(1)
     src.unsqueeze_(1)

     # Length-normalized score from the alternate (MMI) model.
     score = self._run_target(src, hyp_data, src_lengths, src_map) / float(len(hyp_list))

     # Interpolate the original prediction score with the alternate model score.
     return self.prediction_score_multiper * prediction_score + \
         self.alternate_model_score_multiplier * score / float(len(hyp_list) - 1)
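
For reference, here is a minimal sketch of what the _dynamic_dict call above produces, assuming the stock OpenNMT-py 1.x implementation and its matching torchtext release; the sentences, field construction, and import paths are illustrative and may differ in the fork this example is taken from.

from torchtext.data import Field  # torchtext.legacy.data.Field in newer releases
from onmt.inputters.dataset_base import _dynamic_dict

# Whitespace-tokenized text fields; real setups build these through the
# library's field-construction utilities.
src_field = Field()
tgt_field = Field()
example = {"src": "the cat sat on the mat", "tgt": "the mat"}

src_ex_vocab, example = _dynamic_dict(example, src_field, tgt_field)
# src_map holds one extended-vocab index per source token (length 6 here);
# alignment maps the target tokens into the same per-example vocab, with an
# extra slot on each side for BOS/EOS (length 4 here).
print(example["src_map"].size(), example["alignment"].size())
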
Example #2
 def _maybe_add_dynamic_dict(self, example, fields):
     """maybe update `example` with dynamic_dict related fields."""
     if 'src_map' in fields and 'alignment' in fields:
         example = _dynamic_dict(
             example,
             fields['src'].base_field,
             fields['tgt'].base_field)
     return example
Example #3
    def __init__(self,
                 fields,
                 readers,
                 data,
                 dirs,
                 sort_key,
                 filter_pred=None,
                 tgt_type=None):
        # this is set at line 594 in inputter.py and line 303 in translator.py
        self.tgt_type = tgt_type
        # concatenate multiple tgt sequences with <sep> or keep them separate as a list of seqs (2D tensor)
        self.concat_tgt = False
        self.sort_key = sort_key

        # tgt_type will be specified before training,
        # one of [one2one, original, random, verbatim]

        # build src_map/alignment regardless of whether the field is available
        can_copy = True

        read_iters = [
            r.read(dat[1], dat[0], dir_)
            for r, dat, dir_ in zip(readers, data, dirs)
        ]

        # self.src_vocabs is used in collapse_copy_scores and Translator.py
        self.src_vocabs = []
        examples = []
        for ex_dict in starmap(_join_dicts, zip(*read_iters)):
            if can_copy:
                src_field = fields['src']
                tgt_field = fields['tgt']
                # this assumes src_field and tgt_field are both text
                src_ex_vocab, ex_dict = _dynamic_dict(ex_dict,
                                                      src_field.base_field,
                                                      tgt_field.base_field)
                self.src_vocabs.append(src_ex_vocab)
            ex_fields = {
                k: [(k, v)]
                for k, v in fields.items() if k in ex_dict
            }
            ex = Example.fromdict(ex_dict, ex_fields)
            examples.append(ex)

        # fields needs to have only keys that examples have as attrs
        fields = []
        for _, nf_list in ex_fields.items():
            assert len(nf_list) == 1
            fields.append(nf_list[0])

        super(KeyphraseDataset, self).__init__(examples, fields, filter_pred)
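
The comment above notes that self.src_vocabs is later used by collapse_copy_scores and in Translator.py. The sketch below illustrates why one extended vocabulary is kept per example; the handles and the predicted index are hypothetical, and the index arithmetic follows the usual copy-mechanism convention rather than quoting a specific API.

# Hypothetical handles for illustration: `dataset` is the KeyphraseDataset
# built above and `tgt_vocab` is the shared target vocabulary.
src_ex_vocab = dataset.src_vocabs[0]     # extended vocab built by _dynamic_dict
pred_index = len(tgt_vocab) + 3          # assumed model output

# A predicted index at or beyond the shared vocab size denotes a copied token.
if pred_index >= len(tgt_vocab):
    word = src_ex_vocab.itos[pred_index - len(tgt_vocab)]   # copied source word
else:
    word = tgt_vocab.itos[pred_index]                        # regular vocab word
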
Example #4
def build_dynamic_dict_and_masks_parallel(read_iters,
                                          fields,
                                          boseos_added,
                                          alignment_loss,
                                          alignment_targets,
                                          multi_process=False):
    src_vocabs = []
    stemmed_src_vocabs = []
    ex_dicts = []
    if multi_process:
        # mirror the argument list and the return order of the
        # single-process branch below
        partial_fn = partial(_dynamic_dict,
                             src_field=fields['src'].base_field,
                             tgt_field=fields['tgt'].base_field,
                             boseos_added=boseos_added,
                             alignment_loss=alignment_loss,
                             alignment_targets=alignment_targets)
        with Pool(processes=4) as pool:
            for ex_dict, src_ex_vocab, stemmed_src_ex_vocab in tqdm(
                    pool.imap(partial_fn,
                              starmap(_join_dicts, zip(*read_iters))),
                    desc='Preparing src and tgt w/ multi-processing '
                         '(tokenizing and field tokens)'):
                src_vocabs.append(src_ex_vocab)
                stemmed_src_vocabs.append(stemmed_src_ex_vocab)
                ex_dicts.append(ex_dict)
    else:
        for ex_dict in tqdm(
                starmap(_join_dicts, zip(*read_iters)),
                desc=
                'Processing news examples w/ single processing (building dynamic_dict)'
        ):
            if hasattr(fields['src'], 'base_field'):
                src_field = fields['src'].base_field
                tgt_field = fields['tgt'].base_field
            else:
                src_field = fields['src']
                tgt_field = fields['tgt']
            # this assumes src_field and tgt_field are both text
            ex_dict, src_ex_vocab, stemmed_src_ex_vocab = _dynamic_dict(
                ex_dict,
                src_field,
                tgt_field,
                boseos_added=boseos_added,
                alignment_loss=alignment_loss,
                alignment_targets=alignment_targets)
            src_vocabs.append(src_ex_vocab)
            stemmed_src_vocabs.append(stemmed_src_ex_vocab)
            ex_dicts.append(ex_dict)

    return ex_dicts, src_vocabs, stemmed_src_vocabs
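
As a side note, here is a small self-contained sketch of the partial + Pool.imap pattern used in the multi-process branch above; the worker and the toy readers are stand-ins for _dynamic_dict and the dataset readers, not part of the original code.

from functools import partial
from itertools import starmap
from multiprocessing import Pool

def build_example_vocab(example, lowercase=False):
    """Toy worker standing in for _dynamic_dict: returns (vocab, example)."""
    text = example["src"].lower() if lowercase else example["src"]
    return sorted(set(text.split())), example

def join_dicts(*dicts):
    """Toy stand-in for _join_dicts: merge the per-reader dicts of one example."""
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged

if __name__ == "__main__":
    # Two parallel "readers", zipped the same way as zip(*read_iters) above.
    src_iter = [{"src": "Deep Learning for Keyphrase Generation"}]
    tgt_iter = [{"tgt": "keyphrase generation"}]
    worker = partial(build_example_vocab, lowercase=True)
    with Pool(processes=2) as pool:
        for vocab, ex in pool.imap(worker,
                                   starmap(join_dicts, zip(src_iter, tgt_iter))):
            print(vocab, ex["tgt"])
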