Exemplo n.º 1
0
 def __init__(self,
              heuristic_scores_file="",
              collect_stats_strategy='best'):
     """Set up a ``StatsHeuristic`` and select its unigram table.

     The table is stored in ``self.estimates``. A non-empty
     ``heuristic_scores_file`` takes precedence over
     ``collect_stats_strategy``.

     Args:
         heuristic_scores_file (string): Path to the unigram scores
                                         which are used if this
                                         predictor estimates future
                                         costs. When empty, the table
                                         is chosen via
                                         ``collect_stats_strategy``
         collect_stats_strategy (string): best, full, or all. Defines
                                          how unigram estimates are
                                          collected for heuristic

     Note:
         An unrecognized strategy is only reported through
         ``logging.error``; ``self.estimates`` is not assigned in that
         case.
     """
     super(StatsHeuristic, self).__init__()

     # An explicit scores file wins over any collection strategy.
     if heuristic_scores_file:
         self.estimates = FileUnigramTable(heuristic_scores_file)
         return

     # Strategy name -> table class, checked in the listed order. Only
     # the matching class is instantiated.
     strategies = (
         ('best', BestStatsUnigramTable),
         ('full', FullStatsUnigramTable),
         ('all', AllStatsUnigramTable),
     )
     for strategy_name, table_cls in strategies:
         if collect_stats_strategy == strategy_name:
             self.estimates = table_cls()
             return

     logging.error("Unknown statistics collection strategy")
Exemplo n.º 2
0
 def __init__(self,
              trg_test_file,
              accept_subsets=False,
              accept_duplicates=False,
              heuristic_scores_file="",
              collect_stats_strategy='best',
              heuristic_add_consumed=False,
              heuristic_add_remaining=True,
              diversity_heuristic_factor=-1.0,
              equivalence_vocab=-1):
     """Build a bag-of-words predictor from a target sentence file.

     The target file is read completely and kept as a list of lines in
     ``self.lines``. The unigram table used for heuristic estimates is
     stored in ``self.estimates``; a non-empty
     ``heuristic_scores_file`` takes precedence over
     ``collect_stats_strategy``.

     Args:
         trg_test_file (string): Path to the plain text file with
                                 the target sentences. Must have the
                                 same number of lines as the number
                                 of source sentences to decode. The
                                 word order in the target sentences
                                 is not relevant for this predictor.
         accept_subsets (bool): If true, this predictor permits
                                EOS even if the bag is not fully
                                consumed yet
         accept_duplicates (bool): If true, counts are not updated
                                   when a word is consumed. This
                                   means that we allow a word in a
                                   bag to appear multiple times
         heuristic_scores_file (string): Path to the unigram scores
                                         which are used if this
                                         predictor estimates future
                                         costs
         collect_stats_strategy (string): best, full, or all. Defines
                                          how unigram estimates are
                                          collected for heuristic
         heuristic_add_consumed (bool): Set to true to add the
                                        difference between actual
                                        partial score and unigram
                                        estimates of consumed words
                                        to the predictor heuristic
         heuristic_add_remaining (bool): Set to true to add the sum
                                         of unigram scores of words
                                         remaining in the bag to the
                                         predictor heuristic
         diversity_heuristic_factor (float): Factor for diversity
                                             heuristic which
                                             penalizes hypotheses
                                             with the same bag as
                                             full hypos. The
                                             diversity heuristic is
                                             enabled only if this is
                                             greater than zero
         equivalence_vocab (int): If positive, predictor states are
                                  considered equal if the remaining
                                  words within that vocab and OOVs
                                  regarding this vocab are the same.
                                  Only relevant when using hypothesis
                                  recombination

     Note:
         An unrecognized ``collect_stats_strategy`` is only reported
         through ``logging.error``; ``self.estimates`` is not assigned
         in that case. Enabling ``accept_duplicates`` without
         ``accept_subsets`` is likewise only logged as an error.
     """
     super(BagOfWordsPredictor, self).__init__()

     # Load all target sentences up front, one list entry per line.
     with open(trg_test_file) as trg_file:
         content = trg_file.read()
     self.lines = content.splitlines()

     # Pick the unigram table: an explicit scores file wins, otherwise
     # scan the known strategies in order and instantiate the match.
     if heuristic_scores_file:
         self.estimates = FileUnigramTable(heuristic_scores_file)
     else:
         strategies = (
             ('best', BestStatsUnigramTable),
             ('full', FullStatsUnigramTable),
             ('all', AllStatsUnigramTable),
         )
         for strategy_name, table_cls in strategies:
             if collect_stats_strategy == strategy_name:
                 self.estimates = table_cls()
                 break
         else:
             logging.error("Unknown statistics collection strategy")

     # Plain configuration flags.
     self.accept_subsets = accept_subsets
     self.accept_duplicates = accept_duplicates
     self.heuristic_add_consumed = heuristic_add_consumed
     self.heuristic_add_remaining = heuristic_add_remaining
     self.equivalence_vocab = equivalence_vocab

     # Duplicates without subsets means the bag can never be emptied,
     # so the configuration is flagged (but not rejected).
     if accept_duplicates and not accept_subsets:
         logging.error(
             "You enabled bow_accept_duplicates but not "
             "bow_accept_subsets. Therefore, the bow predictor "
             "will never accept end-of-sentence and could cause "
             "an infinite loop in the search strategy.")

     # The diversity heuristic is active only for a positive factor.
     self.diversity_heuristic_factor = diversity_heuristic_factor
     self.diverse_heuristic = diversity_heuristic_factor > 0.0