def __init__(self, heuristic_scores_file="", collect_stats_strategy='best'):
    """Creates a new ``StatsHeuristic`` and selects its unigram table.

    The chosen table is stored in ``self.estimates``. A non-empty
    ``heuristic_scores_file`` takes precedence over
    ``collect_stats_strategy``. If no file is given and the strategy is
    not recognized, an error is logged and ``self.estimates`` is not
    assigned by this constructor.

    Args:
        heuristic_scores_file (string): Path to the unigram scores
            which are used if this predictor estimates future costs.
        collect_stats_strategy (string): 'best', 'full', or 'all'.
            Defines how unigram estimates are collected for the
            heuristic.
    """
    super(StatsHeuristic, self).__init__()

    # An explicit scores file always wins over the collection strategy.
    if heuristic_scores_file:
        self.estimates = FileUnigramTable(heuristic_scores_file)
        return

    if collect_stats_strategy == 'best':
        self.estimates = BestStatsUnigramTable()
        return
    if collect_stats_strategy == 'full':
        self.estimates = FullStatsUnigramTable()
        return
    if collect_stats_strategy == 'all':
        self.estimates = AllStatsUnigramTable()
        return

    # Nothing matched: report it, but do not abort construction.
    logging.error("Unknown statistics collection strategy")
def __init__(self,
             trg_test_file,
             accept_subsets=False,
             accept_duplicates=False,
             heuristic_scores_file="",
             collect_stats_strategy='best',
             heuristic_add_consumed=False,
             heuristic_add_remaining=True,
             diversity_heuristic_factor=-1.0,
             equivalence_vocab=-1):
    """Creates a new bag-of-words predictor.

    Reads all target sentences into ``self.lines``, selects the unigram
    table stored in ``self.estimates`` and records the remaining
    options as attributes. A non-empty ``heuristic_scores_file`` takes
    precedence over ``collect_stats_strategy``; an unrecognized
    strategy is only logged as an error and leaves ``self.estimates``
    unassigned.

    Args:
        trg_test_file (string): Path to the plain text file with the
            target sentences. Must have the same number of lines as
            the number of source sentences to decode. The word order
            in the target sentences is not relevant for this
            predictor.
        accept_subsets (bool): If true, this predictor permits EOS
            even if the bag is not fully consumed yet.
        accept_duplicates (bool): If true, counts are not updated when
            a word is consumed, i.e. a word in a bag may appear
            multiple times.
        heuristic_scores_file (string): Path to the unigram scores
            which are used if this predictor estimates future costs.
        collect_stats_strategy (string): 'best', 'full', or 'all'.
            Defines how unigram estimates are collected for the
            heuristic.
        heuristic_add_consumed (bool): Set to true to add the
            difference between the actual partial score and the
            unigram estimates of consumed words to the predictor
            heuristic.
        heuristic_add_remaining (bool): Set to true to add the sum of
            unigram scores of words remaining in the bag to the
            predictor heuristic.
        diversity_heuristic_factor (float): Factor for the diversity
            heuristic which penalizes hypotheses with the same bag as
            full hypos. The diversity heuristic is switched on only
            for values greater than zero.
        equivalence_vocab (int): If positive, predictor states are
            considered equal if the remaining words within that vocab
            and the OOVs regarding this vocab are the same. Only
            relevant when using hypothesis recombination.
    """
    super(BagOfWordsPredictor, self).__init__()

    # One entry per target sentence, without line terminators.
    with open(trg_test_file) as target_file:
        raw_text = target_file.read()
    self.lines = raw_text.splitlines()

    if heuristic_scores_file:
        # An explicit scores file wins over the collection strategy.
        self.estimates = FileUnigramTable(heuristic_scores_file)
    else:
        known_strategies = (
            ('best', BestStatsUnigramTable),
            ('full', FullStatsUnigramTable),
            ('all', AllStatsUnigramTable),
        )
        for strategy_name, table_cls in known_strategies:
            if collect_stats_strategy == strategy_name:
                self.estimates = table_cls()
                break
        else:
            # No strategy matched: report it, but keep constructing.
            logging.error("Unknown statistics collection strategy")

    # Bag consumption behavior.
    self.accept_subsets = accept_subsets
    self.accept_duplicates = accept_duplicates
    never_accepts_eos = accept_duplicates and not accept_subsets
    if never_accepts_eos:
        logging.error("You enabled bow_accept_duplicates but not bow_"
                      "accept_subsets. Therefore, the bow predictor will "
                      "never accept end-of-sentence and could cause "
                      "an infinite loop in the search strategy.")

    # Heuristic configuration.
    self.heuristic_add_consumed = heuristic_add_consumed
    self.heuristic_add_remaining = heuristic_add_remaining
    self.equivalence_vocab = equivalence_vocab
    self.diversity_heuristic_factor = diversity_heuristic_factor
    self.diverse_heuristic = diversity_heuristic_factor > 0.0