def write_hypos(self, all_hypos, sen_indices=None):
    """Writes the first-best hypothesis of each sentence as plain text.

    If ``self.f`` holds an open file object, the lines are appended to it
    and flushed after every sentence. Otherwise the file at ``self.path``
    is (re)created and all lines are written in one go.

    Args:
        all_hypos (list): One list of hypotheses per sentence. Only the
            first entry of each inner list is written.
        sen_indices (list): Not used by this handler.
    """
    if self.f is not None:
        for hypos in all_hypos:
            self.f.write(utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                              self.trg_wmap))
            self.f.write("\n")
            # Flush per sentence so partial output is visible while
            # decoding is still running.
            self.f.flush()
    else:
        with codecs.open(self.path, "w", encoding='utf-8') as f:
            for hypos in all_hypos:
                f.write(utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                             self.trg_wmap))
                f.write("\n")
            # No explicit flush: ``self.f`` is None in this branch (the
            # previous ``self.f.flush()`` raised AttributeError), and
            # leaving the ``with`` block closes and flushes ``f``.
def _process_inputs(sync_symbol=-1):
    """Reads and tokenizes all source files given on the command line.

    The first file is ``args.src_test``; further files are taken from
    ``args.src_test2``, ``args.src_test3``, ... until an attribute is
    missing or empty.

    Args:
        sync_symbol (int): Symbol id that is mapped with
            ``utils.apply_trg_wmap``. If the first character of its mapping
            is a blank, blanks in the input are kept as separate ``' '``
            tokens (note that a literal ``'|'`` in the input is turned into
            a blank as well).

    Returns:
        list. One entry per line of the first file; each entry is a list
        holding the tokenized line of every input file.
    """

    def _tokenize(line):
        # The mapping is looked up once per line, as before.
        if utils.apply_trg_wmap([sync_symbol])[0] == " ":
            marked = line.strip().replace(' ', ' | ')
            return [tok.replace('|', ' ') for tok in marked.split()]
        return line.strip().split()

    paths = [args.src_test]
    while True:
        extra = getattr(args, "src_test%d" % (len(paths) + 1), None)
        if not extra:
            break
        paths.append(extra)

    # Read all input files
    per_file = []
    for path in paths:
        with codecs.open(path, encoding='utf-8') as f:
            per_file.append([_tokenize(line) for line in f])

    # Gather multiple input sentences for each line. The first file
    # determines the number of sentences.
    return [[sentences[line_no] for sentences in per_file]
            for line_no in range(len(per_file[0]))]
def write_hypos(self, all_hypos, sen_indices=None):
    """Writes the first-best hypothesis of each sentence as plain text.

    When ``self.f`` is an open file object the sentences are written to it
    (flushing after each one). When ``self.f`` is None, ``self.path`` is
    opened for writing, filled, and closed again.

    Args:
        all_hypos (list): One list of hypotheses per sentence. Only the
            first entry of each inner list is written.
        sen_indices (list): Not used by this handler.
    """
    if self.f is not None:
        for hypos in all_hypos:
            self.f.write(
                utils.apply_trg_wmap(hypos[0].trgt_sentence, self.trg_wmap))
            self.f.write("\n")
            self.f.flush()
        return
    with codecs.open(self.path, "w", encoding='utf-8') as f:
        for hypos in all_hypos:
            f.write(
                utils.apply_trg_wmap(hypos[0].trgt_sentence, self.trg_wmap))
            f.write("\n")
        # ``self.f`` is None here, so the former ``self.f.flush()`` raised
        # AttributeError. Closing ``f`` at the end of the ``with`` block
        # already flushes it.
def decode(self, src_sentence):
    """Decodes a single source sentence using beam search.

    Starting from one empty partial hypothesis, the beam is expanded while
    ``self.stop_criterion`` holds and at most ``self.max_len + 1`` times.
    Hypotheses ending in ``utils.EOS_ID`` are carried over unchanged. If no
    hypothesis ends in EOS when the search stops, all remaining partial
    hypotheses are returned as full hypotheses.

    Args:
        src_sentence (list): Source sentence passed to
            ``initialize_predictors``.

    Returns:
        list. The result of ``self.get_full_hypos_sorted()``.
    """
    self.initialize_predictors(src_sentence)
    hypos = [PartialHypothesis(self.get_predictor_states())]
    it = 0
    while self.stop_criterion(hypos):
        if it > self.max_len:  # prevent infinite loops
            break
        it = it + 1
        next_hypos = []
        next_scores = []
        self.min_score = utils.NEG_INF
        self.best_scores = []
        # NOTE(review): the trace below goes to stdout on every iteration;
        # consider logging.debug if nothing depends on this output.
        print("HYPOS")
        for hypo in hypos:
            print("it%d: %s (%f)" % (
                it, utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
        for hypo in hypos:
            print("H: %s (%f)" % (
                utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
            if hypo.get_last_word() == utils.EOS_ID:
                # Finished hypotheses compete in the beam without expansion.
                next_hypos.append(hypo)
                next_scores.append(self._get_combined_score(hypo))
                continue
            for next_hypo in self._expand_hypo(hypo):
                next_score = self._get_combined_score(next_hypo)
                if next_score > self.min_score:
                    next_hypos.append(next_hypo)
                    next_scores.append(next_score)
                    self._register_score(next_score)
        if self.hypo_recombination:
            hypos = self._filter_equal_hypos(next_hypos, next_scores)
        else:
            hypos = self._get_next_hypos(next_hypos, next_scores)
    for hypo in hypos:
        if hypo.get_last_word() == utils.EOS_ID:
            self.add_full_hypo(hypo.generate_full_hypothesis())
    if not self.full_hypos:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("No complete hypotheses found for %s" % src_sentence)
        for hypo in hypos:
            self.add_full_hypo(hypo.generate_full_hypothesis())
    return self.get_full_hypos_sorted()
def write_hypos(self, all_hypos, sen_indices):
    """Writes all hypotheses in ``all_hypos`` to ``self.path`` as n-best list.

    Every hypothesis produces one line of the form
    ``<id> ||| <sentence> ||| <name>= <score> ... ||| <total score>`` where
    each predictor score is the sum of the first component of that
    predictor's entries in ``hypo.score_breakdown``.

    Args:
        all_hypos (list): One list of hypotheses per sentence.
        sen_indices (list): Sentence ids, parallel to ``all_hypos``.
    """
    with codecs.open(self.path, "w", encoding='utf-8') as f:
        for idx, hypos in zip(sen_indices, all_hypos):
            # ``idx`` comes from ``sen_indices`` only. The former
            # ``idx += 1`` was removed: it was overwritten by the next
            # ``zip`` item at best, and would have given hypotheses of one
            # sentence different ids at worst.
            for hypo in hypos:
                predictor_scores = ' '.join(
                    "%s= %f" % (name,
                                sum(s[i][0] for s in hypo.score_breakdown))
                    for i, name in enumerate(self.predictor_names))
                f.write("%d ||| %s ||| %s ||| %f" % (
                    idx,
                    utils.apply_trg_wmap(hypo.trgt_sentence, self.trg_wmap),
                    predictor_scores,
                    hypo.total_score))
                f.write("\n")
def consume(self, pred_id):
    """Feeds back ``pred_id`` to the decoder network.

    The id is appended to ``self.consumed``. The decoder is then re-run
    from its initial state over ``self.BEGIN`` followed by everything
    consumed so far, and the last resulting state is stored in
    ``self.output_state``.

    Args:
        pred_id (int): The symbol id to consume.
    """
    logging.debug(
        u'nmt consumed: {}'.format(utils.apply_trg_wmap([pred_id])))
    self.consumed.append(pred_id)
    history = [self.BEGIN] + self.consumed
    start_state = self.decoder.initial_state()
    embedded = [self.VOCAB_LOOKUP[symbol] for symbol in history]
    self.output_state = start_state.transduce(embedded)[-1]
def write_hypos(self, all_hypos, sen_indices):
    """Writes the n-best lists in ``all_hypos`` to ``self.path``.

    One line is written per hypothesis:
    ``<id> ||| <sentence> ||| <name>= <score> ... ||| <total score>``.
    The per-predictor score is the sum over ``hypo.score_breakdown`` of the
    first component of that predictor's entry.

    Args:
        all_hypos (list): One list of hypotheses per sentence.
        sen_indices (list): Sentence ids, parallel to ``all_hypos``.
    """
    with codecs.open(self.path, "w", encoding='utf-8') as f:
        for idx, hypos in zip(sen_indices, all_hypos):
            for hypo in hypos:
                sentence = utils.apply_trg_wmap(hypo.trgt_sentence,
                                                self.trg_wmap)
                breakdown = ' '.join(
                    "%s= %f" % (name,
                                sum(s[i][0] for s in hypo.score_breakdown))
                    for i, name in enumerate(self.predictor_names))
                f.write("%d ||| %s ||| %s ||| %f" % (
                    idx, sentence, breakdown, hypo.total_score))
                f.write("\n")
            # The sentence id is taken from ``sen_indices`` alone; the old
            # manual ``idx += 1`` has been dropped because it either had no
            # effect or broke the ids within one n-best list.
def _process_input(sync_symbol=-1):
    """Reads and tokenizes the single source file ``args.src_test``.

    Args:
        sync_symbol (int): Symbol id that is mapped with
            ``utils.apply_trg_wmap``. If the first character of its mapping
            is a blank, blanks in the input are kept as separate ``' '``
            tokens (a literal ``'|'`` in the input becomes a blank, too).
            Otherwise lines are simply split on whitespace.

    Returns:
        list. One token list per input line.
    """
    sentences = []
    with codecs.open(args.src_test, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # The mapping is looked up for every line, as before.
            if utils.apply_trg_wmap([sync_symbol])[0] != " ":
                sentences.append(stripped.split())
                continue
            marked = stripped.replace(' ', ' | ')
            sentences.append([tok.replace('|', ' ') for tok in marked.split()])
    return sentences
def write_hypos(self, all_hypos):
    """Appends the n-best lists in ``all_hypos`` to the open file ``self.f``.

    One line is written per hypothesis:
    ``<id> ||| <sentence> ||| <score> <score> ... ||| <total score>``.
    The scores in the third field are listed in the order of
    ``self.predictor_names`` (without the names); each one is the sum over
    ``hypo.score_breakdown`` of the first component of that predictor's
    entry. ``self.current_sen_id`` provides the sentence id and is
    advanced by one after every sentence. The file is flushed at the end.

    Args:
        all_hypos (list): One list of hypotheses per sentence.
    """
    n_predictors = len(self.predictor_names)
    for hypos in all_hypos:
        # Read the id per sentence. It used to be read once before the
        # loop, so a call with several sentences wrote the same id for all
        # of them. Calls with a single sentence behave as before.
        idx = self.current_sen_id
        for hypo in hypos:
            predictor_scores = ' '.join(
                "%s" % (sum(s[i][0] for s in hypo.score_breakdown))
                for i in range(n_predictors))
            self.f.write("%d ||| %s ||| %s ||| %f" % (
                idx,
                utils.apply_trg_wmap(hypo.trgt_sentence, self.trg_wmap),
                predictor_scores,
                hypo.total_score))
            self.f.write("\n")
        self.current_sen_id += 1
    self.f.flush()
def decode(self, src_sentence):
    """Beam search over an ensemble of NMT models.

    This is a generalization to NMT ensembles of
    ``DynetNMTVanillaDecoder``: at every step the negated
    ``predict_next_`` outputs of all models in ``self.nmt_models`` are
    summed and used as step costs. The search runs for at most 30 steps
    and stops early once every beam entry has produced ``utils.EOS_ID``.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
                             which make up the source sentence

    Returns:
        list. One ``Hypothesis`` per entry returned by
        ``DynetNMTVanillaDecoder.result_to_lists``, scored with the
        negated cost. The whole score is stored in the first element of
        the score breakdown.
    """
    dy.renew_cg()
    logging.debug(u'src_sentence: {}'.format(src_sentence))
    max_steps = 30  # fixed upper bound on the number of decoding steps
    beam = self.beam_size
    models = self.nmt_models
    go_id = utils.GO_ID
    eos_id = utils.EOS_ID

    for model in models:
        model.initialize(src_sentence)
    # ensemble x beam matrix of states
    states = [[model.s] * beam for model in models]

    # These arrays grow by one row per step. They store all generated
    # outputs, including those from already finished sequences.
    all_outputs = np.full(shape=(1, beam), fill_value=go_id, dtype=int)
    # 0 marks entries whose latest symbol was the stop symbol
    all_masks = np.ones_like(all_outputs, dtype=float)
    # cumulative cost of the predictions
    all_costs = np.zeros_like(all_outputs, dtype=float)

    for step in range(max_steps):
        if all_masks[-1].sum() == 0:
            logging.debug(u'check masks: {}'.format(all_masks[-1]))
            break

        # One (beam x vocabulary) cost matrix per model, summed over models.
        per_model_costs = [
            -np.array([model.predict_next_(s) for s in model_states])
            for model, model_states in zip(models, states)
        ]
        step_costs = np.sum(per_model_costs, axis=0)

        # Previous cumulative cost plus the step cost of unfinished entries.
        next_costs = (all_costs[-1, :, None] +
                      step_costs * all_masks[-1, :, None])
        # Finished entries may only be continued with the stop symbol.
        finished = np.where(all_masks[-1] == 0)[0]
        next_costs[finished, :eos_id] = np.inf
        next_costs[finished, eos_id + 1:] = np.inf

        # indexes: beam entries of the previous step to keep,
        # outputs: symbols chosen for this step,
        # chosen_costs: the corresponding entries of next_costs.
        (indexes, outputs), chosen_costs = DynetNMTVanillaDecoder._smallest(
            next_costs, beam, only_first_row=step == 0)

        # Rearrange everything according to the surviving entries
        survivors = [[model_states[ind] for ind in indexes]
                     for model_states in states]
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Record chosen output and compute new states
        states = [
            [model.consume_next_(s, pred_id)
             for s, pred_id in zip(model_survivors, outputs)]
            for model, model_survivors in zip(models, survivors)
        ]
        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        logging.debug(u'all_outputs: {}'.format(all_outputs))
        logging.debug(u'outputs: {}'.format(
            [utils.apply_trg_wmap([c]) for c in outputs]))
        logging.debug(u'indexes: {}'.format(indexes))
        logging.debug(u'chosen_costs: {}'.format(chosen_costs))
        logging.debug(u'outputs != STOP: {}'.format(outputs != eos_id))
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        mask = outputs != eos_id
        all_masks = np.vstack([all_masks, mask[None, :]])
        logging.debug(u'last masks: {}'.format(all_masks[-1]))

    all_outputs = all_outputs[1:]  # drop the initial row of start symbols
    logging.debug(u'outputs: {}'.format(all_outputs))
    all_masks = all_masks[:-1]  # drop the last mask row
    logging.debug(u'masks: {}'.format(all_masks))
    # turn cumulative costs into the cost of each step
    all_costs = all_costs[1:] - all_costs[:-1]
    trans, costs = DynetNMTVanillaDecoder.result_to_lists(
        (all_outputs, all_masks, all_costs))
    logging.debug(u'trans: {}'.format(trans))

    hypos = []
    max_len = 0
    for idx, sentence in enumerate(trans):
        max_len = max(max_len, len(sentence))
        score = -costs[idx]
        hypo = Hypothesis(sentence, score)
        hypo.score_breakdown = len(sentence) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(score, 1.0)]
        hypos.append(hypo)
    self.apply_predictors_count = max_len * self.beam_size
    logging.debug(u'hypos: {}'.format(all_outputs))
    return hypos
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    The text output handler (if any) is written sentence by sentence; all
    other output handlers are called once at the end.

    Errors raised while processing a sentence are logged and decoding
    continues with the next sentence.

    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  Source sentences as token lists (or lists of
                               token lists for multiple inputs) of word
                               indices, or ``False`` to decode the dummy
                               source ``"0"`` for every sentence index.
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    all_hypos = []
    text_output_handler = get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    for sen_idx in _get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    # Multiple inputs: convert every input separately
                    src_lst = []
                    for idx, single_src in enumerate(src):
                        logging.info("Next sentence, input %d (ID: %d): %s"
                                     % (idx, sen_idx + 1,
                                        ' '.join(single_src)))
                        src_lst.append([int(x) for x in single_src])
                    src = src_lst
                else:
                    logging.info("Next sentence (ID: %d): %s"
                                 % (sen_idx + 1, ' '.join(src)))
                    src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            if isinstance(src[0], list):
                # don't apply wordmap for multiple inputs
                decoded = decoder.decode(src)
            else:
                decoded = decoder.decode(utils.apply_src_wmap(src))
            hypos = [hypo for hypo in decoded
                     if hypo.total_score > args.min_score]
            if not hypos:
                logging.error("No translation found for ID %d!"
                              % (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" % (sen_idx + 1,
                                            decoder.apply_predictors_count,
                                            time.time() - start_hypo_time))
                # Keep the text output aligned with the input
                if text_output_handler:
                    text_output_handler.write_empty_line()
                continue
            if args.remove_eos:
                for hypo in hypos:
                    if (hypo.trgt_sentence
                            and hypo.trgt_sentence[-1] == utils.EOS_ID):
                        hypo.trgt_sentence = hypo.trgt_sentence[:-1]
            if args.nbest > 0:
                hypos = hypos[:args.nbest]
            if (args.combination_scheme != 'sum'
                    and not args.apply_combination_scheme_to_partial_hypos):
                for hypo in hypos:
                    hypo.total_score = core.breakdown2score_full(
                        hypo.total_score, hypo.score_breakdown)
                hypos.sort(key=lambda hypo: hypo.total_score, reverse=True)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap)
                         for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                sen_idx + 1,
                utils.apply_trg_wmap(
                    hypos[0].trgt_sentence,
                    {} if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                # %s, not %d: sys.exc_info()[0] is the exception class, and
                # formatting it with %d raised a TypeError that hid the
                # actual I/O error.
                logging.error(
                    "I/O error %s occurred when creating output files: %s"
                    % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx + 1, e,
                                               traceback.format_exc()))
        except Exception as e:
            logging.error(
                "An unexpected %s error has occurred at sentence id "
                "%d: %s, Stack trace: %s" % (sys.exc_info()[0], sen_idx + 1,
                                             e, traceback.format_exc()))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
def _evaluate_model(self):
    """Translates the validation set and scores it with the BLEU script.

    Every sentence of ``self.data_stream`` is decoded with
    ``self.beam_search``. If ``self.verbose`` is set, the best translation
    of each sentence is written to ``config['val_set_out']``. Afterwards
    ``config['bleu_script']`` is run and its output is parsed as the BLEU
    score, which is appended to ``self.val_bleu_curve``.

    Returns:
        float. The BLEU score printed by the BLEU script.
    """
    logging.info("Started Validation: ")
    val_start_time = time.time()
    total_cost = 0.0
    ftrans = None
    if self.verbose:
        ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')
    try:
        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                line[0], self.config['src_vocab_size']))
            if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                input_ = numpy.transpose(
                    numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                    (2, 0, 1))
            else:  # word ids on the source side
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

            trans, costs = self.beam_search.search(
                input_values={self.source_sentence: input_},
                max_length=3 * len(line[0]),
                eol_symbol=utils.EOS_ID,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    # Use a separate name for the selected candidate:
                    # rebinding ``trans`` here made every iteration after
                    # the first index into a single translation instead of
                    # the list of candidates.
                    best_trans = trans[best]
                    if best_trans and best_trans[-1] == utils.EOS_ID:
                        best_trans = best_trans[:-1]
                except ValueError:
                    logging.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    # NOTE(review): placeholder kept from the original
                    # code; confirm that apply_trg_wmap accepts it.
                    best_trans = 0
                if j == 0 and self.verbose:
                    # Only the best candidate is written to the file
                    print(utils.apply_trg_wmap(best_trans, self.trg_wmap),
                          file=ftrans)

            if i != 0 and i % 100 == 0:
                logging.info(
                    "Translated {} lines of validation set...".format(i))

        logging.info("Total cost of the validation: {}".format(total_cost))
        self.data_stream.reset()
    finally:
        # Close the output file even if decoding fails half-way
        if ftrans is not None:
            ftrans.close()
    logging.info("Validation Took: {} minutes".format(
        float(time.time() - val_start_time) / 60.))
    logger.info("{} {} {} {}".format(self.config['bleu_script'],
                                     self.config['val_set_out'],
                                     self.config['val_set_grndtruth'],
                                     self.config['results_out']))
    # NOTE(review): the command is built from configuration strings and run
    # through the shell; the configuration must come from a trusted source.
    bleu_score = float(subprocess.check_output(
        "python2.7 {} {} {} {}".format(self.config['bleu_script'],
                                       self.config['val_set_out'],
                                       self.config['val_set_grndtruth'],
                                       self.config['results_out']),
        shell=True).decode("utf-8"))
    self.val_bleu_curve.append(bleu_score)
    logging.info(bleu_score)
    return bleu_score
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    The text output handler (if any) is written sentence by sentence; all
    other output handlers are called once at the end with all hypotheses
    and their sentence indices.

    Errors raised while processing a sentence are logged and decoding
    continues with the next sentence.

    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  Source sentences as token lists of word
                               indices, or ``False`` to decode the dummy
                               source ``"0"`` for every sentence index. With
                               ``args.per_sentence_predictor_weights``, the
                               last token may hold comma-separated
                               predictor weights.
    """
    if not decoder.has_predictors():
        logging.fatal("Terminated due to an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    text_output_handler = _get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    sen_indices = []
    for sen_idx in get_sentence_indices(args.range, src_sentences):
        decoder.set_current_sen_id(sen_idx)
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if len(src) > 0 and args.per_sentence_predictor_weights:
                    # change predictor weights per-sentence
                    weights = src[-1].split(',')
                    if len(weights) > 1:
                        weights = [float(x) for x in weights]
                        src = src[:-1]
                        logging.info('Changing predictor weights to {}'.format(
                            weights))
                        decoder.change_predictor_weights(weights)
                    else:
                        logging.info(
                            'No weights read in {} - leaving unchanged'.format(
                                src))
                logging.info("Next sentence (ID: %d): %s"
                             % (sen_idx + 1, ' '.join(src)))
                src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            hypos = [hypo
                     for hypo in decoder.decode(utils.apply_src_wmap(src))
                     if hypo.total_score > args.min_score]
            if not hypos:
                logging.error("No translation found for ID %d!"
                              % (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" % (sen_idx + 1,
                                            decoder.apply_predictors_count,
                                            time.time() - start_hypo_time))
                # Continue with a dummy hypothesis so that the outputs stay
                # aligned with the input
                hypos = [_generate_dummy_hypo(decoder.predictors)]
            hypos = _postprocess_complete_hypos(hypos)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap)
                         for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                sen_idx + 1,
                utils.apply_trg_wmap(
                    hypos[0].trgt_sentence,
                    {} if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            sen_indices.append(sen_idx)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                # %s, not %d: sys.exc_info()[0] is the exception class, and
                # formatting it with %d raised a TypeError that hid the
                # actual I/O error.
                logging.error(
                    "I/O error %s occurred when creating output files: %s"
                    % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx + 1, e,
                                               traceback.format_exc()))
        except AttributeError as e:
            logging.fatal("Attribute error at sentence id %d: %s. This often "
                          "indicates an error in the predictor configuration "
                          "which could not be detected in initialisation. "
                          "Stack trace: %s" % (sen_idx + 1, e,
                                               traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx + 1, e,
                                                       traceback.format_exc()))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos, sen_indices)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))
def decode(self, src_sentence):
    """Decodes a single source sentence with beam search over
    ``self.nmt_model``.

    The search runs for at most 30 steps and stops early once every beam
    entry has produced ``utils.EOS_ID``. Note that the score breakdowns in
    returned hypotheses are only on the sentence level, not on the word
    level: the whole score is stored in the first element of the breakdown.

    Args:
        src_sentence (list): List of source word ids without <S> or </S>
                             which make up the source sentence

    Returns:
        list. One ``Hypothesis`` per entry returned by
        ``self.result_to_lists``, scored with the negated cost.
    """
    dy.renew_cg()
    logging.debug(u'src_sentence: {}'.format(src_sentence))
    max_steps = 30  # fixed upper bound on the number of decoding steps
    logging.debug(u'MAX_PRED_SEQ_LEN: {}'.format(max_steps))
    go_id = utils.GO_ID
    eos_id = utils.EOS_ID
    logging.debug(u'BEGIN: {}, STOP: {}'.format(go_id, eos_id))
    beam = self.beam_size

    self.nmt_model.initialize(src_sentence)
    states = [self.nmt_model.s] * beam

    # These arrays grow by one row per step. They store all generated
    # outputs, including those from already finished sequences.
    all_outputs = np.full(shape=(1, beam), fill_value=go_id, dtype=int)
    # 0 marks entries whose latest symbol was the stop symbol
    all_masks = np.ones_like(all_outputs, dtype=float)
    # cumulative cost of the predictions
    all_costs = np.zeros_like(all_outputs, dtype=float)

    for step in range(max_steps):
        if all_masks[-1].sum() == 0:
            logging.debug(u'all_masks: {}'.format(all_masks))
            break

        # (beam x vocabulary) matrix of step costs
        step_costs = -np.array(
            [self.nmt_model.predict_next_(s) for s in states])
        # Previous cumulative cost plus the step cost of unfinished entries.
        next_costs = (all_costs[-1, :, None] +
                      step_costs * all_masks[-1, :, None])
        # Finished entries may only be continued with the stop symbol.
        finished = np.where(all_masks[-1] == 0)[0]
        next_costs[finished, :eos_id] = np.inf
        next_costs[finished, eos_id + 1:] = np.inf

        # indexes: beam entries of the previous step to keep,
        # outputs: symbols chosen for this step,
        # chosen_costs: the corresponding entries of next_costs.
        (indexes, outputs), chosen_costs = self._smallest(
            next_costs, beam, only_first_row=step == 0)

        # Rearrange everything according to the surviving entries
        all_outputs = all_outputs[:, indexes]
        all_masks = all_masks[:, indexes]
        all_costs = all_costs[:, indexes]

        # Record chosen output and compute new states. ``states`` still
        # refers to the previous step while the list is being built.
        states = [
            self.nmt_model.consume_next_(states[ind], pred_id)
            for ind, pred_id in zip(indexes, outputs)
        ]
        all_outputs = np.vstack([all_outputs, outputs[None, :]])
        logging.debug(u'all_outputs: {}'.format(all_outputs))
        logging.debug(u'outputs: {}'.format(
            [utils.apply_trg_wmap([c]) for c in outputs]))
        logging.debug(u'indexes: {}'.format(indexes))
        logging.debug(u'chosen_costs: {}'.format(chosen_costs))
        logging.debug(u'outputs != STOP: {}'.format(outputs != eos_id))
        all_costs = np.vstack([all_costs, chosen_costs[None, :]])
        mask = outputs != eos_id
        all_masks = np.vstack([all_masks, mask[None, :]])

    all_outputs = all_outputs[1:]  # drop the initial row of start symbols
    logging.debug(u'outputs: {}'.format(all_outputs))
    all_masks = all_masks[:-1]  # drop the last mask row
    logging.debug(u'masks: {}'.format(all_masks))
    # turn cumulative costs into the cost of each step
    all_costs = all_costs[1:] - all_costs[:-1]
    trans, costs = self.result_to_lists((all_outputs, all_masks, all_costs))
    logging.debug(u'trans: {}'.format(trans))

    hypos = []
    for idx, sentence in enumerate(trans):
        score = -costs[idx]
        hypo = Hypothesis(sentence, score)
        hypo.score_breakdown = len(sentence) * [[(0.0, 1.0)]]
        hypo.score_breakdown[0] = [(score, 1.0)]
        hypos.append(hypo)
    logging.debug(u'hypos: {}'.format(all_outputs))
    return hypos
def _expand_hypo_nmt(self, input_hypo):
    """Expands ``input_hypo`` repeatedly, ranking by "nmt" predictor scores.

    The first expansions come from ``_expand_hypo_nmt`` of the parent class.
    While ``self._all_eos_or_eow`` holds for the current hypotheses (and at
    most ``self.max_morf_len + 1`` times), every hypothesis that is not
    closed according to ``self._is_closed`` is expanded again with the
    parent implementation and the candidates are cut with
    ``self._get_next_hypos``. Candidates are scored with the sum of all
    score breakdown entries that belong to predictors named "nmt".

    Args:
        input_hypo (PartialHypothesis): Hypothesis to expand

    Return:
        list. List of expanded hypotheses.
    """

    def _nmt_score(hypo):
        # Sum of the "nmt" predictor scores over the whole score breakdown
        return sum([
            sum(entry[0] for i, entry in enumerate(char_scores)
                if self.predictor_names[i] == "nmt")
            for char_scores in hypo.score_breakdown
        ])

    expand_once = super(SyncBeamDecoderSegm, self)._expand_hypo_nmt

    # The input hypo to be expanded
    logging.debug(u"EXPAND: {} {}".format(
        utils.apply_trg_wmap(input_hypo.trgt_sentence), input_hypo.score))

    # Initial expansions from the parent class
    hypos = expand_once(input_hypo)

    it = 0
    # The iteration limit prevents infinite loops
    while self._all_eos_or_eow(hypos) and it <= self.max_morf_len:
        logging.debug(u"SYNC BEAM ITER: {}".format(it))
        it += 1
        candidates = []
        candidate_scores = []
        for hypo in hypos:
            current_score = _nmt_score(hypo)
            logging.debug(u"CONTINUATION: {} -> {}, {}".format(
                utils.apply_trg_wmap(hypo.trgt_sentence), current_score,
                hypo.score))
            if self._is_closed(hypo):
                # Closed hypotheses compete without further expansion
                candidates.append(hypo)
                candidate_scores.append(current_score)
                logging.debug(u"NOT EXPAND: {} -> {}, {}".format(
                    utils.apply_trg_wmap(hypo.trgt_sentence), current_score,
                    hypo.score))
                continue
            for expanded in expand_once(hypo):
                expanded_score = _nmt_score(expanded)
                candidates.append(expanded)
                candidate_scores.append(expanded_score)
                logging.debug(u"EXPAND: {} -> {}, {}".format(
                    utils.apply_trg_wmap(expanded.trgt_sentence),
                    expanded_score, expanded.score))
        logging.debug(u"BEFORE CUT on ITERATION: {} -> {}".format(
            it, " && ".join(
                utils.apply_trg_wmap(h.trgt_sentence) + ", " + str(score)
                for h, score in zip(candidates, candidate_scores))))
        hypos = self._get_next_hypos(candidates, candidate_scores)
        logging.debug(u"CUT: {}".format(" && ".join(
            utils.apply_trg_wmap(h.trgt_sentence) for h in hypos)))

    # Final expansions of the input hypo
    for hypo in hypos:
        logging.debug(u"SYNCRESULT {} {}".format(
            utils.apply_trg_wmap(hypo.trgt_sentence), _nmt_score(hypo)))
    return hypos
def decode(self, src_sentence):
    """Decodes a single source sentence using beam search.

    Unfinished hypotheses are expanded with ``self._expand_hypo_nmt``. The
    resulting continuations are scored with ``self._get_combined_score``
    and cut with ``self._filter_equal_hypos`` (if hypothesis recombination
    is enabled) or ``self._get_next_hypos``. The search continues while
    ``self.stop_criterion`` holds and at most ``self.max_len + 1`` times.
    If no hypothesis ends in ``utils.EOS_ID`` when the search stops, all
    remaining partial hypotheses are returned as full hypotheses.

    Args:
        src_sentence (list): Source sentence passed to
            ``initialize_predictors`` and ``setup_max_len``.

    Returns:
        list. The result of ``self.get_full_hypos_sorted()``.
    """
    dy.renew_cg()
    self.initialize_predictors(src_sentence)
    hypos = self._get_initial_hypos()
    self.setup_max_len(src_sentence)
    logging.debug(u"Source len {}".format(len(src_sentence)))
    logging.debug(u"MAX-ITER: {}".format(self.max_len))

    # Initial hypotheses
    for hypo in hypos:
        logging.debug(u"INIT {} {}".format(
            utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score_breakdown))

    it = 0
    while self.stop_criterion(hypos):
        logging.debug(u"ITER: {}, MAX-ITER: {}".format(it, self.max_len))
        if it > self.max_len:  # prevent infinite loops
            break
        it = it + 1
        next_hypos = []
        next_scores = []
        self.min_score = utils.NEG_INF
        self.best_scores = []
        for hypo in hypos:
            if hypo.get_last_word() == utils.EOS_ID:
                # Finished hypotheses compete in the beam without expansion
                next_hypos.append(hypo)
                next_scores.append(self._get_combined_score(hypo))
                logging.debug(u"BEAM IT {} HYPO {} NO EXPAND".format(
                    it, utils.apply_trg_wmap(hypo.trgt_sentence)))
                continue
            for next_hypo in self._expand_hypo_nmt(hypo):
                next_score = self._get_combined_score(next_hypo)
                if next_score > self.min_score:
                    next_hypos.append(next_hypo)
                    next_scores.append(next_score)
                    self._register_score(next_score)
                    logging.debug(
                        u"BEAM IT {} HYPO {} -> NEXT HYPO {}".format(
                            it, utils.apply_trg_wmap(hypo.trgt_sentence),
                            utils.apply_trg_wmap(next_hypo.trgt_sentence)))

        # Expansions of this iteration before they are cut based on the
        # combined predictor score
        logging.debug(u"BEAM IT {} NEXT HYPOS BEFORE CUT -> {}".format(
            it, " && ".join(
                utils.apply_trg_wmap(h.trgt_sentence) + ", " + str(score)
                for h, score in zip(next_hypos, next_scores))))
        logging.debug(u"BEAM IT {} Min score: {}".format(it, self.min_score))

        if self.hypo_recombination:
            hypos = self._filter_equal_hypos(next_hypos, next_scores)
        else:
            hypos = self._get_next_hypos(next_hypos, next_scores)

        # Surviving hypotheses of this iteration...
        logging.debug(u"BEAM IT {} CUT: {}".format(
            it, " && ".join(
                utils.apply_trg_wmap(h.trgt_sentence) for h in hypos)))
        # ... with the detailed scores of every position
        for hypo in hypos:
            logging.debug(u"BEAM IT {} :{}".format(
                utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
            for pos, score_char in enumerate(hypo.score_breakdown):
                logging.debug(u"{}: {}".format(
                    utils.apply_trg_wmap([hypo.trgt_sentence[pos]]),
                    ", ".join("{:.10f}".format(s) + ":" + "{:.2f}".format(w)
                              for s, w in score_char)))

    for hypo in hypos:
        if hypo.get_last_word() == utils.EOS_ID:
            self.add_full_hypo(hypo.generate_full_hypothesis())
    if not self.full_hypos:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("No complete hypotheses found for %s" % src_sentence)
        for hypo in hypos:
            self.add_full_hypo(hypo.generate_full_hypothesis())
    return self.get_full_hypos_sorted()