def prepare_sim_decode(decoder, output_handlers, src_sentences):
    """Set up simultaneous decoding and stop after reading the first words.

    Args are the same as ``do_decode()``.

    Returns:
        all_hypos: List of the initial hypotheses for each sentence in
            the test set.
        all_src: List of all source sentences (as word IDs, after the
            source wordmap has been applied). Only filled for
            single-input sentences.
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    all_src = []
    for sen_idx in get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                # No source file given; use a dummy source sentence
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    # Multiple inputs per sentence: convert each one
                    src_lst = []
                    for idx, inp in enumerate(src):
                        logging.info("Next sentence, input %d (ID: %d): %s" % (
                            idx, sen_idx + 1, ' '.join(inp)))
                        src_lst.append([int(x) for x in inp])
                    src = src_lst
                else:
                    src = [int(x) for x in src]
            # Get the set of initial hypotheses
            if isinstance(src[0], list):
                # Don't apply wordmap for multiple inputs
                hypos = decoder.prepare_sim_decode(src)
            else:
                all_src.append(utils.apply_src_wmap(src))
                hypos = decoder.prepare_sim_decode(all_src[-1],
                                                   len(all_src) - 1)
            all_hypos.append(hypos)
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx + 1,
                                               e,
                                               traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx + 1,
                                                       e,
                                                       traceback.format_exc()))
    return all_hypos, all_src
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.

    Args:
        decoder (Decoder): Current decoder instance
        output_handlers (list): List of output handlers, see
                                ``create_output_handlers()``
        src_sentences (list): A list of strings. The strings are the
                              source sentences with word indices to
                              translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    all_hypos = []
    text_output_handler = get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    for sen_idx in _get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                # No source file given; use a dummy source sentence
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    # Multiple inputs per sentence: convert each one
                    src_lst = []
                    for idx, inp in enumerate(src):
                        logging.info("Next sentence, input %d (ID: %d): %s" % (
                            idx, sen_idx + 1, ' '.join(inp)))
                        src_lst.append([int(x) for x in inp])
                    src = src_lst
                else:
                    logging.info("Next sentence (ID: %d): %s" % (
                        sen_idx + 1, ' '.join(src)))
                    src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            if isinstance(src[0], list):
                # Don't apply wordmap for multiple inputs
                hypos = [hypo for hypo in decoder.decode(src)
                         if hypo.total_score > args.min_score]
            else:
                hypos = [hypo
                         for hypo in decoder.decode(utils.apply_src_wmap(src))
                         if hypo.total_score > args.min_score]
            if not hypos:
                logging.error("No translation found for ID %d!"
                              % (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" % (sen_idx + 1,
                                            decoder.apply_predictors_count,
                                            time.time() - start_hypo_time))
                if text_output_handler:
                    text_output_handler.write_empty_line()
                continue
            if args.remove_eos:
                # Strip the trailing end-of-sentence symbol from each hypo
                for hypo in hypos:
                    if (hypo.trgt_sentence
                            and hypo.trgt_sentence[-1] == utils.EOS_ID):
                        hypo.trgt_sentence = hypo.trgt_sentence[:-1]
            if args.nbest > 0:
                hypos = hypos[:args.nbest]
            if (args.combination_scheme != 'sum'
                    and not args.apply_combination_scheme_to_partial_hypos):
                # Rescore with the full combination scheme and re-sort
                for hypo in hypos:
                    hypo.total_score = core.breakdown2score_full(
                        hypo.total_score, hypo.score_breakdown)
                hypos.sort(key=lambda hypo: hypo.total_score, reverse=True)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap)
                         for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                sen_idx + 1,
                utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                     {} if utils.trg_cmap
                                     else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                # NOTE: was "%d", which raises TypeError when applied to the
                # exception type and masked the original I/O error.
                logging.error("I/O error %s occurred when creating output "
                              "files: %s" % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx + 1,
                                               e,
                                               traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx + 1,
                                                       e,
                                                       traceback.format_exc()))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                # Already written incrementally; just close the file
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.

    Args:
        decoder (Decoder): Current decoder instance
        output_handlers (list): List of output handlers, see
                                ``create_output_handlers()``
        src_sentences (list): A list of strings. The strings are the
                              source sentences with word indices to
                              translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Terminated due to an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    text_output_handler = _get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    sen_indices = []
    for sen_idx in get_sentence_indices(args.range, src_sentences):
        decoder.set_current_sen_id(sen_idx)
        try:
            if src_sentences is False:
                # No source file given; use a dummy source sentence
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
            if len(src) > 0 and args.per_sentence_predictor_weights:
                # Change predictor weights per-sentence: the weights are
                # appended to the source as a final comma-separated token
                weights = src[-1].split(',')
                if len(weights) > 1:
                    weights = [float(x) for x in weights]
                    src = src[:-1]
                    logging.info('Changing predictor weights to {}'.format(
                        weights))
                    decoder.change_predictor_weights(weights)
                else:
                    logging.info(
                        'No weights read in {} - leaving unchanged'.format(
                            src))
            logging.info("Next sentence (ID: %d): %s" % (sen_idx + 1,
                                                         ' '.join(src)))
            src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            hypos = [hypo
                     for hypo in decoder.decode(utils.apply_src_wmap(src))
                     if hypo.total_score > args.min_score]
            if not hypos:
                # Emit a dummy hypothesis so downstream output stays aligned
                logging.error("No translation found for ID %d!"
                              % (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" % (sen_idx + 1,
                                            decoder.apply_predictors_count,
                                            time.time() - start_hypo_time))
                hypos = [_generate_dummy_hypo(decoder.predictors)]
            hypos = _postprocess_complete_hypos(hypos)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap)
                         for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                sen_idx + 1,
                utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                     {} if utils.trg_cmap
                                     else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            sen_indices.append(sen_idx)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                # NOTE: was "%d", which raises TypeError when applied to the
                # exception type and masked the original I/O error.
                logging.error("I/O error %s occurred when creating output "
                              "files: %s" % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx + 1,
                                               e,
                                               traceback.format_exc()))
        except AttributeError as e:
            logging.fatal("Attribute error at sentence id %d: %s. This often "
                          "indicates an error in the predictor configuration "
                          "which could not be detected in initialisation. "
                          "Stack trace: %s" % (sen_idx + 1,
                                               e,
                                               traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx + 1,
                                                       e,
                                                       traceback.format_exc()))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                # Already written incrementally; just close the file
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos, sen_indices)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))