Example #1
def prepare_sim_decode(decoder, output_handlers, src_sentences):
    """This method sets up the decoding procedure, and stops after reading the
    first words. Args are the same as ``do_decode()''

    Returns:
        all_hypos:  List of all initial hypotheses in the train/test set
        all_src:    List of all source sentences (in ids) in the train/test set
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    all_src = []

    for sen_idx in get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    src_lst = []
                    for idx in xrange(len(src)):
                        logging.info("Next sentence, input %d (ID: %d): %s" %
                                     (idx, sen_idx + 1, ' '.join(src[idx])))
                        src_lst.append([int(x) for x in src[idx]])
                    src = src_lst
                else:
                    # logging.info("Next sentence (ID: %d): %s" % (sen_idx + 1,
                    #                                              ' '.join(src)))
                    src = [int(x) for x in src]
            # get the set of initial hypotheses
            if isinstance(src[0], list):
                # Don't apply wordmap for multiple inputs
                hypos = decoder.prepare_sim_decode(src)
            else:
                all_src.append(utils.apply_src_wmap(src))
                hypos = decoder.prepare_sim_decode(all_src[-1],
                                                   len(all_src) - 1)

            all_hypos.append(hypos)
            #all_src.append(utils.apply_src_wmap(src))

        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" %
                          (sen_idx + 1, e, traceback.format_exc()))
        except Exception as e:
            logging.error(
                "An unexpected %s error has occurred at sentence id "
                "%d: %s, Stack trace: %s" %
                (sys.exc_info()[0], sen_idx + 1, e, traceback.format_exc()))

    return all_hypos, all_src
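
The ``isinstance(src[0], list)`` branch above handles both single-input and multi-input source sentences. A minimal standalone sketch of that conversion step (the helper name ``parse_source`` is illustrative and not part of the example code):

def parse_source(src):
    # Multi-input sources arrive as a list of token lists; convert each part.
    if isinstance(src[0], list):
        return [[int(x) for x in part] for part in src]
    # Single-input sources are a flat list of token strings.
    return [int(x) for x in src]

parse_source(['1', '123', '432', '2'])            # -> [1, 123, 432, 2]
parse_source([['1', '7', '2'], ['1', '9', '2']])  # -> [[1, 7, 2], [1, 9, 2]]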
Example #2
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.
    
    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  A list of strings. The strings are the
                               source sentences with word indices to 
                               translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    all_hypos = []
    text_output_handler = get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    for sen_idx in _get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    src_lst = []
                    for idx in xrange(len(src)):
                        logging.info("Next sentence, input %d (ID: %d): %s" %
                                     (idx, sen_idx + 1, ' '.join(src[idx])))
                        src_lst.append([int(x) for x in src[idx]])
                    src = src_lst
                else:
                    logging.info("Next sentence (ID: %d): %s" %
                                 (sen_idx + 1, ' '.join(src)))
                    src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            if isinstance(src[0], list):
                # don't apply wordmap for multiple inputs
                hypos = [
                    hypo for hypo in decoder.decode(src)
                    if hypo.total_score > args.min_score
                ]
            else:
                hypos = [
                    hypo for hypo in decoder.decode(utils.apply_src_wmap(src))
                    if hypo.total_score > args.min_score
                ]
            if not hypos:
                logging.error("No translation found for ID %d!" %
                              (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" %
                             (sen_idx + 1, decoder.apply_predictors_count,
                              time.time() - start_hypo_time))
                if text_output_handler:
                    text_output_handler.write_empty_line()
                continue
            if args.remove_eos:
                for hypo in hypos:
                    if (hypo.trgt_sentence
                            and hypo.trgt_sentence[-1] == utils.EOS_ID):
                        hypo.trgt_sentence = hypo.trgt_sentence[:-1]
            if args.nbest > 0:
                hypos = hypos[:args.nbest]
            if (args.combination_scheme != 'sum'
                    and not args.apply_combination_scheme_to_partial_hypos):
                for hypo in hypos:
                    hypo.total_score = core.breakdown2score_full(
                        hypo.total_score, hypo.score_breakdown)
                hypos.sort(key=lambda hypo: hypo.total_score, reverse=True)
            if utils.trg_cmap:
                hypos = [
                    h.convert_to_char_level(utils.trg_cmap) for h in hypos
                ]
            logging.info(
                "Decoded (ID: %d): %s" %
                (sen_idx + 1,
                 utils.apply_trg_wmap(hypos[0].trgt_sentence, {}
                                      if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1, hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                logging.error(
                    "I/O error %d occurred when creating output files: %s" %
                    (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" %
                          (sen_idx + 1, e, traceback.format_exc()))
        except Exception as e:
            logging.error(
                "An unexpected %s error has occurred at sentence id "
                "%d: %s, Stack trace: %s" %
                (sys.exc_info()[0], sen_idx + 1, e, traceback.format_exc()))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s" %
                      (sys.exc_info()[0], e))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
Example #3
def do_decode(decoder, 
              output_handlers, 
              src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.
    
    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  A list of strings. The strings are the
                               source sentences with word indices to 
                               translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Terminated due to an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    text_output_handler = _get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    sen_indices = []
    for sen_idx in get_sentence_indices(args.range, src_sentences):
        decoder.set_current_sen_id(sen_idx)
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
            if len(src) > 0 and args.per_sentence_predictor_weights:
                # change predictor weights per-sentence
                weights = src[-1].split(',')
                if len(weights) > 1:
                    weights = [float(x) for x in weights]
                    src = src[:-1]
                    logging.info('Changing predictor weights to {}'.format(
                        weights))
                    decoder.change_predictor_weights(weights)
                else:
                    logging.info(
                        'No weights read in {} - leaving unchanged'.format(
                            src))
            logging.info("Next sentence (ID: %d): %s" % (sen_idx + 1, ' '.join(src)))
            src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            hypos = [hypo 
                     for hypo in decoder.decode(utils.apply_src_wmap(src))
                        if hypo.total_score > args.min_score]
            if not hypos:
                logging.error("No translation found for ID %d!" % (sen_idx+1))
                logging.info("Stats (ID: %d): score=<not-found> "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx+1,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
                hypos = [_generate_dummy_hypo(decoder.predictors)]
            hypos = _postprocess_complete_hypos(hypos)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap) for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                    sen_idx+1,
                    utils.apply_trg_wmap(hypos[0].trgt_sentence, 
                                         {} if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx+1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            sen_indices.append(sen_idx)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                logging.error("I/O error %d occurred when creating output files: %s"
                            % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx+1, 
                                               e,
                                               traceback.format_exc()))
        except AttributeError as e:
            logging.fatal("Attribute error at sentence id %d: %s. This often "
                          "indicates an error in the predictor configuration "
                          "which could not be detected in initialisation. "
                          "Stack trace: %s" 
                          % (sen_idx+1, e, traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx+1,
                                                       e,
                                                       traceback.format_exc()))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos, sen_indices)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))
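
When ``args.per_sentence_predictor_weights`` is set, this example expects the predictor weights as a final comma-separated token on the source line. A minimal standalone sketch of that parsing step (``split_weights`` is an illustrative helper, not a function from the example code):

def split_weights(src_tokens):
    # The last token is treated as 'w1,w2,...'; a single field means no
    # weights were supplied and the source line is left unchanged.
    weights = src_tokens[-1].split(',')
    if len(weights) > 1:
        return src_tokens[:-1], [float(w) for w in weights]
    return src_tokens, None

split_weights(['1', '123', '432', '2', '0.5,0.5'])
# -> (['1', '123', '432', '2'], [0.5, 0.5])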