Example #1
# `const`, `read_file`, `_wmt_to_labels`, `_split_wmt18` and `_read_sentence_scores`
# are constants/helpers defined elsewhere in the evaluation pipeline module.
def retrieve_gold_standard(pipeline_options, is_wmt18_format):
    """Read the gold-standard files selected in ``pipeline_options``.

    Returns a dict mapping output names (target tags, gap tags, source tags,
    sentence scores) to their gold labels or scores.
    """
    golds = {}
    # handling of gold target (in the WMT18 format, gap tags come interleaved)
    if pipeline_options.gold_target:
        gold_target = _wmt_to_labels(read_file(pipeline_options.gold_target))
        if is_wmt18_format:
            gold_target, gold_gaps = _split_wmt18(gold_target)
            golds[const.GAP_TAGS] = gold_gaps
        golds[const.TARGET_TAGS] = gold_target
    # handling of gold source
    if pipeline_options.gold_source:
        gold_source = _wmt_to_labels(read_file(pipeline_options.gold_source))
        golds[const.SOURCE_TAGS] = gold_source
    # handling of gold sentences
    if pipeline_options.gold_sents:
        gold_sentences = _read_sentence_scores(pipeline_options.gold_sents)
        golds[const.SENTENCE_SCORES] = gold_sentences
    return golds
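Both examples rely on a _wmt_to_labels helper that is defined elsewhere in the pipeline. As a minimal sketch only, assuming read_file yields one list of OK/BAD tag strings per sentence and using hypothetical label indices (the names _wmt_to_labels_sketch and _LABEL_IDS are chosen here for illustration), the conversion could look like this:

# Hypothetical sketch, not the pipeline's actual helper.
# Assumes each sentence arrives as a list of "OK"/"BAD" tag strings and that
# BAD maps to 0 and OK to 1 (the real label indices may differ).
_LABEL_IDS = {"BAD": 0, "OK": 1}

def _wmt_to_labels_sketch(tag_sequences):
    """Map WMT OK/BAD tag strings to integer labels, sentence by sentence."""
    return [[_LABEL_IDS[tag] for tag in sentence] for sentence in tag_sequences]

# Example: _wmt_to_labels_sketch([["OK", "BAD", "OK"]]) -> [[1, 0, 1]]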
Example #2
from pathlib import Path


def retrieve_predictions(pipeline_options, is_wmt18_pred_format):
    """Read every prediction file selected in ``pipeline_options``.

    Returns a dict mapping output names to lists of ``(file_name, predictions)``
    tuples, so several prediction files can be evaluated per output.
    """
    pred_files = {target: [] for target in const.TARGETS}
    # target tags (in the WMT18 format they come interleaved with gap tags)
    if pipeline_options.pred_target:
        for pred_file in pipeline_options.pred_target:
            pred_target = read_file(pred_file)
            if is_wmt18_pred_format:
                pred_target, pred_gaps = _split_wmt18(pred_target)
                pred_files[const.GAP_TAGS].append((str(pred_file), pred_gaps))
            pred_files[const.TARGET_TAGS].append((str(pred_file), pred_target))
    # gap tags provided in separate files
    if pipeline_options.pred_gaps:
        for pred_file in pipeline_options.pred_gaps:
            pred_gaps = read_file(pred_file)
            pred_files[const.GAP_TAGS].append((str(pred_file), pred_gaps))
    # source tags
    if pipeline_options.pred_source:
        for pred_file in pipeline_options.pred_source:
            pred_source = read_file(pred_file)
            pred_files[const.SOURCE_TAGS].append((str(pred_file), pred_source))
    # sentence-level scores
    if pipeline_options.pred_sents:
        for pred_file in pipeline_options.pred_sents:
            pred_sents = _read_sentence_scores(pred_file)
            pred_files[const.SENTENCE_SCORES].append(
                (str(pred_file), pred_sents))
    # whole directories: pick up any file named after a known output
    if pipeline_options.input_dir:
        for input_dir in pipeline_options.input_dir:
            input_dir = Path(input_dir)
            for target in const.TAGS:
                pred_file = input_dir.joinpath(target)
                if pred_file.exists() and pred_file.is_file():
                    pred_files[pred_file.name].append(
                        (str(pred_file), read_file(pred_file)))
            for target in [const.SENTENCE_SCORES, const.BINARY]:
                pred_file = input_dir.joinpath(target)
                if pred_file.exists() and pred_file.is_file():
                    pred_files[pred_file.name].append(
                        (str(pred_file),
                         _read_sentence_scores(str(pred_file))))

    # Numericalize text labels (OK/BAD strings -> integer labels)
    if pipeline_options.type == "tags":
        for tag_name in const.TAGS:
            for i, (fname, pred_tags) in enumerate(pred_files[tag_name]):
                pred_files[tag_name][i] = (fname, _wmt_to_labels(pred_tags))
    return pred_files
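The WMT18 handling in both examples also depends on a _split_wmt18 helper defined elsewhere. In the WMT18 word-level format the target-side file interleaves gap and word tags (a gap tag before each word and one after the last word), so a sentence with N words carries 2N + 1 tags. A minimal sketch under that assumption, with the function name _split_wmt18_sketch chosen here for illustration:

# Hypothetical sketch, not the pipeline's actual helper.
# Assumes the WMT18 interleaving: gap, word, gap, ..., word, gap per sentence.
def _split_wmt18_sketch(interleaved_sequences):
    """Separate interleaved WMT18 tags into (word_tags, gap_tags)."""
    word_tags = [sentence[1::2] for sentence in interleaved_sequences]
    gap_tags = [sentence[0::2] for sentence in interleaved_sequences]
    return word_tags, gap_tags

# Example: _split_wmt18_sketch([["OK", "BAD", "OK", "OK", "OK"]])
#          -> ([["BAD", "OK"]], [["OK", "OK", "OK"]])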