def retrieve_gold_standard(pipeline_options, is_wmt18_format):
    """Load whichever gold-standard annotations the options point at.

    Args:
        pipeline_options: options object; the attributes read here are
            ``gold_target``, ``gold_source`` and ``gold_sents``. A falsy
            value skips the corresponding entry.
        is_wmt18_format: when truthy, the target labels are passed through
            ``_split_wmt18`` and the second element of its result is stored
            under ``const.GAP_TAGS``.

    Returns:
        dict with an entry only for each option that was set, keyed by
        ``const.GAP_TAGS``, ``const.TARGET_TAGS``, ``const.SOURCE_TAGS``
        and ``const.SENTENCE_SCORES``. Empty when nothing is configured.
    """
    gold = {}

    # Target tags (and, for the WMT18 layout, the gap tags split from them).
    target_path = pipeline_options.gold_target
    if target_path:
        target_labels = _wmt_to_labels(read_file(target_path))
        if is_wmt18_format:
            target_labels, gap_labels = _split_wmt18(target_labels)
            gold[const.GAP_TAGS] = gap_labels
        gold[const.TARGET_TAGS] = target_labels

    # Source tags.
    source_path = pipeline_options.gold_source
    if source_path:
        gold[const.SOURCE_TAGS] = _wmt_to_labels(read_file(source_path))

    # Sentence-level scores.
    sentences_path = pipeline_options.gold_sents
    if sentences_path:
        gold[const.SENTENCE_SCORES] = _read_sentence_scores(sentences_path)

    return gold
def retrieve_predictions(pipeline_options, is_wmt18_pred_format):
    """Gather prediction files for every output named in ``const.TARGETS``.

    Args:
        pipeline_options: options object; the attributes read here are
            ``pred_target``, ``pred_gaps``, ``pred_source``, ``pred_sents``
            and ``input_dir`` (each an iterable of paths, or falsy to skip)
            plus ``type``.
        is_wmt18_pred_format: when truthy, each target prediction file is
            passed through ``_split_wmt18`` and the second element of its
            result is also recorded under ``const.GAP_TAGS``.

    Returns:
        dict mapping each name in ``const.TARGETS`` to a list of
        ``(file name as str, predictions)`` tuples, in the order they
        were read.
    """
    predictions = {name: [] for name in const.TARGETS}

    # Explicitly listed prediction files; `or ()` skips unset options.
    for path in pipeline_options.pred_target or ():
        target_preds = read_file(path)
        if is_wmt18_pred_format:
            target_preds, gap_preds = _split_wmt18(target_preds)
            predictions[const.GAP_TAGS].append((str(path), gap_preds))
        predictions[const.TARGET_TAGS].append((str(path), target_preds))

    for path in pipeline_options.pred_gaps or ():
        predictions[const.GAP_TAGS].append((str(path), read_file(path)))

    for path in pipeline_options.pred_source or ():
        predictions[const.SOURCE_TAGS].append((str(path), read_file(path)))

    for path in pipeline_options.pred_sents or ():
        predictions[const.SENTENCE_SCORES].append(
            (str(path), _read_sentence_scores(path))
        )

    # Directories are probed for files named after each known output.
    for directory in pipeline_options.input_dir or ():
        directory = Path(directory)
        for name in const.TAGS:
            candidate = directory / name
            if candidate.exists() and candidate.is_file():
                predictions[candidate.name].append(
                    (str(candidate), read_file(candidate))
                )
        for name in (const.SENTENCE_SCORES, const.BINARY):
            candidate = directory / name
            if candidate.exists() and candidate.is_file():
                predictions[candidate.name].append(
                    (str(candidate), _read_sentence_scores(str(candidate)))
                )

    # Numericalize text labels, replacing each entry in place.
    if pipeline_options.type == "tags":
        for tag_name in const.TAGS:
            entries = predictions[tag_name]
            for index, (fname, tags) in enumerate(entries):
                entries[index] = (fname, _wmt_to_labels(tags))

    return predictions