예제 #1
0
파일: common.py 프로젝트: xingniu/sockeye
def test_translate_equivalence(data: Dict[str, Any],
                               translate_params_equiv: str,
                               compare_output: bool):
    """
    Re-runs sockeye.translate with translate_params_equiv and checks the result against the reference
    outputs already stored in the data dictionary.

    :param data: Test state dictionary. Reads 'work_dir', 'model', 'test_source', 'test_outputs' and,
                 when present, 'test_source_factors'.
    :param translate_params_equiv: Additional command-line arguments appended to the translate call.
    :param compare_output: If True, translations must match exactly and scores must differ by less than
                           0.01 (or the difference must be NaN). If False, only the number of outputs
                           is compared.
    """
    equiv_out_path = os.path.join(data['work_dir'], "test.out.equiv")

    # Assemble the command line: script path, shared arguments, then the variant-specific arguments.
    script_path = sockeye.translate.__file__
    common_args = TRANSLATE_PARAMS_COMMON.format(model=data['model'],
                                                 input=data['test_source'],
                                                 output=equiv_out_path)
    command_line = "{} {} {}".format(script_path, common_args, translate_params_equiv)
    if 'test_source_factors' in data:
        factor_paths = " ".join(data['test_source_factors'])
        command_line += TRANSLATE_WITH_FACTORS_COMMON.format(input_factors=factor_paths)

    # sockeye.translate.main() reads its arguments from sys.argv, so swap it in for the call.
    with patch.object(sys, "argv", command_line.split()):
        sockeye.translate.main()

    # Collect translate outputs and scores
    equiv_outputs = collect_translate_output_and_scores(equiv_out_path)

    assert 'test_outputs' in data
    reference_outputs = data['test_outputs']
    assert len(reference_outputs) == len(equiv_outputs)
    if not compare_output:
        return

    for reference, candidate in zip(reference_outputs, equiv_outputs):
        assert reference['translation'] == candidate['translation']
        score_delta = reference['score'] - candidate['score']
        # A NaN difference is accepted as equivalent.
        assert abs(score_delta) < 0.01 or np.isnan(score_delta)
예제 #2
0
파일: common.py 프로젝트: xingniu/sockeye
def test_constrained_decoding_against_ref(data: Dict[str, Any], translate_params: str):
    """
    Decodes inputs built by create_reference_constraints (from the test inputs and the reference outputs)
    and checks that every constrained translation equals the stored unconstrained translation.

    The translate call is made with JSON input, beam size 1, batch size 1 and nbest size 1.

    :param data: Test state dictionary. Reads 'test_inputs', 'test_outputs', 'work_dir' and 'model';
                 'test_constrained_inputs' and 'test_constrained_outputs' are written to it.
    :param translate_params: Additional command-line arguments for the translate call.
    :return: The same data dictionary, updated with the constrained inputs and outputs.
    """
    constrained_inputs = create_reference_constraints(data['test_inputs'], data['test_outputs'])

    # Write one constrained input per line; this file is the source for the translate call.
    work_dir = data['work_dir']
    constrained_source_path = os.path.join(work_dir, "test_constrained.txt")
    with open(constrained_source_path, 'w') as source_file:
        for constrained_line in constrained_inputs:
            print(constrained_line, file=source_file)

    constrained_out_path = os.path.join(work_dir, "out_constrained.txt")
    command_template = "{} {} {} --json-input --output-type translation_with_score --beam-size 1 --batch-size 1 --nbest-size 1"
    script_path = sockeye.translate.__file__
    common_args = TRANSLATE_PARAMS_COMMON.format(model=data['model'],
                                                 input=constrained_source_path,
                                                 output=constrained_out_path)
    command_line = command_template.format(script_path, common_args, translate_params)

    # sockeye.translate.main() reads its arguments from sys.argv, so swap it in for the call.
    with patch.object(sys, "argv", command_line.split()):
        sockeye.translate.main()

    constrained_outputs = collect_translate_output_and_scores(constrained_out_path)
    reference_outputs = data['test_outputs']
    assert len(constrained_outputs) == len(reference_outputs) == len(constrained_inputs)

    # Make sure the constrained output is the same as we got when decoding unconstrained
    for _, constrained, unconstrained in zip(constrained_inputs, constrained_outputs, reference_outputs):
        assert constrained['translation'] == unconstrained['translation']

    data['test_constrained_inputs'] = constrained_inputs
    data['test_constrained_outputs'] = constrained_outputs
    return data
예제 #3
0
def _test_constrained_type(constraint_type: str, data: Dict[str, Any],
                           translate_params: str):
    """
    Decodes JSON inputs that carry constraints of the given type and checks the outputs against them.

    For inputs without an entry under constraint_type, the constrained translation must equal the stored
    unconstrained translation. Otherwise only the first constraint of the input is checked: it must occur
    in the translation when constraint_type is 'constraints', and must not occur for any other type.

    :param constraint_type: Key under which constraints are stored in each JSON input. 'constraints' is
                            treated as positive constraints, any other value as negative constraints.
    :param data: Test state dictionary. Reads 'test_inputs', 'test_outputs', 'work_dir' and 'model'.
    :param translate_params: Additional command-line arguments for the translate call.
    """
    constrained_inputs = _create_constrained_inputs(constraint_type,
                                                    data['test_inputs'],
                                                    data['test_outputs'])
    # Write one JSON input per line; this file is the source for the translate call.
    new_test_source_path = os.path.join(data['work_dir'],
                                        "test_constrained.txt")
    with open(new_test_source_path, 'w') as out:
        for json_line in constrained_inputs:
            print(json_line, file=out)
    out_path_constrained = os.path.join(data['work_dir'],
                                        "out_constrained.txt")
    params = "{} {} {} --json-input --output-type translation_with_score".format(
        sockeye.translate.__file__,
        TRANSLATE_PARAMS_COMMON.format(model=data['model'],
                                       input=new_test_source_path,
                                       output=out_path_constrained),
        translate_params)
    # sockeye.translate.main() reads its arguments from sys.argv, so swap it in for the call.
    with patch.object(sys, "argv", params.split()):
        sockeye.translate.main()
    constrained_outputs = collect_translate_output_and_scores(
        out_path_constrained)
    assert len(constrained_outputs) == len(
        data['test_outputs']) == len(constrained_inputs)
    for json_source, json_constrained, json_unconstrained in zip(
            constrained_inputs, constrained_outputs, data['test_outputs']):
        jobj = json.loads(json_source)
        if jobj.get(constraint_type) is None:
            # if there were no constraints, make sure the output is the same as the unconstrained output.
            # Both sides are compared on 'translation'; the previous lookup of 'json_constrained' on the
            # reference output raised KeyError instead of comparing anything.
            assert json_constrained['translation'] == json_unconstrained[
                'translation']
        else:
            # Only the first constraint of each input is verified.
            restriction = jobj[constraint_type][0]
            if constraint_type == 'constraints':
                # for positive constraints, ensure the constraint is in the constrained output
                assert restriction in json_constrained['translation']
            else:
                # for negative constraints, ensure the constraint is *not* in the constrained output
                assert restriction not in json_constrained['translation']
예제 #4
0
def test_translate_equivalence(data: Dict[str, Any],
                               translate_params_equiv: str,
                               compare_output: bool):
    """
    Re-runs sockeye.translate twice with translate_params_equiv (once on the JSON source carrying a target
    prefix, once on the plain source) and checks both results against the reference outputs stored in
    the data dictionary.

    :param data: Test state dictionary. Reads 'work_dir', 'model', 'test_source',
                 'test_source_with_target_prefix', 'test_outputs', 'test_with_target_prefix_outputs'
                 and, when present, 'test_source_factors'.
    :param translate_params_equiv: Additional command-line arguments appended to both translate calls.
    :param compare_output: If True, translations must match exactly, scores must differ by less than
                           0.01 (or the difference must be NaN), and the reference target-prefix outputs
                           must begin with their target prefix (tokens and factors). If False, only the
                           output counts are compared.
    """
    work_dir = data['work_dir']
    out_path = os.path.join(work_dir, "test.out.equiv")
    out_with_target_prefix_path = os.path.join(work_dir, "test_with_target_prefix.out.equiv")
    script_path = sockeye.translate.__file__

    def run_translate(command_line: str):
        """Calls sockeye.translate.main() with the whitespace-split command line installed as sys.argv."""
        with patch.object(sys, "argv", command_line.split()):
            sockeye.translate.main()

    # First set of params (with target prefix in JSON format)
    prefix_common_args = TRANSLATE_PARAMS_COMMON.format(model=data['model'],
                                                        input=data['test_source_with_target_prefix'],
                                                        output=out_with_target_prefix_path)
    prefix_command = "{} {} {}".format(script_path, prefix_common_args, translate_params_equiv)
    prefix_command += TRANSLATE_WITH_JSON_FORMAT
    run_translate(prefix_command)

    # Collect translate outputs and scores
    translate_outputs_with_target_prefix_equiv = collect_translate_output_and_scores(
        out_with_target_prefix_path)

    # Second set of params (without using target prefix)
    plain_common_args = TRANSLATE_PARAMS_COMMON.format(model=data['model'],
                                                       input=data['test_source'],
                                                       output=out_path)
    plain_command = "{} {} {}".format(script_path, plain_common_args, translate_params_equiv)
    if 'test_source_factors' in data:
        factor_paths = " ".join(data['test_source_factors'])
        plain_command += TRANSLATE_WITH_FACTORS_COMMON.format(input_factors=factor_paths)
    run_translate(plain_command)

    # Collect translate outputs and scores
    translate_outputs_equiv = collect_translate_output_and_scores(out_path)

    assert 'test_outputs' in data
    assert 'test_with_target_prefix_outputs' in data
    reference_outputs = data['test_outputs']
    reference_prefix_outputs = data['test_with_target_prefix_outputs']
    assert (len(reference_outputs)
            == len(reference_prefix_outputs)
            == len(translate_outputs_with_target_prefix_equiv)
            == len(translate_outputs_equiv))
    if not compare_output:
        return

    aligned_results = zip(reference_outputs,
                          reference_prefix_outputs,
                          translate_outputs_equiv,
                          translate_outputs_with_target_prefix_equiv)
    for (json_output,
         json_output_with_target_prefix,
         json_output_equiv,
         json_output_with_target_prefix_equiv) in aligned_results:
        # Translations must match exactly for both runs.
        assert json_output['translation'] == json_output_equiv['translation'], \
            f"'{json_output['translation']}' vs. '{json_output_equiv['translation']}'"
        assert json_output_with_target_prefix['translation'] == json_output_with_target_prefix_equiv['translation'], \
            f"'{json_output_with_target_prefix['translation']}' vs. '{json_output_with_target_prefix_equiv['translation']}'"

        # Scores must agree within 0.01; a NaN difference is accepted as equivalent.
        score_delta = json_output['score'] - json_output_equiv['score']
        assert abs(score_delta) < 0.01 or np.isnan(score_delta), \
            f"'{json_output['score']}' vs. '{ json_output_equiv['score']}'"
        prefix_score_delta = json_output_with_target_prefix['score'] - json_output_with_target_prefix_equiv['score']
        assert abs(prefix_score_delta) < 0.01 or np.isnan(prefix_score_delta), \
            f"'{json_output_with_target_prefix['score']}' vs. '{ json_output_with_target_prefix_equiv['score']}'"

        # Check translation output always includes target prefix tokens
        prefix = json_output_with_target_prefix['target_prefix'].split()
        translation = json_output_with_target_prefix['translation'].split()
        # Compare only as many tokens as both sequences have.
        ending = min(len(prefix), len(translation))
        assert prefix[:ending] == translation[:ending], \
            f"'{prefix[:ending]}' vs. '{translation[:ending]}'"

        # Check translation output factors always include target prefix factors
        if 'target_prefix_factors' not in json_output_with_target_prefix:
            continue
        prefix = json_output_with_target_prefix['target_prefix_factors']
        # Prefix factor entries are 0-indexed, while output factor keys start at 'factor1'.
        for j in range(1, len(prefix) + 1):
            factors_from_translation = json_output_with_target_prefix[f'factor{j}']
            ending = min(len(prefix[j - 1]), len(factors_from_translation))
            assert prefix[j - 1][:ending] == factors_from_translation[:ending], \
                f"'{prefix[j - 1][:ending]}' vs. '{factors_from_translation[:ending]}' from . '{json_output_with_target_prefix}'"