Example #1
0
def evaluate_predictions(**kwargs):
    """Evaluate find_closest_pinyin over the full chinese_names dataset.

    Keyword arguments are forwarded unchanged to find_closest_pinyin.
    Prints progress every 50 rows and a final summary with the number of
    mismatched predictions and average edit distances.
    """
    total = search_utils.chinese_names.shape[0]
    distance = 0
    diff_count = 0
    for row_i, row in search_utils.chinese_names.iterrows():
        if row_i % 50 == 0:
            print("{}% complete".format(100 * row_i / total))
        english, _, _, target_pinyin, _, _, _ = row
        english = search_utils.normalize(english)
        # Strip spaces so the target is a single syllable string,
        # comparable to the joined model output below.
        target_pinyin = ''.join(
            filter(lambda x: x != ' ', search_utils.normalize(target_pinyin)))
        # Pad vowel-initial/final names with sentinel consonants
        # ('S' start, 'E' end) before searching — presumably the search
        # needs consonant anchors at the boundaries; confirm in search_utils.
        if search_utils.is_vowel(english[0]): english = 'S' + english
        if search_utils.is_vowel(english[-1]): english = english + 'E'
        output_pinyin = ''.join(find_closest_pinyin(english, **kwargs)[1])
        if output_pinyin != target_pinyin:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_pinyin, output_pinyin)

    # Guard: if every prediction matched, diff_count == 0 and the
    # per-mismatch average would raise ZeroDivisionError.
    avg_diff = distance / diff_count if diff_count else 0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(total, diff_count, distance / total, avg_diff))
Example #2
0
def evaluate_predictions():
    """Evaluate the baseline transliterator over the chinese_names dataset.

    Runs baseline() on each English name, converts the result to pinyin,
    and compares it against the normalized target. Prints progress every
    20 rows and a final summary with mismatch counts and average edit
    distances.
    """
    all_names = search_utils.chinese_names
    total = all_names.shape[0]
    distance = 0
    diff_count = 0
    for row_i, row in all_names.iterrows():
        if row_i % 20 == 0:
            print("{}% complete".format(100 * row_i / total))
        english, _, _, target_pinyin, _, _, _ = row

        result = baseline(english)
        result_pinyin = pinyin(result)

        # Strip spaces so both sides compare as single syllable strings.
        target_pinyin = ''.join(
            filter(lambda x: x != ' ', search_utils.normalize(target_pinyin)))
        # pinyin() appears to return a list of segments; take the first
        # element of each and concatenate — confirm against its API.
        output_pinyin = ''.join([seg[0] for seg in result_pinyin])
        if output_pinyin != target_pinyin:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_pinyin, output_pinyin)

    # Guard: if every prediction matched, diff_count == 0 and the
    # per-mismatch average would raise ZeroDivisionError.
    avg_diff = distance / diff_count if diff_count else 0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(total, diff_count, distance / total, avg_diff))
Example #3
0
def process_baseline(oracle_csv):
    """Compare the precomputed baseline pinyin against both oracle columns.

    Loads the oracle spreadsheet and the stored baseline responses, prints
    per-name edit distances for every name where either oracle annotation
    differs from the English spelling, and returns a tuple of
    (average distance over ALL names, number of differing names, total names).
    """
    oracle_df = pd.ExcelFile(oracle_csv).parse('Sheet1')
    baseline_df = pd.ExcelFile(
        os.path.join("..", "data", "proposal",
                     "BaselineResponses.xlsx")).parse('Sheet1')
    english_names = oracle_df["English"]

    total_distance = 0
    mismatches = 0
    rows = zip(english_names, oracle_df["Pinyin_O1"],
               oracle_df["Pinyin_O2"], baseline_df["Baseline"])
    for english, oracle1, oracle2, baseline_pinyin in rows:
        # Skip names both oracles left identical to the English spelling.
        if english == oracle1 and english == oracle2:
            continue
        mismatches += 1
        print(baseline.baseline(english))
        d1 = edit_distance.edit_distance_pinyin(baseline_pinyin, oracle1)
        print("Distance between", baseline_pinyin, "and", oracle1, ":", d1)
        d2 = edit_distance.edit_distance_pinyin(baseline_pinyin, oracle2)
        print("Distance between", baseline_pinyin, "and", oracle2, ":", d2)
        total_distance += (d1 + d2) / 2

    # Average over ALL names, not just the mismatched pairs.
    return (total_distance / len(english_names), mismatches,
            len(english_names))
Example #4
0
def process_oracle(oracle_csv):
    """Measure inter-annotator agreement between the two pinyin oracles.

    Prints the edit distance for each pair of annotations that disagree
    and returns (average distance over ALL names, number of disagreements,
    total number of names).
    """
    sheet = pd.ExcelFile(oracle_csv).parse('Sheet1')
    first_oracle = sheet["Pinyin_O1"]
    second_oracle = sheet["Pinyin_O2"]

    total_distance = 0
    disagreements = 0
    for first, second in zip(first_oracle, second_oracle):
        if first == second:
            continue
        disagreements += 1
        # edit_distance_pinyin: 1. penalizes wrong tones as 0.5;
        # 2. doesn't do the "count characters in common" thing because
        # these are longer.
        dist = edit_distance.edit_distance_pinyin(first, second)
        # Only print pairs that actually differ.
        print("Distance between", first, "and", second, ":", dist)
        total_distance += dist

    # Average over ALL names, not just the ones that were wrong.
    return (total_distance / len(first_oracle), disagreements,
            len(first_oracle))
Example #5
0
def evaluateAllLines(encoder, decoder):
    """Run the encoder/decoder model over every pair and report accuracy.

    For each (english, target) pair in the module-level `pairs`, decodes a
    name, strips the trailing <EOS> token and all spaces, and accumulates
    the edit distance for mismatches. Prints a summary line at the end.
    """
    distance = 0
    diff_count = 0
    for pair in pairs:
        output_name = evaluate(encoder, decoder, pair[0])
        # Drop the trailing <EOS> token, then remove spaces so the edit
        # distance is computed consistently with the baseline.
        output_name = ''.join(
            filter(lambda l: l != ' ', output_name[:-1]))
        target_name = ''.join(filter(lambda l: l != ' ', pair[1]))
        if output_name != target_name:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_name, output_name)

    # Guard: if every prediction matched, diff_count == 0 and the
    # per-mismatch average would raise ZeroDivisionError.
    avg_diff = distance / diff_count if diff_count else 0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(len(pairs), diff_count, distance / len(pairs), avg_diff))