Exemplo n.º 1
0
def enc_first(val, **kwargs):
    """Return the phonetic code of the first result pair from ``encode``.

    ``encode`` yields (word, code) pairs; presumably this picks the code of
    the first word — TODO confirm against the ``encode`` implementation.
    Keyword arguments are forwarded unchanged.
    """
    first_pair = encode(val, **kwargs)[0]
    return first_pair[1]
Exemplo n.º 2
0
                round(score * 20))

            # Determine SENTENCE_SIMILARITY
            score = 0
            for word in stripped_sentence:
                score += variable_vocabulary[variable][word]

            score = 1 - ((score / 4) - min_variable_occurence) / (
                max_variable_occurence - min_variable_occurence)
            scores[subject][variable][Scores.SENTENCE_SIMILARITY] = int(
                round(score * 20))

            # Determine RHYTHMIC_SCORE
            score = 0

            phonetic_result = cologne_phonetics.encode(
                sentence.strip(" ,;.:!?").lower())

            sounds_to_word_groups = {}

            for i in range(len(phonetic_result)):
                for j in range(i + 1, len(phonetic_result)):
                    (word1, sound1) = phonetic_result[i]
                    (word2, sound2) = phonetic_result[j]

                    for x in range(1, 1 + min(len(sound1), len(sound2))):

                        if ((sound1[-x:] == sound2[-x:])
                                and not (word1 == word2)):

                            adj_word1 = word1
                            adj_word2 = word2
Exemplo n.º 3
0
 def test_returns_altered(self):
     """Umlauts are transliterated: 'bäTes' comes back lower-cased as 'baetes'."""
     result = encode("bäTes")
     first_word = result[0][0]
     self.assertEqual(first_word, "baetes")
Exemplo n.º 4
0
 def test_case_insensitive(self):
     """Encoding must not depend on letter case."""
     lower_result = encode("foo")
     mixed_result = encode("FoO")
     self.assertEqual(lower_result, mixed_result)
Exemplo n.º 5
0
 def test_concatenation(self):
     """Check hyphen handling with and without ``concat``.

     By default a hyphen splits words just like a space does; with
     ``concat=True`` a hyphenated token is kept as a single word.
     """
     # FIX: assertTrue(a == b) -> assertEqual(a, b) — same pass/fail
     # semantics, but a diagnostic diff on failure.
     self.assertEqual(encode("a-a"), encode("a a"))
     self.assertEqual(encode("a-a", concat=True), [("a-a", '0')])
     self.assertEqual(encode("a a", concat=True), [('a', '0'), ('a', '0')])
Exemplo n.º 6
0
 def multiple_before(self, char=None, before=None, exp=None):
     """Assert that ``char`` followed by each character in ``before``
     encodes to the expected result ``exp``.

     Helper for table-driven checks of context-sensitive encoding rules.
     """
     for following in before:
         self.assertEqual(encode(char + following), exp)
Exemplo n.º 7
0
def authores_features(segcite, sowiport):
    """Build author-comparison features between a citation segment and a sowiport record.

    Parameters
    ----------
    segcite : citation segmentation result; passed through to
        ``aux_author_features_gen`` — exact structure defined there.
    sowiport : sowiport record; passed through likewise.

    Returns
    -------
    list
        Ten features, in order:
        [First_author_OO, author_oo, First_author_Exact, First_author_Phono,
         Pro_seg_fa, leven_pr_exact, leven_pr_phono, Jacard_pr_exact,
         jacard_pr_phono, jacard_pr_exact_scr]
    """
    prep_data_ss = aux_author_features_gen(segcite, sowiport)

    # First author features
    # ============================
    # Both sides must expose at least one last name to compare first authors.
    if len(list(prep_data_ss["first_aut_ln"])) > 0 and len(prep_data_ss['sowi_ln_norm_ls']):
        First_author_OO = 1
        # Exact match on the normalized first last name.
        if normalizeinput_author1(prep_data_ss["ln_ls_order"][0])[0] == prep_data_ss["sowi_ln_norm_ls"][0]:
            First_author_Exact = 1
        else:
            First_author_Exact = 0

        # Phonetic (Cologne phonetics) match on the first last name.
        if len(prep_data_ss["ln_ls_order"]) > 0 and len(prep_data_ss["sowi_ls_ls"]) > 0:
            if cologne_phonetics.encode(prep_data_ss["ln_ls_order"][0])[0][1] == \
                    cologne_phonetics.encode(prep_data_ss["sowi_ls_ls"][0])[0][1]:
                First_author_Phono = 1
            else:
                First_author_Phono = 0
        else:
            First_author_Phono = 0
        Pro_seg_fa = float(list(prep_data_ss['first_aut_ln'].values())[0])
    else:
        First_author_OO = 0
        First_author_Exact = 0
        First_author_Phono = 0
        lsfatemp = list(prep_data_ss['first_aut_ln'].values())
        # Score of the first author if any was extracted at all, else 0.
        Pro_seg_fa = float(lsfatemp[0]) if lsfatemp else 0

    # Levenshtein / Jaccard on exact and phonetic name forms
    # ==============================
    lower_ls_sowi_ls = [x.lower().strip() for x in prep_data_ss['sowi_ls_ls']]
    lower_ls_segcite_ls = [x.lower().strip() for x in prep_data_ss['ln_ls_order']]

    phono_ls_sowi_ls = [cologne_phonetics.encode(x.lower().strip())[0][1] for x in prep_data_ss['sowi_ls_ls']]
    phono_ls_seg_ls = [cologne_phonetics.encode(x.lower().strip())[0][1] for x in prep_data_ss['ln_ls_order']]
    if len(lower_ls_sowi_ls) > 0 and len(lower_ls_segcite_ls) > 0:
        if len(prep_data_ss['sowi_ls_ls']) > 0 and len(prep_data_ss['sowi_gn_ls']) > 0 and len(
                prep_data_ss['ln_ls_order']) and len(prep_data_ss['gn_aux_dict']):
            try:
                # When given names are available, compare "last name + first
                # initial" strings instead of bare last names.
                ln_x = []
                for inx, dls in enumerate(prep_data_ss['ln_ls_order']):
                    ln_x.append(
                        normalizeinput_author1(dls)[0] + prep_data_ss['gn_aux_dict'].get(inx, [""])[0][0].lower())
                ln_y = []
                for ixy, it_ls in enumerate(prep_data_ss['sowi_ls_ls']):
                    ln_y.append(normalizeinput_author1(it_ls)[0] + prep_data_ss['sowi_gn_ls'][ixy][0][0])
                leven_pr_exact = levenshtein(ln_x, ln_y)
            except Exception:
                # FIX: was a bare `except:`. Keep the deliberate best-effort
                # fallback to bare last names, but stop swallowing
                # SystemExit/KeyboardInterrupt.
                leven_pr_exact = levenshtein(lower_ls_sowi_ls, lower_ls_segcite_ls)
        else:
            leven_pr_exact = levenshtein(lower_ls_sowi_ls, lower_ls_segcite_ls)
        leven_pr_phono = levenshtein(phono_ls_sowi_ls, phono_ls_seg_ls)

        Jacard_pr_exact = jaccard_similarity(lower_ls_sowi_ls, lower_ls_segcite_ls)
        jacard_pr_phono = jaccard_similarity(phono_ls_sowi_ls, phono_ls_seg_ls)
        author_oo = 1
    else:
        leven_pr_exact = 0
        leven_pr_phono = 0
        Jacard_pr_exact = 0
        jacard_pr_phono = 0
        author_oo = 0

    # Score-weighted Jaccard: sum the per-name scores of last names present
    # on both sides, normalized by the size of the union of both name lists.
    intersect = list(set(lower_ls_sowi_ls).intersection(set(lower_ls_segcite_ls)))
    # ==============================
    ls_pr_ln_temp = [
        float(score)
        for item_key, score in prep_data_ss["ln_dict_score"].items()
        if item_key.lower() in intersect
    ]
    if ls_pr_ln_temp:
        # FIX: the denominator previously concatenated lower_ls_sowi_ls with
        # itself (copy-paste typo), which made it just len(set(sowi names)).
        # The Jaccard-style denominator is the union of BOTH name lists.
        jacard_pr_exact_scr = sum(ls_pr_ln_temp) / len(set(lower_ls_sowi_ls + lower_ls_segcite_ls))
    else:
        jacard_pr_exact_scr = 0

    return [First_author_OO, author_oo, First_author_Exact, First_author_Phono, Pro_seg_fa, leven_pr_exact,
            leven_pr_phono, Jacard_pr_exact, jacard_pr_phono, jacard_pr_exact_scr]