Example #1
def similarityMetrics(df_train):

    #### -- Distances: Levenshtein, Cosine, Jaccard
    ## -- Search term vs Product Title
    # Levenshtein similarity (1 - normalized distance) between 'PT_text' & 'ST_text'
    df_train['Leven_sim_PT'] = df_train.apply(
        lambda x: 1 - stringdist.levenshtein_norm(x['PT_text'], x['ST_text']),
        axis=1)

    df_train['JC_sim_PT'] = df_train.apply(
        lambda x: utils.get_jaccard_sim(x['PT_stem'], x['ST_stem']), axis=1)

    # cosine
    df_train['Cosine_sim_PT'] = df_train.apply(lambda x: utils.get_cosine_sim(
        ' '.join(x['PT_stem']), ' '.join(x['ST_stem'])),
                                               axis=1)

    ## -- Search term vs Description
    # Levenshtein similarity between 'Keywords_Descr' (joined as text) & 'ST_text'
    df_train['Leven_sim_PD'] = df_train.apply(
        lambda x: 1 - stringdist.levenshtein_norm(
            ' '.join(x['Keywords_Descr']), x['ST_text']),
        axis=1)

    # Jaccard similarity
    df_train['JC_sim_PD'] = df_train.apply(
        lambda x: utils.get_jaccard_sim(x['PD_stem'], x['ST_stem']), axis=1)

    df_train['Cosine_sim_PD'] = df_train.apply(lambda x: utils.get_cosine_sim(
        ' '.join(x['PD_stem']), ' '.join(x['ST_stem'])),
                                               axis=1)

    ## -- Search term vs Attributes
    df_train['Atrr_stem'] = df_train['Atrr_stem'].apply(
        lambda d: d if isinstance(d, list) else [])

    # Levenshtein similarity between joined 'Atrr_stem' & 'ST_text' (via utils.get_leven, see Example #20)
    df_train['Leven_sim_Atrr'] = df_train.apply(lambda x: utils.get_leven(x),
                                                axis=1)

    # Jaccard similarity
    df_train['JC_sim_Atrr'] = df_train.apply(
        lambda x: utils.get_jaccard_sim(x['Atrr_stem'], x['ST_stem']), axis=1)

    # Cosine similarity
    df_train['Cosine_sim_Atrr'] = df_train.apply(
        lambda x: utils.get_cosine_sim(' '.join(x['Atrr_stem']), ' '.join(x[
            'ST_stem'])),
        axis=1)
    df_train_sims = df_train[[
        'id', 'product_uid', 'JC_sim_PT', 'Cosine_sim_PT', 'Leven_sim_PD',
        'JC_sim_PD', 'Cosine_sim_PD', 'JC_sim_Atrr', 'Cosine_sim_Atrr',
        'Leven_sim_Atrr'
    ]]
    df_similarities = df_train_sims

    return df_train, df_similarities
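
The utils.get_jaccard_sim and utils.get_cosine_sim helpers are not part of this snippet. A minimal sketch of what they might look like (assumed implementations for illustration; the project's real utils module may differ, e.g. Example #19 passes an extra argument to get_jaccard_sim):

from collections import Counter
import math

def get_jaccard_sim(tokens_a, tokens_b):
    # Jaccard similarity of two token lists: intersection size / union size
    a, b = set(tokens_a), set(tokens_b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

def get_cosine_sim(text_a, text_b):
    # Cosine similarity between term-frequency vectors of two whitespace-separated strings
    va, vb = Counter(text_a.split()), Counter(text_b.split())
    num = sum(va[t] * vb[t] for t in set(va) & set(vb))
    den = math.sqrt(sum(c * c for c in va.values())) * math.sqrt(sum(c * c for c in vb.values()))
    return num / den if den else 0.0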
Example #2
def classify_enum_value(col_value, cde_index, g):
    output_dict = {'observedValue': str(col_value), 'permissibleValue': {}}
    query = "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(
        str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:CAN_BE] - (a:AnswerText) WHERE ID(n) = {0:d} ".format(
        cde_index)
    query += "RETURN ID(ans), 'Answer', score, a.name"
    result = query_graph(query, g)
    answer_values = result.values()
    query = "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(
        str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:EQUALS] - (con:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(n) = {0:d} ".format(
        cde_index)
    query += "RETURN ID(ans), 'Synonym', score, s.name, con.CODE"
    result = query_graph(query, g)
    syn_values = result.values()
    all_results = answer_values + syn_values
    all_results = [i for i in all_results if i[2] > FT_SEARCH_CUTOFF]
    if len(all_results) > 0:
        all_results.sort(key=lambda z: z[2], reverse=True)
        ans_index = all_results[0][0]
        ans_results = [i for i in all_results if i[0] == ans_index]
        # Now we need to choose the best synonym
        synonyms = [i for i in ans_results if i[1] == 'Synonym']
        if len(synonyms) > 0:
            # Choose the best based on 1: search score, and 2: string distance
            synonyms.sort(key=lambda z: (-z[2],
                                         stringdist.levenshtein_norm(z[3], col_value)))
            output_dict['permissibleValue']['value'] = str(synonyms[0][3])
            output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(
                synonyms[0][4])
        else:
            query = "MATCH (a:Answer) - [:EQUALS] - (c:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(a) = {0:d} RETURN c.CODE,s.name".format(
                ans_index)
            result = query_graph(query, g)
            values = result.values()
            if len(values) > 0:
                values.sort(key=lambda z: stringdist.levenshtein_norm(
                    str(col_value).lower(),
                    str(z[1]).lower()))
                output_dict['permissibleValue']['value'] = str(values[0][1])
                output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(
                    values[0][0])
            else:
                output_dict['permissibleValue']['value'] = str(
                    ans_results[0][3])
                output_dict['permissibleValue']['conceptCode'] = None
    else:
        output_dict['permissibleValue']['value'] = 'NOMATCH'
        output_dict['permissibleValue']['conceptCode'] = None
    return output_dict
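
query_graph is not defined in this snippet. If g is an open Session from the official neo4j driver, a minimal version could simply be (an assumption; the original helper may differ):

def query_graph(query, g):
    # Assumed helper: 'g' is an open neo4j Session; the Result it returns
    # supports .values(), as used above.
    return g.run(query)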
Example #3
def genetic_diversity(pop):
    dists = [
        stringdist.levenshtein_norm(pop[x].genome, pop[y].genome)
        for x in range(len(pop)) for y in range(x + 1, len(pop))
    ]
    variety = round(sum(dists) * 100 / len(dists))
    return variety
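
A quick usage check with a toy population (the Individual type below is just a stand-in for whatever object carries a .genome string):

from collections import namedtuple

Individual = namedtuple('Individual', 'genome')
pop = [Individual('ACGT'), Individual('ACGA'), Individual('TCGA')]
print(genetic_diversity(pop))  # (0.25 + 0.5 + 0.25) * 100 / 3 -> 33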
Example #4
def find_best_match(orig_name, names_list: list) -> dict:
    min_res = 1.0
    best_name = ""
    postfixes_list = [
        "AGA", "Levels", "Activision", "Sega", "Demo", "Demo1", "Demo2",
        "CD32", "CDTV", "Psygnosis", "NTSC", "Disk", "DemoPlay"
    ]
    name1 = orig_name
    for postfix in postfixes_list:
        if name1.endswith(postfix): name1 = name1[:-len(postfix)]

    if name1.endswith("Fr") or name1.endswith("De") or name1.endswith("Pl"):
        name1 = name1[:-2]

    for name2 in names_list:
        if min_res == 0.0: break
        for temp_name in permut(name2):
            res = stringdist.levenshtein_norm(name1, temp_name)
            if temp_name.replace(" ", "").lower().startswith(name1.lower()):
                min_res = 0
                best_name = name2
                break
            if res < min_res:
                min_res = res
                best_name = name2
    return {"res": min_res, "retro": orig_name, "lemon": best_name}
Example #5
def query_editdist(nsentence):
    distList = [
        sd.levenshtein_norm(str(d), nsentence)
        for d in train_set['Keyword'].values
    ]
    resind = np.argmin(distList)
    return train_set.iloc[resind]['Topic']
Example #6
def matching_NED(gold_fragments, matches_df, plot_hist=False):
    " percentage of phonemes shared by the two strings \
    normalization is done wrt number of different labels instead of frame counts "

    if len(matches_df) == 0: return 1.0

    neds = np.zeros(len(matches_df))

    for i, row in matches_df.iterrows():

        filename, start, end = (row['f1'], row['f1_start'], row['f1_end'])
        labels1 = fragment_tokenizer(gold_fragments, filename, start, end)

        filename, start, end = (row['f2'], row['f2_start'], row['f2_end'])
        labels2 = fragment_tokenizer(gold_fragments, filename, start, end)

        try:
            neds[i] = stringdist.levenshtein_norm(labels1, labels2)
        except:
            neds[i] = strdist(labels1, labels2)

    if plot_hist:
        plt.hist(neds)
        plt.title('Normalized Edit Distance Histogram')
        plt.show()

    return sum(neds) / len(neds)
Example #7
def clus_NED(gold_fragments, nodes_df, clusters_list):
    " frameler uzerinden degil, transcription uzerinden hesaplaniyor "

    if len(nodes_df) == 0: return 1.0

    P_clus = []
    for clus in clusters_list:
        for pair in itertools.combinations(clus, 2):
            P_clus.append(list(pair))

    neds = np.zeros(len(P_clus))

    for i, pair in enumerate(P_clus):

        labels = []

        for p in pair:
            labels.append(nodes_df.types[p])

        # in order not to count garbage classes from different sequences as the same label
        if (len(labels[0]) == 0) | (len(labels[1]) == 0):
            neds[i] = 1.
        else:
            try:
                neds[i] = stringdist.levenshtein_norm(labels[0], labels[1])
            except:
                neds[i] = strdist(labels[0], labels[1])

    return sum(neds) / len(neds)
Example #8
File: mining.py  Project: wafec/javadocto
def compare_with_levenshtein_distance(value_a, value_b):
    str_a = 'a' + _to_str_for_distance_calculation(value_a)
    str_b = 'a' + _to_str_for_distance_calculation(value_b)
    try:
        value = stringdist.levenshtein_norm(str_a, str_b)
        return value
    except Exception as exc:
        print(str_a, str_b)
        raise exc
Example #9
def compute_levenshtein_sim(str1, str2):
    """
    Computer the Levenshtein Similarity between two strings using 3-grams, if one string
    is not contained in the other.
    """

    if str1 in str2 or str2 in str1:
        return 1

    return 1 - stringdist.levenshtein_norm(str1, str2)
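
For example (containment short-circuits to 1; otherwise the result is 1 minus the normalized edit distance):

print(compute_levenshtein_sim("bolt", "carriage bolt"))  # 1, since "bolt" is contained in the other string
print(compute_levenshtein_sim("color", "colour"))        # ~0.833, i.e. 1 - 1/6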
Example #10
def preview_author(author="rembrandt"):
    distance = {}
    for n in filter(None, favorites.keys()):
        lh = stringdist.levenshtein_norm(author, n)
        distance[n] = lh

    chosen_author = min(distance, key=distance.get)
    art_ids = favorites[chosen_author]
    chosen_art_id = random.choice(art_ids)
    #print(chosen_art_id)
    # preview item found by keyword on device 8292
    preview_item(token, chosen_art_id)
    return(author_artwork[chosen_art_id],chosen_author)
Example #11
def mergeNames(row):
    names = [row['product_name'],row['Name']]
    if type(names[0])!=str:
        return names[1]
    if type(names[1])!=str:
        return names[0]
    
    #If names are different, we use the Levenshtein distance to measure the difference
    if names[0]!=names[1]:
        m = min(len(names[0]),len(names[1]))
        dist = stringdist.levenshtein_norm(names[0][:m],names[1][:m])
        if dist>0.3: 
            return float('NaN')
    return names[0]
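
A quick usage check with a pandas row carrying the two name columns:

import pandas as pd

row = pd.Series({'product_name': 'Nutella 400g', 'Name': 'Nutella 400 g jar'})
print(mergeNames(row))  # 'Nutella 400g' -- the distance over the common prefix is below 0.3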
Example #12
def compare_tweets(tweet):
    for row in search_tweets():
        if stringdist.levenshtein_norm(row, tweet) > DISTANCE:
            cprint(
                'GetTweetsError: New Tweet is too similar to old tweets. Trying again.'
            )
            logging.error(
                'GetTweetsError: New Tweet is too similar to old tweets. Trying again.')
            raise GetTweetsError
    # Only accept the new tweet after every old tweet has been checked
    return True
Example #13
def common_words_leven(tokens_1, tokens_2):
    common_terms = []
    tokens_1 = list(set(tokens_1))
    tokens_2 = list(set(tokens_2))

    for token1 in tokens_1:
        for token2 in tokens_2:
            if 1 - stringdist.levenshtein_norm(token1, token2) > 0.85:
                common_terms.append(token2)

    return common_terms
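
A quick usage check (output order may vary because of the set() conversion):

print(common_words_leven(['angle', 'bracket'], ['angle', 'brackets']))
# ['angle', 'brackets'] -- both pairs clear the 0.85 similarity cut-off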
Example #14
def metaphone_suggestions(word, count):
    spelling_phone = mphone(word)
    suggestions = []
    if spelling_phone in metaphone_dict:
        suggestions.extend(metaphone_dict[spelling_phone])

    additional_suggestions = []
    for eword in edit_distance_1(spelling_phone):
        if eword.upper() in metaphone_dict:
            additional_suggestions.extend(metaphone_dict[eword.upper()])
    additional_suggestions.sort(
        key=lambda x: stringdist.levenshtein_norm(x, word))
    suggestions.extend(additional_suggestions)

    suggestions = [
        sug[0].upper() + sug[1:] if word[0].upper() == word[0] else sug
        for sug in list(dict.fromkeys(suggestions)) if len(sug) > 1
    ]
    return suggestions[:count]
Example #15
def suggestions(word, count=5):
    spelling_phone = pkey(word)
    suggestions = []
    #Primary Keys
    if spelling_phone in phonetic_dict:
        suggestions.extend(phonetic_dict[spelling_phone])

    #Supplementary Keys
    if len(suggestions) < count:
        additional_suggestions = []
        for eword in edit_distance_1(spelling_phone):
            if eword.upper() in phonetic_dict:
                additional_suggestions.extend(phonetic_dict[eword.upper()])
        additional_suggestions.sort(
            key=lambda x: stringdist.levenshtein_norm(x, word))
        suggestions.extend(additional_suggestions)

    suggestions = [
        sug[0].upper() + sug[1:] if word[0].upper() == word[0] else sug
        for sug in list(dict.fromkeys(suggestions)) if len(sug) > 1
    ]
    return suggestions[:count]
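
edit_distance_1 is not shown in these two examples; it presumably generates every string one edit away from the phonetic key, in the style of the well-known Norvig spelling corrector. A minimal sketch under that assumption:

def edit_distance_1(word):
    # All candidates at Levenshtein distance 1: deletes, transposes, replaces, inserts
    # (the alphabet may need adjusting to the phonetic key's symbol set)
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)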
Example #16
def getClosestDocs(wiki_entities, entities):
    entities = list(entities)
    for i in range(len(entities)):
        entities[i] = str(entities[i])
    selected_docs = []
    for ent in entities:
        ent = ud.normalize('NFC', ent)
        if ent in wiki_entities:
            best_match = ent
        else:
            best = 1.1
            best_match = ""
            for we in wiki_entities:
                dist = stringdist.levenshtein_norm(we, ent)
                if dist < best:
                    best = dist
                    best_match = we
        best_match = best_match.replace(" ", "_")
        best_match = best_match.replace("/", "-SLH-")
        best_match = best_match.replace("(", "-LRB-")
        best_match = best_match.replace(")", "-RRB-")
        selected_docs.append(best_match)
    return selected_docs, entities
Example #17
def get_corpus_dist_set(query,
                        idf_dict,
                        normalized_tf_list,
                        soundex_dict,
                        avg,
                        inverse_mapping,
                        doc_title_list,
                        C,
                        thresh=0.2,
                        K=5,
                        wt=0.5):
    """
    This method uses other methods to:
    1. preprocess the query. 
    2. Matches the closest terms in the corpus according to stemming and levenstein distance. 
    3. Computes distance score by : wt*(lev_distance) + (1-wt)*soundex_distance
    4. Suggests closest K words with score <= threshold. 
    5. Removes the terms in close set whose idf < average idf across the corpus(to supress false positives) 
    6. Uses the get_top_K method to retrieve documents with highest score on the updated query
     """
    query = preprocess_query(query, C)
    corpus_tokens = []
    #close_term_dict = {}
    soundex = fuzzy.Soundex(4)
    #query = unicode(source, 'utf-8')
    query_tokens = nltk.word_tokenize(query)
    #print(query_tokens)
    for key in idf_dict.keys():
        corpus_tokens.append(key)
    #print(len(corpus_tokens))

    lev_query = ""
    flg = False
    for token in query_tokens:
        query_lev_dict = {}
        for term in corpus_tokens:
            query_lev_dict[term] = 0
            soundex_notation_term = soundex_dict[term]
            soundex_notation_token = get_soundex(token)
            soundex_distance = stringdist.levenshtein_norm(
                soundex_notation_term, soundex_notation_token)
            lev_distance = stringdist.levenshtein_norm(token, term)
            query_lev_dict[term] = wt * (lev_distance) + (
                1 - wt) * soundex_distance

        sorted_d = dict(
            sorted(query_lev_dict.items(),
                   key=operator.itemgetter(1),
                   reverse=False))
        #print(sorted_d)
        k = 0
        close_terms = []
        for item in sorted_d:
            if (k == K):
                break
            if (sorted_d[item] > thresh):
                break
            close_terms.append(str(item))
            #if(item == token):
            #    break
            k = k + 1

        #close_term_dict[token] = close_terms
        for i in close_terms:
            # Flag that the query was altered whenever a close term differs from the original token
            if i != token:
                flg = True
            lev_query = lev_query + i + " "
        #print(token, close_terms)
    if flg:
        print("Searching instead for " + lev_query + ":")

    get_top_K(lev_query, 10, normalized_tf_list, idf_dict, inverse_mapping,
              doc_title_list, C)
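
Step 3 of the docstring mixes plain edit distance with edit distance over Soundex codes. Reduced to a single query-token/corpus-term pair, the score looks like this (a small illustration using fuzzy.Soundex as above; get_soundex and soundex_dict are assumed to wrap the same call):

import fuzzy
import stringdist

soundex = fuzzy.Soundex(4)
token, term, wt = "recieve", "receive", 0.5
lev_distance = stringdist.levenshtein_norm(token, term)
soundex_distance = stringdist.levenshtein_norm(soundex(token), soundex(term))
score = wt * lev_distance + (1 - wt) * soundex_distance
print(score)  # small: the spellings differ only slightly and the Soundex codes match exactly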
Example #18
File: scores.py  Project: zacateras/yansp
def levenshtein_norm(gold, system):
    return stringdist.levenshtein_norm(gold.lemma, system.lemma)
Example #19
def getFeatures(df_train, df_descr, df_attr):

    #### -- Features from TRAINSET + Descriptions + Attributes
    # merge the above dataframes
    df_train = df_train.merge(df_descr,
                              left_on='product_uid',
                              right_on='product_uid',
                              how='left')
    df_train = df_train.merge(df_attr,
                              left_on='product_uid',
                              right_on='product_uid',
                              how='left')

    #### -- General counts about numerics and non numerics in Product title and Search term
    # number of numeric terms in product_title
    df_train['N_numerics_PT'] = df_train['PT_numerics'].apply(lambda x: len(x))

    # number of numeric terms in search_term
    df_train['N_numerics_ST'] = df_train['ST_numerics'].apply(lambda x: len(x))

    # number of non numeric terms in product_title
    df_train['N_non_numerics_PT'] = df_train['PT_Non_numerics'].apply(
        lambda x: len(x))

    # number of non numeric terms in search_term
    df_train['N_non_numerics_ST'] = df_train['ST_Non_numerics'].apply(
        lambda x: len(x))

    # common nonnumeric terms between 'PT_Non_numerics' & 'ST_Non_numerics' with Levenshtein distance
    df_train['Common_words_leven'] = df_train.apply(
        lambda x: utils.common_words_leven(x['PT_Non_numerics'], x[
            'ST_Non_numerics']),
        axis=1)

    #### -- Common terms between Search term & Product title
    # number of common nonnumeric terms between 'PT_Non_numerics' & 'ST_Non_numerics' with Levenshtein distance
    df_train['N_common_words_leven'] = df_train['Common_words_leven'].apply(
        lambda x: len(x))

    # Jaccard similarity based on the above common words with Levenshtein
    df_train['JC_sim'] = df_train.apply(lambda x: utils.get_jaccard_sim(
        x['PT_Non_numerics'], x['ST_Non_numerics'], x['Common_words_leven']),
                                        axis=1)

    #### -- Non numeric terms of Search term that are substrings of Product title | Description | Attribute
    ## -- PRODUCT TITLE
    # list of terms of search_term_tokens that are substrings of PT_lower
    df_train['Substrs_PT_x'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_Non_numerics'], x['PT_lower']),
        axis=1)

    # Number of terms of search_term_tokens that are substrings of product_title_lower
    df_train['N_substrs_PT_x'] = df_train['Substrs_PT_x'].apply(
        lambda x: len(x))

    ## -- PRODUCT DESCRIPTION
    # list of terms of search_term_tokens that are substrings of PD_lower
    df_train['Substrs_PD_x'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_Non_numerics'], x['PD_lower']),
        axis=1)

    # Number of terms of search_term_tokens that are substrings of PD_lower
    df_train['N_substrs_PD_x'] = df_train['Substrs_PD_x'].apply(
        lambda x: len(x))

    ## -- PRODUCT ATTRIBUTES
    # list of terms of search_term_tokens that are substrings of Atrr_text
    df_train['Substrs_Atr_x'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_Non_numerics'], x['Atrr_text']),
        axis=1)

    # Number of terms of search_term_tokens that are substrings of Atrr_text
    df_train['N_substrs_Atr_x'] = df_train['Substrs_Atr_x'].apply(
        lambda x: len(x))

    # percentage of terms of search_term_tokens that are substrings of PT_lower or PD_lower or Atrr_text
    df_train['Perc_substrs_x'] = df_train.apply(
        lambda x: utils.perc_xxx(x['Substrs_PT_x'], x['Substrs_PD_x'], x[
            'Substrs_Atr_x'], x['ST_Non_numerics']),
        axis=1)

    ## -- PRODUCT TITLE
    # list of terms of ST_numerics that are substrings of PT_lower
    df_train['Substrs_PT_y'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_numerics'], x['PT_lower']), axis=1)

    # Number of terms of ST_numerics that are substrings of PT_lower
    df_train['N_substrs_PT_y'] = df_train['Substrs_PT_y'].apply(
        lambda x: len(x))

    ## -- PRODUCT DESCRIPTION
    # list of terms of ST_numerics that are substrings of PD_lower
    df_train['Substrs_PD_y'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_numerics'], x['PD_lower']), axis=1)

    # Number of terms of ST_numerics that are substrings of PD_lower
    df_train['N_substrs_PD_y'] = df_train['Substrs_PD_y'].apply(
        lambda x: len(x))

    ## -- PRODUCT ATTRIBUTES
    # list of terms of ST_numerics that are substrings of Atrr_text
    df_train['Substrs_Atr_y'] = df_train.apply(
        lambda x: utils.n_substrings(x['ST_numerics'], x['Atrr_text']), axis=1)

    # Number of terms of ST_numerics that are substrings of Atrr_text
    df_train['N_substrs_Atr_y'] = df_train['Substrs_Atr_y'].apply(
        lambda x: len(x))

    # percentage of terms of ST_numerics that are substrings of PT_lower or PD_lower or Atrr_text
    df_train['Perc_substrs_y'] = df_train.apply(
        lambda x: utils.perc_xxx(x['Substrs_PT_y'], x['Substrs_PD_y'], x[
            'Substrs_Atr_y'], x['ST_numerics']),
        axis=1)

    #### -- Levenshtein similarity between Search term & Product title
    # Levenshtein similarity between 'PT_text' & 'ST_text'
    df_train['Leven_sim_ST_PT'] = df_train.apply(
        lambda x: 1 - stringdist.levenshtein_norm(x['PT_text'], x['ST_text']),
        axis=1)

    #### -- Keywords of Description that appear in the Search term (with Levenshtein distance)
    # list of Description Keywords that appear in the 'ST_Non_numerics' with Levenshtein distance
    df_train['Keywords_leven'] = df_train.apply(
        lambda x: utils.common_words_leven(x['Keywords_Descr'], x[
            'ST_Non_numerics']),
        axis=1)

    # number of Description Keywords that appear in the 'ST_Non_numerics' with Levenshtein distance
    df_train['N_keywords_leven'] = df_train['Keywords_leven'].apply(
        lambda x: len(x))

    # keep only those columns
    df_train2 = df_train[[
        'product_uid', 'N_numerics_PT', 'N_numerics_ST', 'N_non_numerics_PT',
        'N_non_numerics_ST', 'N_common_words_leven', 'JC_sim',
        'N_substrs_PT_x', 'N_substrs_PD_x', 'N_substrs_Atr_x',
        'Perc_substrs_x', 'N_substrs_PT_y', 'N_substrs_PD_y',
        'N_substrs_Atr_y', 'Perc_substrs_y', 'Leven_sim_ST_PT',
        'N_keywords_leven', 'relevance'
    ]]

    return df_train, df_train2
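
utils.n_substrings is not shown; judging by how it is used, it presumably returns the search-term tokens that occur as substrings of the target text. A minimal sketch under that assumption:

def n_substrings(tokens, text):
    # Tokens that appear as substrings of the target text (empty list if the text is missing/NaN)
    if not isinstance(text, str):
        return []
    return [t for t in tokens if t in text]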
Example #20
def get_leven(x):
    try:
        return 1 - stringdist.levenshtein_norm(' '.join(x['Atrr_stem']), x['ST_text'])
    except:
        # print('error')
        return 0
Example #21
 def test_levenshtein_norm_matching(self):
     """It should return right normalized dist when strings match"""
     self.assertEqual(levenshtein_norm('abcde', 'abcde'), 0)
Example #22
def evaluateMetadata(folder, golden):
    jsonfiles = [f for f in os.listdir(folder) if f.endswith('.json')]
    number_of_files = len(
        jsonfiles) + 1  #add 1 for use in range(1, number_of_files)

    rb = open_workbook(golden)
    sheet1 = rb.sheet_by_index(0)
    database = [[sheet1.cell_value(r, c) for c in range(sheet1.ncols)]
                for r in range(sheet1.nrows)]  #golden_database
    data = ""

    def fbase(index):
        return database[row][index]

    def fdata(index):
        return data[index]

    #title
    title = 0
    title_extracted = 0
    title_extracted_correctly = 0
    title_NLD = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        #print(filename)
        try:
            with open(filename) as f:
                data = json.load(f)
                try:
                    X = fbase(50).lower().strip()
                except:
                    X = ""
                try:
                    Y = "".join(fdata("dc.title")).lower().strip()
                except:
                    Y = ""
                for p in string.punctuation + "–" + "—":
                    p = "\\" + str(p)
                    X = re.sub(p, " ", re.sub("\s+", " ", X)).strip()
                    Y = re.sub(p, " ", re.sub("\s+", " ", Y)).strip()
                if X != "":
                    title += 1
                if Y != "":
                    title_extracted += 1
                if X != "" and Y != "" and stringdist.levenshtein_norm(
                        str(X).lower(),
                        str(Y).lower()) <= .1:
                    title_NLD += 1
                if X != "" and X == Y:
                    title_extracted_correctly += 1
        except:
            data = ""
    title_precision = title_extracted_correctly / title_extracted
    NLD_precision = title_NLD / title_extracted
    recall = title_extracted_correctly / title
    NLD_recall = title_NLD / title
    try:
        F1 = 2.0 * title_precision * recall / (title_precision + recall)
    except:
        F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except:
        NLD_F1 = 0.0
    title = [title_precision, NLD_precision, recall, NLD_recall, F1, NLD_F1]

    #abstract
    description = 0
    description_extracted = 0
    description_extracted_correctly = 0
    description_NLD = 0
    i = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
                X = fbase(16).lower()

                Y = "".join(fdata("dc.description.abstract")).lower()
                for p in string.punctuation:
                    p = "\\" + str(p)
                    X = re.sub(p, " ", re.sub("\s+", " ", X)).strip()
                    Y = re.sub(p, " ", re.sub("\s+", " ", Y)).strip()
                if X != "":
                    description += 1
                if Y != "":
                    description_extracted += 1
                if X != "" and stringdist.levenshtein_norm(str(X),
                                                           str(Y)) <= .1:
                    description_NLD += 1
                if X != "" and X == Y:
                    description_extracted_correctly += 1
        except:
            pass
    description_precision = description_extracted_correctly / description_extracted
    NLD_precision = description_NLD / description_extracted
    recall = description_extracted_correctly / description
    NLD_recall = description_NLD / description
    try:
        F1 = 2.0 * description_precision * recall / (description_precision +
                                                     recall)
    except:
        F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except:
        NLD_F1 = 0.0

    abstract = [
        description_precision, NLD_precision, recall, NLD_recall, F1, NLD_F1
    ]

    #editor
    editor = 0
    editor_extracted = 0
    edit_precision = 0.0
    edit_recall = 0.0
    n = 0
    NLD_precision = 0
    NLD_recall = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
                X = str(fbase(5)).strip()
                if X != "":
                    editor += 1
                try:
                    Y = "".join(fdata("dc.contributor.editor")).strip()
                except:
                    Y = ""
                X_list = X.split("||")
                Y_list = Y.split("||")
                Common = list(set(X_list) & set(Y_list))

                if (len(Y_list) != 0):
                    edit_precision += len(Common) / len(Y_list)
                if (len(X_list) != 0):
                    edit_recall += len(Common) / len(X_list)

                #NLD_similarity calculation
                common_len = 0
                check_list = Y_list.copy()
                for i in range(len(X_list)):
                    for j in range(len(Y_list)):
                        if X_list[i] != '' and Y_list[j] != '':
                            if stringdist.levenshtein_norm(
                                    str(X_list[i]), str(Y_list[j])) <= .1:
                                Y_list[j] = ''  #added
                                common_len += 1
                                break
                Y_list = check_list.copy()
                if len(Y_list) != 0:
                    NLD_precision += common_len / len(Y_list)
                if len(X_list) != 0:
                    NLD_recall += common_len / len(X_list)
                if Y != "":
                    editor_extracted += 1
        except:
            pass
    edit_precision /= editor_extracted
    edit_recall /= editor
    NLD_precision /= editor_extracted
    NLD_recall /= editor
    try:
        edit_F1 = 2.0 * edit_precision * edit_recall / (edit_precision +
                                                        edit_recall)
    except:
        edit_F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except:
        NLD_F1 = 0.0
    editor = [
        edit_precision, NLD_precision, edit_recall, NLD_recall, edit_F1, NLD_F1
    ]

    #illustrator
    illustrator = 0
    illustrator_extracted = 0
    i = 0
    illus_precision = 0.0
    illus_recall = 0.0
    NLD_precision = 0.0
    NLD_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = fbase(6)
                Y = "".join(fdata("dc.contributor.illustrator"))

                X_list = X.split("||")
                Y_list = Y.split("||")

                Common = list(set(X_list) & set(Y_list))
                if (len(Y_list) != 0):
                    illus_precision += len(Common) / len(Y_list)
                if (len(X_list) != 0):
                    illus_recall += len(Common) / len(X_list)

                #NLD_similarity calculation
                common_len = 0
                check_list = Y_list.copy()
                for i in range(len(X_list)):
                    for j in range(len(Y_list)):
                        if X_list[i] != '' and Y_list[j] != '':
                            if stringdist.levenshtein_norm(
                                    str(X_list[i]), str(Y_list[j])) <= .1:
                                common_len += 1
                                Y_list[j] = ''  #added
                                break
                Y_list = check_list.copy()
                if len(Y_list) != 0:
                    NLD_precision += common_len / len(Y_list)
                if (len(X_list) != 0):
                    NLD_recall += common_len / len(X_list)

                if X != "":
                    illustrator += 1
                if Y != "":
                    illustrator_extracted += 1
            except:
                pass
    illus_precision /= illustrator_extracted
    illus_recall /= illustrator
    NLD_precision /= illustrator_extracted
    NLD_recall /= illustrator
    try:
        illus_F1 = 2.0 * illus_precision * illus_recall / (illus_precision +
                                                           illus_recall)
    except:
        illus_F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except:
        NLD_F1 = 0.0
    illustrator = [
        illus_precision, NLD_precision, illus_recall, NLD_recall, illus_F1,
        NLD_F1
    ]

    #isbn
    isbn = 0
    isbn_extracted = 0
    isbn_extracted_correctly = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = database[i][22]
                Y = "".join(fdata("dc.identifier.isbn"))
                if X != "":
                    isbn += 1
                if Y != "":
                    isbn_extracted += 1
                if X != "" and str(int(X)) == str(Y):
                    isbn_extracted_correctly += 1
            except:
                pass
    isbn_precision = isbn_extracted_correctly / isbn_extracted
    recall = isbn_extracted_correctly / isbn
    try:
        F1 = 2.0 * isbn_precision * recall / (isbn_precision + recall)
    except:
        F1 = 0.0
    isbn = [isbn_precision, recall, F1]

    #copyright
    copyright = 0
    copyright_extracted = 0
    copyright_extracted_correctly = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = int(fbase(7))
                Y = fdata("dc.date.copyright")
                if X != "":
                    copyright += 1
                if Y != "":
                    copyright_extracted += 1
                if X != "" and str(int(X)) == str(Y):
                    copyright_extracted_correctly += 1
            except:
                pass
    copyright_precision = copyright_extracted_correctly / copyright_extracted
    recall = copyright_extracted_correctly / copyright
    try:
        F1 = 2.0 * copyright_precision * recall / (copyright_precision +
                                                   recall)
    except:
        F1 = 0.0
    copyright = [copyright_precision, recall, F1]

    #education Level
    Educational_level = 0
    Educational_level_extracted = 0
    Educational_level_extracted_correctly = 0
    i = 0
    for i in range(1, number_of_files):
        #i+=1
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
                X = fbase(57).lstrip('"')
                Y = fdata("dcterm.educationlevel")
                X = str(X).lower()
                Y = str(Y).lower()
                if X != "":
                    Educational_level += 1
                if Y != "":
                    Educational_level_extracted += 1
                if X is not None and X == Y:
                    Educational_level_extracted_correctly += 1
        except:
            pass
    Educational_level_precision = Educational_level_extracted_correctly / Educational_level_extracted
    recall = Educational_level_extracted_correctly / Educational_level
    try:
        F1 = 2.0 * Educational_level_precision * recall / (
            Educational_level_precision + recall)
    except:
        F1 = 0.0
    education_level = [Educational_level_precision, recall, F1]

    #DDC
    DDC = 0
    DDC_extracted = 0
    i = 1
    ddc_precision = 0.0
    ddc_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename, 'r') as f:
                data = json.load(f)
                X = fbase(43).lstrip('"')
                Y = fdata("dc.subject.ddc")
                X = str(X).lower()
                Y = str(Y).lower()

                #print ("DDC1: " + str(X_list) +  str(Y_list))
                X = re.findall("level1.*?{.*?}", X)[0]  #Take only level 1 DDC
                Y = re.findall("level1.*?{.*?}", Y)[0]  #Take only level 1 DDC
                #X1=re.findall("level1.*?{.*?}",X)[0]
                #Y1=re.findall("level1.*?{.*?}",Y)[0]
                #X2=re.findall("level2.*?{.*?}",X)[0]
                #Y2=re.findall("level2.*?{.*?}",Y)[0]
                #X3=re.findall("level3.*?{.*?}",X)[0]
                #Y3=re.findall("level3.*?{.*?}",Y)[0]

                X_list = re.findall(r"\d00", X)
                Y_list = re.findall(r"\d00", Y)

                #X1_list=re.findall(r"\d00",X1)
                #Y1_list=re.findall(r"\d00",Y1)
                #X2_list=re.findall(r"\d00",X2)
                #Y2_list=re.findall(r"\d00",Y2)
                #X3_list=re.findall(r"\d00",X3)
                #Y3_list=re.findall(r"\d00",Y3)
                #X_list = list(set(X1_list + X2_list + X3_list))
                #Y_list = list(set(Y1_list + Y2_list + Y3_list))

                #X_list=re.findall(r"'\d+': '.*?'",X)
                #Y_list=re.findall(r"'\d+': '.*?'",Y)

                Common = list(set(X_list) & set(Y_list))

                #print (database[i][0] + " DDC2--------->: " + str(X_list) +  str(Y_list))
                #print("Len of common, X_list, Y_list = " + str(len(Common)) + str(len(X_list)) + str(len(Y_list)))
                if (len(Y_list) != 0):
                    ddc_precision += len(Common) / len(Y_list)
                if (len(X_list) != 0):
                    ddc_recall += len(Common) / len(X_list)
                if len(X_list) != 0:
                    DDC += 1
                if len(Y_list) != 0:
                    DDC_extracted += 1
        except:
            pass
    ddc_precision /= DDC_extracted
    ddc_recall /= DDC
    try:
        ddc_F1 = 2.0 / (1.0 / ddc_precision + 1.0 / ddc_recall)
    except:
        ddc_F1 = 0.0
    ddc = [ddc_precision, ddc_recall, ddc_F1]

    #Contents
    content = 0
    content_extracted = 0
    content_extracted_correctly = 0
    cont_precision = 0.0
    cont_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
                X = fbase(14)
                Y = fdata("dc.description.toc")
                X = str(X).lower()
                Y = str(Y).lower()
                X = re.sub("x\d+", "", X)
                for p in string.punctuation + "–" + "—" + "‘":
                    if p is not ",":
                        p = "\\" + str(p)
                        X = re.sub(p, " ", str(X).lower().strip())
                        Y = re.sub(p, " ", str(Y).lower().strip())
                X = re.sub("\s+", " ", X)
                Y = re.sub("\s+", " ", Y)

                X_list = X.split(',')
                Y_list = Y.split(',')
                X_list_1 = list()
                Y_list_1 = list()
                for elem in X_list:
                    elem = elem.strip()
                    X_list_1.append(elem)

                for elem in Y_list:
                    elem = elem.strip()
                    Y_list_1.append(elem)

                X_list = X_list_1
                Y_list = Y_list_1

                #print("FINAL CONTENTS ***-------------------------> (X, Y):")
                #print(X_list)
                #print(Y_list)
                Common = list(set(X_list) & set(Y_list))
                #print("LEN CONTENTS: Common: " + str(len(Common)) + ", X_list:" + str(len(X_list)) + ", Y_list: " + str(len(Y_list)))
                if (Y != ''):
                    cont_precision += len(Common) / len(Y_list)
                if (X != ''):
                    cont_recall += len(Common) / len(X_list)
                if X != '':
                    content += 1
                if Y != '':
                    content_extracted += 1
                if X is not None and str(X) == str(Y):
                    content_extracted_correctly += 1
        except:
            pass

    #print("Content extracted exactly correct = " + str(content_extracted_correctly) + ", total extracted TOCs = " + str(content_extracted) + ", total TOCs = " + str(content))
    #print("Content precision total = " + str(cont_precision) + ", Content recall total = " + str(cont_recall))
    cont_precision /= content_extracted
    cont_recall /= content
    try:
        cont_F1 = 2.0 * cont_precision * cont_recall / (cont_precision +
                                                        cont_recall)
    except:
        cont_F1 = 0.0
    content = [cont_precision, cont_recall, cont_F1]
    #print("Contents eval: " + str(content))

    # np.float was removed in recent NumPy releases; the builtin float is equivalent here
    perfMatrix1 = np.array([title, abstract, editor, illustrator], float)
    perfMatrix2 = np.array([content, isbn, copyright, education_level, ddc],
                           float)
    #Return the matrices
    d = dict()
    d['matrix1'] = perfMatrix1
    d['matrix2'] = perfMatrix2
    d['error'] = False
    return d
Example #23
 def test_levenshtein_norm_substitution(self):
     """It should return right normalized dist when substitution involved"""
     self.assertEqual(levenshtein_norm('abcd!', 'abcde'), 0.2)
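
The values these tests assert are easy to reproduce interactively (assuming the stringdist package is installed):

import stringdist

print(stringdist.levenshtein_norm('abcde', 'abcde'))  # 0.0 -- identical strings
print(stringdist.levenshtein_norm('abcd!', 'abcde'))  # 0.2 -- one substitution out of five characters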