Exemplo n.º 1
0
def calculate_total_similarity(lsi, word2id=None, context=None, blanks=None):
    """Average cosine similarity between blank candidates and everything else.

    For every candidate word of every blank that is present in ``word2id``,
    accumulate its cosine similarity (via ``mathutils.cossim``) against the
    candidates of all *other* blanks and against every context word, then
    return the mean over all compared pairs.

    Args:
        lsi: trained LSI model; word vectors are read from ``lsi.projection.u``.
        word2id: mapping from word to its row index in the projection matrix.
        context: list of context words.
        blanks: list of blanks, each a list of candidate words.

    Returns:
        Mean cosine similarity, or 0.0 when no pair could be compared.
    """
    # Avoid mutable default arguments (shared across calls in the original).
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    score = 0.0
    cnt = 0.0
    for i, blank in enumerate(blanks):
        for em in blank:
            if em not in word2id:  # `has_key` was removed in Python 3
                continue
            em_vec = lsi.projection.u[int(word2id[em])]

            # Compare against candidates of every *other* blank.
            for j, other in enumerate(blanks):
                if j == i:
                    continue
                for em2 in other:
                    if em2 in word2id:
                        cnt += 1
                        word_vec = lsi.projection.u[int(word2id[em2])]
                        score += mathutils.cossim(em_vec, word_vec)

            # Compare against the surrounding context words.
            for word in context:
                if word in word2id:
                    cnt += 1
                    word_vec = lsi.projection.u[int(word2id[word])]
                    score += mathutils.cossim(em_vec, word_vec)

    if cnt != 0:
        score = score / cnt
    return score
Exemplo n.º 2
0
def calculate_total_similarity_with_rake(lsi, text, word2id=None, blanks=None):
    """Weighted similarity between blank candidates and RAKE keywords.

    Runs RAKE (module-level ``rake_obj``) over *text*, tokenizes each
    extracted key phrase, and scores every in-vocabulary blank candidate
    against every keyword token, weighting each cosine similarity by the
    RAKE score of the phrase the token came from.

    Args:
        lsi: trained LSI model; word vectors are read from ``lsi.projection.u``.
        text: raw text to extract keywords from.
        word2id: mapping from word to its row index in the projection matrix.
        blanks: list of blanks, each a list of candidate words.

    Returns:
        Mean weighted cosine similarity, or 0.0 when nothing was comparable.
    """
    # Avoid mutable default arguments.
    word2id = {} if word2id is None else word2id
    blanks = [] if blanks is None else blanks

    score = 0.0
    cnt = 0.0

    # RAKE yields (phrase, weight) pairs; index every token of a phrase by
    # that phrase's weight. Tokens shared between phrases keep the weight of
    # the last phrase seen (same as the original loop).
    keyword_dict = {}
    for phrase, weight in rake_obj.run(text):
        for token in nltk.word_tokenize(phrase):
            keyword_dict[token] = weight

    for blank in blanks:
        for em in blank:
            if em not in word2id:
                continue
            em_vec = lsi.projection.u[int(word2id[em])]
            for key, weight in keyword_dict.items():
                if key in word2id:
                    cnt += 1
                    score += mathutils.cossim(em_vec, lsi.projection.u[int(word2id[key])]) * weight

    if cnt != 0:
        score = score / cnt
    return score
Exemplo n.º 3
0
def calculate_total_similarity_by_k_max(lsi, word2id=None, context=None, blanks=None, k=2):
    """Mean of the *k* largest pairwise similarities.

    Collects the cosine similarity of every in-vocabulary blank candidate
    against the candidates of all other blanks and against the context
    words, then averages only the ``k`` largest similarities found.
    Out-of-vocabulary words are recorded in the module-level ``unseen_word``
    set.

    Args:
        lsi: trained LSI model; word vectors are read from ``lsi.projection.u``.
        word2id: mapping from word to its row index in the projection matrix.
        context: list of context words.
        blanks: list of blanks, each a list of candidate words.
        k: number of top similarities to average.

    Returns:
        Mean of the top-k similarities, or 0.0 when none were collected.
    """
    # Avoid mutable default arguments.
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    score = 0.0
    sims = []  # heapq.nlargest accepts any iterable; no heappush needed
    for i, blank in enumerate(blanks):
        for em in blank:
            if em not in word2id:
                unseen_word.add(em)
                continue
            em_vec = lsi.projection.u[int(word2id[em])]

            # Similarities against candidates of every *other* blank.
            for j, other in enumerate(blanks):
                if j == i:
                    continue
                for em2 in other:
                    if em2 in word2id:
                        sims.append(mathutils.cossim(em_vec, lsi.projection.u[int(word2id[em2])]))
                    else:
                        unseen_word.add(em2)

            # Similarities against the context words.
            for word in context:
                if word in word2id:
                    sims.append(mathutils.cossim(em_vec, lsi.projection.u[int(word2id[word])]))
                else:
                    unseen_word.add(word)

    # nlargest returns at most k items, so the original's `len(kmax) > k`
    # branch and the `if i < k ... else break` guard were dead code: the
    # result is simply the mean of whatever top-k items exist.
    kmax = heapq.nlargest(k, sims)
    if kmax:
        score = sum(kmax) / len(kmax)
    return score
Exemplo n.º 4
0
def calculate_total_similarity_with_combination(lsi, word2id=None, context=None, blanks=None, alpha=0.5):
    """Alpha-weighted combination of per-blank context similarities.

    Each blank's in-vocabulary candidate vectors (looked up via the
    module-level ``my_dic``/``model``, NOT the ``lsi``/``word2id`` arguments,
    which are kept only for interface compatibility) are combined into one
    normalized vector by ``mathutils.combine_and_normalize``; that vector is
    scored against the context words, and the per-blank scores are blended
    with ``alpha`` when there are two blanks.

    Args:
        lsi: unused; retained for signature compatibility with siblings.
        word2id: unused; retained for signature compatibility with siblings.
        context: list of context words.
        blanks: list of blanks, each a list of candidate words.
        alpha: weight of the first blank's score when two blanks scored.

    Returns:
        Combined score, or 0.0 when no blank produced a usable vector.
    """
    # Avoid mutable default arguments.
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    score = 0.0

    # Build one combined, normalized vector per blank from the candidates
    # that exist in the embedding vocabulary.
    blank_vecs = []
    for blank in blanks:
        vectors = []
        for em in blank:
            if em.lower() in my_dic:
                vectors.append(model[em.lower()])
            else:
                unseen_word.add(em)
        if vectors:
            blank_vecs.append(mathutils.combine_and_normalize(vectors, len(vectors[0])))

    if not blank_vecs:
        return score

    # Average similarity of each blank vector against the context words.
    score_per_blank = []
    for vec in blank_vecs:
        cnt = 0
        temp_score = 0.0
        for word in context:
            if word.lower() in my_dic:
                cnt += 1
                temp_score += mathutils.cossim(vec, model[word.lower()])
            else:
                unseen_word.add(word)
        if cnt != 0:
            score_per_blank.append(temp_score / cnt)

    if len(score_per_blank) == 2:
        score = alpha * score_per_blank[0] + (1 - alpha) * score_per_blank[1]
    elif len(score_per_blank) == 1:
        score = score_per_blank[0]
    return score