Example #1
0
def calc_similarity(quest, parag):
    # Score how well a paragraph covers the words of a question, returned as a
    # weighted fraction of matched question words. Relies on helpers assumed
    # to be defined elsewhere in the project: TextNormalizer, syn_stems (pairs
    # of synonym stems), w2v (word -> vector mapping) and v_cosine.
    quest_words = set(TextNormalizer.tokenize_words(quest))

    parag_words = set(TextNormalizer.tokenize_words(parag))
    parag_lemmas = set(TextNormalizer.tokenize_lemmas(parag))
    parag_stems = set(TextNormalizer.tokenize_stems(parag))
    parag_crops = set(TextNormalizer.tokenize_crops(parag))

    matched_parag_words = set()

    sim = 0.0
    for qword in quest_words:
        # Tier 1: exact word match.
        if qword in parag_words:
            matched_parag_words.add(qword)
            sim += 1.0
        else:
            # Tier 2: lemma match.
            qlemma = TextNormalizer.lemmatize_word(qword)
            if qlemma in parag_lemmas:
                sim += 1.0
            else:
                # Tier 3: stem match, slightly discounted.
                qstem = TextNormalizer.stem_word(qword)
                if qstem in parag_stems:
                    sim += 0.95
                else:
                    # Tier 4: cropped-form match.
                    qcrop = TextNormalizer.crop_word(qword)
                    if qcrop in parag_crops:
                        sim += 0.80
                    else:
                        # Tier 5: synonym match on stems.
                        found_syn = False
                        for pstem in parag_stems:
                            if (qstem, pstem) in syn_stems:
                                sim += 0.70
                                found_syn = True
                                break

                        if not found_syn:
                            # Tier 6: best word2vec cosine with any paragraph word.
                            if qword in w2v:
                                qvec = w2v[qword]
                                max_cos = None
                                for pword in parag_words:
                                    if pword in w2v:
                                        pvec = w2v[pword]
                                        c = v_cosine(qvec, pvec)
                                        max_cos = c if max_cos is None else max(max_cos, c)

                                # Add the contribution only if at least one
                                # paragraph word had a vector.
                                if max_cos is not None:
                                    sim += max_cos * 0.5

    return sim / len(quest_words) if quest_words else 0.0
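
The v_cosine helper is not shown above; a minimal sketch, assuming the word vectors are dense numpy arrays, could look like this:

import numpy as np

def v_cosine(a, b):
    # Cosine similarity of two dense vectors; returns 0.0 for a zero-norm
    # vector to avoid division by zero.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom > 0.0 else 0.0
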
Example #2
0
def filter_NEs(tokens):
    # Keep only named-entity tokens (as judged by is_NE, assumed to be defined
    # elsewhere) and return their cropped forms.
    return [TextNormalizer.crop_word(word) for word in filter(is_NE, tokens)]
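
A minimal usage sketch; is_NE is not defined in the example, so a hypothetical capitalization-based stand-in is used here:

def is_NE(word):
    # Hypothetical stand-in for the project's named-entity check:
    # treat capitalized tokens as named entities.
    return word[:1].isupper()

print(filter_NEs(["Alan", "Turing", "visited", "Manchester"]))
# -> cropped forms of "Alan", "Turing", "Manchester"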