import pandas as pd
from scipy.stats import pearsonr

# wni (WordNet path similarity), csm (custom hypernym/hyponym measure) and
# soc (SOC-PMI), together with preprocess, are project-local modules used
# throughout this listing.


def calculate_pearson_for_sts131(stemming, lowercase, stopwords,
                                 remove_notalpha, analyze_measure):
    stss131_df = pd.read_csv("./data/stss131.csv", sep=",")

    # Run through the dataset row by row and collect each pair's similarity score.
    list_of_similarity_values = []
    for s1, s2 in zip(stss131_df.Sentence1, stss131_df.Sentence2):
        if analyze_measure == "original":
            similarity_score = wni.calculate_path_similarity_for_sentences(
                sentence1=s1,
                sentence2=s2,
                stemming=stemming,
                lowercase=lowercase,
                stopwords=stopwords,
                remove_notalpha=remove_notalpha)
            print("Similarity score of the current sentence pair is:",
                  similarity_score)

        elif analyze_measure == "hyp-ed method":
            similarity_score = csm.count_custom_similarity_measure(
                sentence1=s1,
                sentence2=s2,
                stemming=stemming,
                lowercase=lowercase,
                stopwords=stopwords,
                remove_notalpha=remove_notalpha)
            print("Similarity score of the current sentence pair is:",
                  similarity_score)

        elif analyze_measure == "semantic text similarity method":
            sentence1_words = preprocess(s1, stemming, lowercase, stopwords,
                                         remove_notalpha)

            sentence2_words = preprocess(s2, stemming, lowercase, stopwords,
                                         remove_notalpha)

            # preprocess returns (token, tag) pairs; keep only the tokens.
            s1_word_list = [word[0] for word in sentence1_words]
            s2_word_list = [word[0] for word in sentence2_words]

            similarity_score = soc.soc_pmi(s1_word_list, s2_word_list)
            print("Similarity score of the current sentence pair is:",
                  similarity_score)

        else:
            raise ValueError("Unknown analyze_measure: " + analyze_measure)

        list_of_similarity_values.append(similarity_score)

    stss131_df["R"] = pd.Series(list_of_similarity_values)
    pearson_corr = pearsonr(stss131_df.X, stss131_df.R)
    print("Pearson correlation between Rater mean: {corr}, p-value {pval}".
          format(corr=pearson_corr[0], pval=pearson_corr[1]))
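

# A minimal usage sketch (an assumed driver, not part of the original module):
# evaluate one preprocessing configuration with the plain WordNet path measure.
if __name__ == "__main__":
    calculate_pearson_for_sts131(stemming=False,
                                 lowercase=True,
                                 stopwords=True,
                                 remove_notalpha=True,
                                 analyze_measure="original")
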
def calculate_path_similarity_for_sentences(sentence1, sentence2, stemming,
                                            lowercase, stopwords,
                                            remove_notalpha):
    sentence1_words = preprocess(sentence=sentence1,
                                 use_stemmer=stemming,
                                 use_lowercase=lowercase,
                                 use_stopwords=stopwords,
                                 remove_nonalpha=remove_notalpha)
    sentence2_words = preprocess(sentence=sentence2,
                                 use_stemmer=stemming,
                                 use_lowercase=lowercase,
                                 use_stopwords=stopwords,
                                 remove_nonalpha=remove_notalpha)

    sentence1_score = 0
    sentence2_score = 0

    for word1 in sentence1_words:
        max_score = 0
        word1_synsets = retrieve_synset_list_for_word(word1[0])
        for word2 in sentence2_words:
            word2_synsets = retrieve_synset_list_for_word(word2[0])
            score = calculate_path_similarity_for_synset_lists(
                word1_synsets, word2_synsets)
            if score > max_score:
                max_score = score
        sentence1_score += max_score

    for word2 in sentence2_words:
        max_score = 0
        word2_synsets = retrieve_synset_list_for_word(word2[0])
        for word1 in sentence1_words:
            word1_synsets = retrieve_synset_list_for_word(word1[0])
            score = calculate_path_similarity_for_synset_lists(
                word2_synsets, word1_synsets)
            if score > max_score:
                max_score = score
        sentence2_score += max_score

    # Average the best-match scores within each sentence, then average the two sentence means.
    sentence1_score_mean = sentence1_score / len(sentence1_words)
    sentence2_score_mean = sentence2_score / len(sentence2_words)
    sentence_similarity_score = (sentence1_score_mean +
                                 sentence2_score_mean) / 2
    return round(sentence_similarity_score, 2)
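

# The two WordNet helpers called above are not included in this snippet.
# A minimal sketch of plausible implementations on top of NLTK's WordNet;
# the names mirror the calls above, but the bodies are assumptions, not the
# original code.
from nltk.corpus import wordnet as wn


def retrieve_synset_list_for_word(word):
    # Every synset WordNet lists for this surface form.
    return wn.synsets(word)


def calculate_path_similarity_for_synset_lists(synsets1, synsets2):
    # Best path similarity over all synset pairs; path_similarity can return
    # None (e.g. across parts of speech), which counts as 0 here.
    best = 0
    for synset1 in synsets1:
        for synset2 in synsets2:
            score = synset1.path_similarity(synset2) or 0
            if score > best:
                best = score
    return best
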
def test_preprocessing_remove_stopwords():
    test_sentence = "This is A TEST SENtence, to Make Sure Lower works"
    result_sentence = ["TEST", "SENtence", "Make", "Sure", "Lower", "works"]
    preprocessed_sentence = swp.preprocess(sentence=test_sentence,
                                           use_stemmer=False,
                                           use_lowercase=False,
                                           use_stopwords=True,
                                           remove_nonalpha=False)
    preprocessed_sentence_just_words = [
        word[0] for word in preprocessed_sentence
    ]
    assert result_sentence == preprocessed_sentence_just_words


def test_preprocessing_tolower_remove_nonalpha():
    test_sentence = "This is A TEST SENtence, to Make Sure Lower works"
    result_sentence = [
        "this", "is", "a", "test", "sentence", "to", "make", "sure", "lower",
        "works"
    ]
    preprocessed_sentence = swp.preprocess(sentence=test_sentence,
                                           use_stemmer=False,
                                           use_lowercase=True,
                                           use_stopwords=False,
                                           remove_nonalpha=True)
    preprocessed_sentence_just_words = [
        word[0] for word in preprocessed_sentence
    ]
    assert result_sentence == preprocessed_sentence_just_words
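

# preprocess comes from the swp module and is not shown in this listing. The
# tests index each item with word[0], so it evidently returns (token, tag)
# pairs. Below is a minimal sketch consistent with the tests above, built on
# NLTK as an assumption rather than the original implementation.
from nltk import pos_tag
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer


def preprocess(sentence, use_stemmer, use_lowercase, use_stopwords,
               remove_nonalpha):
    # Word-only tokenization: the stopword test expects the trailing comma
    # to disappear even with remove_nonalpha=False.
    tokens = RegexpTokenizer(r"\w+").tokenize(sentence)
    if use_lowercase:
        tokens = [token.lower() for token in tokens]
    if use_stopwords:
        stop = set(nltk_stopwords.words("english"))
        tokens = [token for token in tokens if token.lower() not in stop]
    if remove_nonalpha:
        tokens = [token for token in tokens if token.isalpha()]
    if use_stemmer:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return pos_tag(tokens)  # a list of (token, POS-tag) tuples
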
    if args.measure == "original":
        logging.info("Calculating path similarity for sentences.")
        sentence_similarity_score = wni.calculate_path_similarity_for_sentences(sentence1=args.sentence1,
                                                                                sentence2=args.sentence2,
                                                                                stemming=args.stem,
                                                                                lowercase=args.lowercase,
                                                                                stopwords=args.stopwords,
                                                                                remove_notalpha=args.nonalpha)
        print("Sentence similarities: " + str(sentence_similarity_score))
    elif args.measure == "hyp-ed method":
        logging.info("Calculating path similarity for sentences using hypernyms and hyponyms.")
        sentence_similarity_score = csm.count_custom_similarity_measure(sentence1=args.sentence1,
                                                                        sentence2=args.sentence2,
                                                                        stemming=args.stem, lowercase=args.lowercase,
                                                                        stopwords=args.stopwords,
                                                                        remove_notalpha=args.nonalpha)
        print("Sentence similarities: " + str(sentence_similarity_score))
    elif args.measure == "semantic text similarity method":
        logging.info("Calculating path similarity for sentences using SOC PMI Alogrithm.")
        sentence1_words = preprocess(sentence=args.sentence1, use_stemmer=args.stem, use_lowercase=args.lowercase,
                                     use_stopwords=args.stopwords, remove_nonalpha=args.nonalpha)

        sentence2_words = preprocess(sentence=args.sentence2, use_stemmer=args.stem, use_lowercase=args.lowercase,
                                     use_stopwords=args.stopwords, remove_nonalpha=args.nonalpha)

        # preprocess returns (token, tag) pairs; keep only the tokens.
        s1_word_list = [word[0] for word in sentence1_words]
        s2_word_list = [word[0] for word in sentence2_words]

        sentence_similarity_score = soc.soc_pmi(s1_word_list, s2_word_list)
        print("Sentence similarities: " + str(sentence_similarity_score))