def demo(scorer=None, compare_scorer=None):
    """Find bigram collocations in the files of the WebText corpus.

    For each WebText file this prints the file id, the 15 best bigrams
    according to ``scorer``, and the Spearman rank correlation between the
    ranking produced by ``scorer`` and the one produced by ``compare_scorer``.

    :param scorer: association measure used to rank bigrams; defaults to
        ``BigramAssocMeasures.likelihood_ratio``.
    :param compare_scorer: association measure whose ranking is correlated
        with ``scorer``'s; defaults to ``BigramAssocMeasures.raw_freq``.
    """
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    # The filter runs once per candidate word, so keep the stopwords in a set
    # instead of scanning the list on every lookup.
    ignored_words = set(stopwords.words('english'))
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(
                ranks_from_scores(cf.score_ngrams(scorer)),
                ranks_from_scores(cf.score_ngrams(compare_scorer)))))
def demo(scorer=None, compare_scorer=None):
    """Print bigram collocations for every file of the WebText corpus.

    Per file, the output is the file id, the top 15 bigrams under ``scorer``
    and the Spearman correlation between the ``scorer`` ranking and the
    ``compare_scorer`` ranking.

    :param scorer: measure used for ranking; ``None`` selects
        ``BigramAssocMeasures.likelihood_ratio``.
    :param compare_scorer: measure used for the comparison ranking; ``None``
        selects ``BigramAssocMeasures.raw_freq``.
    """
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores
    from nltk.corpus import stopwords, webtext

    scorer = BigramAssocMeasures.likelihood_ratio if scorer is None else scorer
    compare_scorer = (
        BigramAssocMeasures.raw_freq if compare_scorer is None else compare_scorer
    )

    ignored_words = stopwords.words('english')

    def word_filter(w):
        # Reject very short tokens and English stopwords.
        if len(w) < 3:
            return True
        return w.lower() in ignored_words

    for fileid in webtext.fileids():
        lowered = [token.lower() for token in webtext.words(fileid)]

        finder = BigramCollocationFinder.from_words(lowered)
        finder.apply_freq_filter(3)
        finder.apply_word_filter(word_filter)

        print(fileid)

        best = finder.nbest(scorer, 15)
        print('\t', [' '.join(pair) for pair in best])

        primary_ranks = ranks_from_scores(finder.score_ngrams(scorer))
        reference_ranks = ranks_from_scores(finder.score_ngrams(compare_scorer))
        correlation = spearman_correlation(primary_ranks, reference_ranks)
        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, correlation))
def test(self, inFile, embFile="emb_art_10.npy"):
    """Score the embeddings against a word-similarity CSV.

    Rows of ``inFile`` are unpacked as ``(word_a, word_b, score)``; only pairs
    whose two words are both in ``self.data.word2id`` are kept. For each kept
    pair the cosine similarity of the two embedding rows is computed and then
    compared with the scores from the file.

    Side effects: sets ``self.cos_dict`` (keyed by word pair),
    ``self.cos_dict_id`` (keyed by id pair) and ``self.embeddings``, and
    prints both correlation results.

    :param inFile: path of the CSV read with ``pd.read_csv``.
    :param embFile: ``.npy`` file loaded when ``self.skip_gram_model.v_embeddings``
        is falsy.
    :return: first element of the ``spearmanr`` result.
    """
    self.cos_dict = dict()
    self.cos_dict_id = dict()

    # 1. Read the similarity pairs and keep those fully inside the vocabulary.
    rows = np.array(pd.read_csv(inFile))
    word2id = self.data.word2id
    idsim = dict()
    for word_a, word_b, num in rows:
        if word_a in word2id and word_b in word2id:
            idsim[(word2id[word_a], word2id[word_b])] = num

    # 2. Choose the embedding matrix (no normalisation is applied here).
    if not self.skip_gram_model.v_embeddings:
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load embedding files from a trusted source.
        self.embeddings = np.load(embFile, allow_pickle=True)
    else:
        self.embeddings = self.skip_gram_model.v_embeddings.weight.cpu().data.numpy()

    # 3. Cosine similarity per pair, collected next to the reference score.
    id2word = self.data.id2word
    reference_scores = []
    model_scores = []
    for (id_a, id_b), value in idsim.items():
        embeddings_a = self.embeddings[id_a].reshape(1, -1)
        embeddings_b = self.embeddings[id_b].reshape(1, -1)
        # ndarray.item() replaces np.asscalar, which was removed from NumPy.
        similarity = cosine_similarity(embeddings_a, embeddings_b)[0].item()
        self.cos_dict[(id2word[id_a], id2word[id_b])] = similarity
        self.cos_dict_id[id_a, id_b] = similarity
        reference_scores.append(value)
        model_scores.append(similarity)

    def to_ranks(scores):
        # Map each key to its position when ordered by descending score.
        ordered = sorted(scores, key=scores.get, reverse=True)
        return {key: rank for rank, key in enumerate(ordered)}

    # spearman_correlation is fed ranks rather than raw scores; ties are
    # broken by insertion order, so this can differ slightly from spearmanr.
    # NOTE(review): assumes this is NLTK's rank-based spearman_correlation.
    print("Spearman Coefficient:",
          spearman_correlation(to_ranks(self.cos_dict_id), to_ranks(idsim)))

    spear = spearmanr(reference_scores, model_scores)
    print(spear)
    return (spear[0])
def wordsim353_spearman(self, input_filename):
    """Correlate model scores with the human scores in a word-pair CSV.

    The first row of ``input_filename`` is skipped as a header. Every other
    non-empty row supplies a target word, a context word and a human score
    (columns 0, 1 and 2). Each pair is scored with
    ``self.calculate_probability``; both score lists are ranked with
    ``ss.rankdata`` and the ranks, keyed by pair index, are passed to
    ``spearman.spearman_correlation``.

    :param input_filename: path of the comma-separated pairs file.
    :return: the value returned by ``spearman.spearman_correlation``.
    :raises Exception: if a target or context word is missing from
        ``self.data.word2id``.
    """
    target_word = []
    context_word = []
    human_scores = []

    # newline='' lets the csv module do its own newline handling.
    with open(input_filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header row, if there is one
        for row in csv_reader:
            if not row:
                # Blank lines come back as empty lists; ignore them.
                continue
            target_word.append(row[0])
            context_word.append(row[1])
            human_scores.append(float(row[2]))

    # Validate the whole file before doing any scoring.
    vocab = self.data.word2id
    for target, context in zip(target_word, context_word):
        if target not in vocab:
            raise Exception('Target word not in model vocab: ', target)
        if context not in vocab:
            raise Exception('Context word not in model vocab: ', context)

    human_rankings = ss.rankdata(human_scores)

    machine_scores = [
        self.calculate_probability(target, context)
        for target, context in zip(target_word, context_word)
    ]
    machine_rankings = ss.rankdata(machine_scores)

    # spearman_correlation is given {pair index: rank} mappings.
    human_scores_dict = dict(enumerate(human_rankings))
    machine_scores_dict = dict(enumerate(machine_rankings))

    return spearman.spearman_correlation(human_scores_dict, machine_scores_dict)
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk import FreqDist
from nltk import bigrams
from nltk.metrics import spearman

# Script: POS-tag the cells of court-V-N.csv, score bigrams of the tagged
# tokens with PMI and Student's t, write the first ten infinitive-initial
# bigrams of each ranking to a text file, and print the rank correlation of
# the two top-ten lists.

analyzer = MorphAnalyzer()
corpus = pd.read_csv("court-V-N.csv", header=None)
measures = BigramAssocMeasures()


def tagger(x):
    """Return ``(x, POS)`` using the first parse of the lowercased, stripped token."""
    return (x, analyzer.parse(x.lower().strip())[0].tag.POS)


# Tag every cell, then discard column 0.
tagged_corpus = corpus.applymap(tagger).drop(0, axis=1)

with open("gold_standard.txt", "r") as gold_file:
    standard = [tuple(line.split()) for line in gold_file]

# Flatten once; both frequency distributions are built from the same tokens.
tokens = tagged_corpus.values.flatten()
wfd = FreqDist(tokens)
bfd = FreqDist(bigrams(tokens))
finder_1 = BigramCollocationFinder(wfd, bfd)


def infinitive_bigrams(scored):
    """Keep bigrams whose first token is tagged "INFN"; return word-only tuples.

    ``scored`` holds ``(bigram, score)`` pairs where each bigram element is a
    ``(word, POS)`` tuple as produced by ``tagger``.
    """
    return [
        tuple(tagged[0] for tagged in bigram)
        for bigram, _score in scored
        if bigram[0][1] == "INFN"
    ]


scored_pmi = infinitive_bigrams(finder_1.score_ngrams(measures.pmi))
scored_student = infinitive_bigrams(finder_1.score_ngrams(measures.student_t))
pmi_top = scored_pmi[:10]
student_top = scored_student[:10]

for name, top in [("pmi_top10.txt", pmi_top), ("student_top10.txt", student_top)]:
    with open(name, "w") as out_file:
        out_file.writelines(" ".join(words) + "\n" for words in top)

# spearman_correlation expects (key, rank) pairs. The raw lists of word
# tuples would be read as word -> word mappings, so rank each list by
# position first. Only bigrams present in both lists contribute.
print(spearman.spearman_correlation(
    spearman.ranks_from_sequence(pmi_top),
    spearman.ranks_from_sequence(student_top)))
print("Done")