def get_ic(ic_str):
    """Resolve *ic_str* into an NLTK WordNet information-content (IC) object.

    ``ic_str`` may be:
      * a path to a directory of ``.txt`` / ``.csv.gz`` files — IC is
        computed from their tokenized contents;
      * a path to a WordNet ``*.dat`` IC file;
      * the literal string ``"brown"`` or ``"web"`` — IC from the matching
        NLTK corpus (downloaded on demand).

    Raises NotImplementedError for anything else.
    Assumes module-level ``os`` and ``wn`` (nltk wordnet) are in scope.
    """

    def into_corpus(dirstr):
        # Local imports keep these heavyweight deps out of module import time.
        import pandas as pd
        from nltk.tokenize import word_tokenize
        from nltk.corpus.reader.api import concat

        def walk_files(root):
            # Yield every file path under root.
            for dirpath, _dirnames, filenames in os.walk(root):
                for fname in filenames:
                    yield os.path.join(dirpath, fname)

        def chunks(fp):
            if fp.endswith(".txt"):
                print("reading in text file as document: " + fp)
                with open(fp) as f:
                    return [f.read()]
            elif fp.endswith(".csv.gz"):
                print("reading in file as tweet documents: " + fp)
                return pd.read_csv(fp, compression="gzip").message.values
            # NOTE(review): any other extension falls through and returns
            # None, which would break the concat below — assumed the
            # directory only holds .txt / .csv.gz files.

        return concat(
            word_tokenize(c) for f in walk_files(dirstr) for c in chunks(f))

    if os.path.exists(ic_str):
        if os.path.isdir(ic_str):
            print("Assuming path leads to EITHER txt or twitter csv.gz files")
            return wn.ic(into_corpus(ic_str), False, 0.)
        elif ic_str.endswith(".dat"):
            # assume this is a wordnet corpus.
            return wn.WordNetICCorpusReader(ic_str)
        else:
            raise NotImplementedError
    elif isinstance(ic_str, str):
        if ic_str == "brown":
            try:
                from nltk.corpus import brown
            except LookupError:
                import nltk
                nltk.download('brown')
                # BUG FIX: previously re-imported ``webtext`` here, leaving
                # ``brown`` undefined after a fresh download.
                from nltk.corpus import brown
            return wn.ic(brown, False, 0.)
        elif ic_str == "web":
            try:
                from nltk.corpus import webtext
            except LookupError:
                import nltk
                nltk.download('webtext')
                from nltk.corpus import webtext
            return wn.ic(webtext, False, 0.)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
def print_semantic_similarity(word1, word2):
    """Print a battery of WordNet similarity measures for the FIRST synset
    of each word.

    Assumes module-level ``wn``, ``wordnet_ic`` and ``genesis`` are in
    scope. Raises IndexError if either word has no synsets.
    """
    # Hoist the repeated first-synset lookups — the original re-queried
    # WordNet (wn.synsets(word)[0]) for every single metric.
    s1 = wn.synsets(word1)[0]
    s2 = wn.synsets(word2)[0]

    print('Printing path similarity')
    print(str(s1.path_similarity(s2)))
    print('Printing Leacock-Chodorow similarity')
    print(str(s1.lch_similarity(s2)))
    print('Printing Wu-Palmer similarity')
    print(str(s1.wup_similarity(s2)))

    # The remaining measures require an information-content dictionary.
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    genesis_ic = wn.ic(genesis, False, 0.0)

    print('Printing Resnik similarity')
    print('Brown information content: ' + str(
        s1.res_similarity(s2, brown_ic)))
    print('Genesis information content: ' + str(
        s1.res_similarity(s2, genesis_ic)))
    print('Printing Jiang-Conrath similarity')
    print('Brown information content: ' + str(
        s1.jcn_similarity(s2, brown_ic)))
    print('Genesis information content: ' + str(
        s1.jcn_similarity(s2, genesis_ic)))
    print('Printing Lin similarity')
    print('Semcor information content: ' + str(
        s1.lin_similarity(s2, semcor_ic)))
def run(self):
    """Run the four-stage thesaurus build: extract words from the input
    file, prepare WordNet IC, build the thesaurus, and write it to
    ``self.outputFile``.
    """
    ## start operations
    stime = time.time()
    self.printer.mainTitle("Thesauto - automated creation of a thesaurus using WordNet")

    ## read file
    self.printer.stage(1, 4, "Extracting words from input file")
    self.printer.info("File: " + basename(self.inputFile))
    words = self.extractwords()
    self.printer.lines(words, max=20, title="-- Extracted " + str(len(words)) + " words --")

    ## prepare WordNet IC
    self.printer.stage(2, 4, "Preparing WordNet IC (Information Content)")
    if not self.database:
        IC = wordnet.ic(brown, False, 1.0)
    else:
        # A base thesaurus database is provided — no IC needed.
        IC = None
        self.printer.info("Using base WordNet thesaurus as a database instead. Skipped.")
        self.printer.info("Base: " + basename(self.database))

    ## create a thesaurus for each set of words
    self.printer.stage(3, 4, "Building thesaurus")
    thesaurus = self.buildWordnetThesaurus(words, IC)
    self.printer.lines(thesaurus, max=10, line_max=75, title="-- Built thesaurus --")

    ## save the final thesaurus
    self.printer.stage(4, 4, "Saving full thesaurus")
    # BUG FIX: the original ``open(...).write(...)`` leaked the file
    # handle; ``with`` guarantees flush + close. Also stop shadowing the
    # builtin ``set`` in the comprehension.
    with open(self.outputFile, 'w') as out:
        out.write(''.join(self.setToString(entry) for entry in thesaurus))
    self.printer.info(self.outputFile + " written.")

    etime = time.time()
    self.printer.info("Execution took " + str(etime - stime) + " seconds\n")
def get_sim(word1, word2, similarity='path', combine='max'):
    """Combined WordNet similarity over all synset pairs of two words.

    similarity: 'path' | 'lch' | 'wup' | 'res' | 'jcn' | 'lin'
                ('res'/'jcn'/'lin' build an IC from the Reuters corpus)
    combine:    'max' | 'mean' | 'min'; any other value returns 0
                (preserved from the original fall-through).

    Raises ValueError for an unknown *similarity* — the original crashed
    with an accidental NameError on the unbound ``vals``.
    """
    s1 = wn.synsets(word1)
    s2 = wn.synsets(word2)

    if similarity in ('res', 'jcn', 'lin'):
        # The IC-based measures all use the same corpus; build the IC once
        # instead of duplicating the import/ic lines in each branch.
        from nltk.corpus import reuters
        ic = wn.ic(reuters, False, 0.0)
        measure = {
            'res': lambda x, y: x.res_similarity(y, ic),
            'jcn': lambda x, y: x.jcn_similarity(y, ic),
            'lin': lambda x, y: x.lin_similarity(y, ic),
        }[similarity]
    elif similarity in ('path', 'lch', 'wup'):
        measure = {
            'path': lambda x, y: x.path_similarity(y),
            'lch': lambda x, y: x.lch_similarity(y),
            'wup': lambda x, y: x.wup_similarity(y),
        }[similarity]
    else:
        raise ValueError("unknown similarity: " + repr(similarity))

    vals = np.array([measure(x, y) for x, y in product(s1, s2)],
                    dtype=float)

    if combine == 'max':
        return np.nanmax(vals)
    elif combine == 'mean':
        return np.nanmean(vals)
    elif combine == 'min':
        return np.nanmin(vals)
    return 0
def get_corpus_and_ic(set_of_strings, file_corpus_output):
    """Write the tokens one-per-line to *file_corpus_output*, load the file
    as a plaintext corpus, and return ``(corpus, corpus_ic)``.

    Assumes module-level ``PlaintextCorpusReader`` and ``wn`` are in scope.
    """
    # Mode "w" already truncates an existing file, so the original
    # exists()/remove() dance and the "w+" read-write mode were redundant.
    with open(file_corpus_output, "w", encoding='utf8') as file_corpus:
        for token in set_of_strings:
            file_corpus.write(token + '\n')
    # NOTE(review): assumes the path has exactly one '/' separating a
    # corpus root dir and a file id (e.g. "dir/file.txt") — nested paths
    # would silently pick the wrong pieces. Confirm against callers.
    parts = file_corpus_output.split('/')
    corpus = PlaintextCorpusReader(parts[0], parts[1])
    corpus_ic = wn.ic(corpus, False, 0.0)
    return corpus, corpus_ic
from __future__ import print_function import sys from composes.utils import io_utils, scoring_utils from composes.similarity.cos import CosSimilarity from nltk.corpus import wordnet as wn from nltk.corpus import brown from nltk.corpus import wordnet_ic from itertools import combinations print("Loading brown IC") brown_ic = wn.ic(brown, False, 0.0) print("done") def getss(word): return wn.synsets(word,'n')[0] def wn_sim(ss1,ss2): print("comparing",ss1,"with",ss2) return wn.path_similarity(ss1,ss2) def jcn_sim(ss1,ss2): return ss1.jcn_similarity(ss2, brown_ic) def lin_sim(ss1,ss2): return ss1.lin_similarity(ss2, brown_ic) def res_sim(ss1,ss2): return ss1.res_similarity(ss2, brown_ic) def wup_sim(ss1,ss2): return ss1.wup_similarity(ss2, brown_ic) def lch_sim(ss1,ss2): return ss1.lch_similarity(ss2, brown_ic) def mean(seq): print(sum(seq) / len(seq)) return sum(seq) / len(seq) def is_better(ingredients, result, other):
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown

# Brown-corpus information content, built once at import time.
_brown_ic = wn.ic(brown)


def nn_suggestion(phrase):
    """Yield each token of a multi-word *phrase* that is POS-tagged 'NN'.

    Single-word phrases yield nothing.
    """
    if " " not in phrase:
        return
    names = []
    for token, tag in nltk.pos_tag(phrase.split()):
        if tag == 'NN':
            names.append(token)
    if len(names) > 1:
        print('Warning: "{}" has multiple NN to choose from'.format(phrase))
    for candidate in names:
        yield candidate


def name_suggestions(phrase):
    """Yield candidate lookup forms for *phrase*: the phrase itself,
    then (for multi-word phrases) underscore-joined and concatenated
    variants, then the NN tokens from ``nn_suggestion``."""
    yield phrase
    if ' ' in phrase:
        # Wordnet substitutes space with underscore
        yield '_'.join(phrase.split())
        yield phrase.replace(" ", "")
    for candidate in nn_suggestion(phrase):
        yield candidate
#Imports
import sys
import re
import numpy as np
import pandas as pd
import time
import csv
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Information-content dictionaries shared across the module.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
from nltk.corpus import genesis
genesis_ic = wn.ic(genesis, False, 0.0)


#Creates keywords list from formed CSV file with sysReview.py
def create_list(keywords_file):
    """Read a (word, path, definition) CSV and return ``(words, extra)``
    where *words* is the first column with empty entries removed and
    *extra* is the number of columns beyond the first two in the LAST row.

    NOTE: opens the file in binary mode and relies on ``filter`` returning
    a list — this function targets Python 2.
    """
    with open(keywords_file, 'rb') as csvfile:
        data_CSV = csv.reader(csvfile, delimiter=',')
        words, path, definition = [], [], []
        for row in data_CSV:
            words.append(row[0])
            path.append(row[1])
            definition.append(row[2])
            # Renamed from ``all`` — it shadowed the builtin. Overwritten
            # each row, so only the last row's width survives
            # (NOTE(review): that per-row overwrite looks suspicious —
            # confirm intent with sysReview.py).
            extra = len(row) - 2
    words = filter(None, words)  # Removes empty items in list
    path = filter(None, path)
    definition = filter(None, definition)
    return (words, extra)
# NOTE(review): top-level fragment — relies on names defined earlier in the
# file (corpus, corpus_dict, dictionary, dname, tcmethod, word_count, ic,
# sample_times, wn, reuters, brown, random, WordNetEvaluator), and the
# sampling loop body continues past this excerpt.
for doc in corpus:
    corpus_dict.append(dict(doc))
dictlen = len(dictionary)
tc = WordNetEvaluator()
tc_means = []
tc_medians = []
words_list = []
# NOTE(review): neither handle is closed in this excerpt — presumably
# closed (or leaked) further down.
ofilemean = open(dname + "/"+tcmethod+"_mean_rand_"+str(word_count)+".txt", "w")
ofilemedian = open(dname + "/"+tcmethod+"_median_rand_"+str(word_count)+".txt", "w")
if ic:
    # Pick the information-content source matching the LDA corpus in use.
    if dname == "reuters_LDA":
        src_ic = wn.ic(reuters, False, 0.0)
    else:
        src_ic = wn.ic(brown, False, 0.0)
for i in range(sample_times):
    random_words = []
    # generate random numbers
    for n in range(word_count):
        # Draw distinct random dictionary indices.
        # NOTE(review): the first draw uses randint(1, ...) but the retry
        # uses randint(0, ...) — the differing lower bounds look
        # unintentional; confirm whether index 0 is valid.
        word = random.randint(1, dictlen-1)
        while word in random_words:
            word = random.randint(0, dictlen-1)
        random_words.append(word)
    keylist = []
# Demo: WordNet similarity measures that require an information-content
# (IC) dictionary — Resnik, Jiang-Conrath and Lin.
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Pre-built IC files shipped with the nltk wordnet_ic data package.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# IC computed on the fly from the Genesis corpus.
from nltk.corpus import genesis
genesis_ic = wn.ic(genesis, False, 0.0)

lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
# Same synset pair scored under different IC sources.
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))
# NOTE(review): ``bad`` and ``cat`` are defined earlier in the file,
# outside this excerpt.
print(bad.name() == 'bad')
print(bad.synset().definition() == 'having undesirable or negative qualities')

print('=====================================')
print('Calculating WordNet Synset Similarity')
print('=====================================')
lion = wordnet.synset('lion.n.01')
print(lion.path_similarity(cat))
# print(lion.lch_similarity(cat))
print(lion.wup_similarity(cat))

# IC-based measures (Resnik / Jiang-Conrath / Lin) need an
# information-content dictionary.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
genesis_ic = wordnet.ic(genesis, False, 0.0)
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))

cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')
# NOTE(review): exact float equality — fragile across NLTK/WordNet versions.
print(cb.wup_similarity(ib) == 0.91666666666666663)

# Hop counts along the hypernym graph.
ref = cb.hypernyms()[0]
print(cb.shortest_path_distance(ref) == 1)
print(ib.shortest_path_distance(ref) == 1)
print(cb.shortest_path_distance(ib) == 2)
def __init__(self):
    """Set up the text preprocessor and the Genesis-corpus information
    content used by the IC-based similarity measures."""
    # Tuple assignment keeps the original left-to-right construction order.
    self.prep, self.genesis_ic = Preprocessor(), wn.ic(genesis, False, 0.0)
import sqlite3
import sys
import random
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.corpus import genesis
from scipy import stats

# information content references
BROWN_IC = wordnet_ic.ic('ic-brown.dat')
SEMCOR_IC = wordnet_ic.ic('ic-semcor.dat')
GENESIS_IC = wn.ic(genesis, False, 0.0)


class Similarity_Test:
    # Collects per-metric similarity scores over a random sample of word
    # pairs. Scoring is kicked off from the constructor via
    # _scores_over_sample() (defined past this excerpt).

    def __init__(self, sample_size, ic):
        # number of items to sample
        self.sample_size = sample_size
        # information-content dictionary for the IC-based metrics
        self.ic = ic
        # per-metric score accumulators
        self.overlap = []
        self.path = []
        self.wup = []
        # Don't use the following similarity metrics because they only supports comparing words with same POS
        # self.lch = []
        # self.res = []
        # self.jcn = []
        # self.lin = []
        self._scores_over_sample()

    def _load_sdr(self, word):
#jcn_similarity test from nltk.corpus import wordnet from nltk.corpus import wordnet_ic from nltk.corpus import genesis brown_ic = wordnet_ic.ic('ic-brown.dat') genesis_ic = wordnet.ic(genesis,False,0.0) semcor_ic = wordnet_ic.ic('ic-semcor.dat') list1 = ['flight'] list2 = ['trip'] list3 = [] for word1 in list1: for word2 in list2: wordFromList1 = wordnet.synsets(word1,pos=wordnet.NOUN) wordFromList2 = wordnet.synsets(word2,pos=wordnet.NOUN) print wordFromList1 ,"\n", wordFromList2 if wordFromList1 and wordFromList2: for item1 in wordFromList1: for item2 in wordFromList2: s=item1.jcn_similarity(item2,brown_ic) print(item1, item2, s) list3.append(s) print(max(list3)) print list3
print(hit.lch_similarity(slap)) # doctest: +ELLIPSIS print(wn.lch_similarity(hit, slap)) # doctest: +ELLIPSIS print(hit.lch_similarity(slap, simulate_root=False)) print(wn.lch_similarity(hit, slap, simulate_root=False)) print(dog.wup_similarity(cat)) # doctest: +ELLIPSIS print(hit.wup_similarity(slap)) print(wn.wup_similarity(hit, slap)) print(hit.wup_similarity(slap, simulate_root=False)) print(wn.wup_similarity(hit, slap, simulate_root=False)) # import nltk # nltk.download('wordnet_ic') brown_ic = wordnet_ic.ic('ic-brown.dat') semcor_ic = wordnet_ic.ic('ic-semcor.dat') # import nltk # nltk.download('genesis') genesis_ic = wn.ic(genesis, False, 0.0) print(dog.res_similarity(cat, brown_ic)) # doctest: +ELLIPSIS print(dog.res_similarity(cat, genesis_ic)) # doctest: +ELLIPSIS print(dog.jcn_similarity(cat, brown_ic)) # doctest: +ELLIPSIS print(dog.jcn_similarity(cat, genesis_ic)) # doctest: +ELLIPSIS print(dog.lin_similarity(cat, semcor_ic)) # doctest: +ELLIPSIS # access to all synsets for synset in list(wn.all_synsets('n'))[:10]: print(synset) print(wn.synsets('dog')) # doctest: +ELLIPSIS print(wn.synsets('dog', pos='v')) for synset in islice(wn.all_synsets('n'), 5): print(synset, synset.hypernyms()) # morphy print(wn.morphy('denied', wn.NOUN)) print(wn.morphy('denied', wn.VERB))