def similarity(word1, word2, tag):
    obj1 = wn.synset(word1 + "." + tag + ".01")
    obj2 = wn.synset(word2 + "." + tag + ".01")
    # Information content computed over the Brown corpus
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    value = obj1.res_similarity(obj2, brown_ic)
    return value
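# Minimal usage sketch for similarity() above. Assumptions: the module does
# `from nltk.corpus import wordnet as wn` and `from nltk.corpus import
# wordnet_ic`, and the WordNet data files are installed. Note the function
# only compares the first sense (".01") of each word.
print(similarity("dog", "cat", "n"))  # Resnik similarity of dog.n.01 vs cat.n.01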
def test_wordnet_similarities(self):
    # Path-based similarities.
    self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
    self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
    self.assertAlmostEqual(S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3)
    self.assertAlmostEqual(S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3)
    # Information Content similarities.
    brown_ic = wnic.ic('ic-brown.dat')
    self.assertAlmostEqual(S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3)
    semcor_ic = wnic.ic('ic-semcor.dat')
    self.assertAlmostEqual(S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3)
def _other_recognition(self, tagged_sentences, all_entities, question):
    # Noun retrieval
    nouns = []
    for sentence in tagged_sentences:
        nouns += filter(lambda x: x[1] == "NN", sentence)
    nouns = [noun for (noun, tag) in nouns]

    # Noun filtering: remove all entities that are nouns
    all_entities = set(itertools.chain(*map(str.split, all_entities)))
    nouns = [noun for noun in nouns if noun not in all_entities]

    features = QuestionClassifier.get_features(question.text, "hn")
    head = features["head"]
    if head == "":
        return nouns

    # Filter nouns with WordNet synsets
    try:
        threshold = float(MyConfig.get("answer_extraction", "other_threshold"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        threshold = 0.6
    try:
        ic = wordnet_ic.ic(MyConfig.get("answer_extraction", "ic"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        ic = wordnet_ic.ic("ic-bnc.dat")

    result = []
    head_synsets = wn.synsets(head, pos=wn.NOUN)
    if len(head_synsets) == 0:
        noun_synsets = wn.synsets(features["noun"], pos=wn.NOUN)
        if len(noun_synsets) == 0:
            return nouns
        else:
            head_synset = noun_synsets[0]
    else:
        head_synset = head_synsets[0]

    for noun in nouns:
        try:
            noun_synset = wn.synsets(noun, pos=wn.NOUN)[0]
            if threshold < noun_synset.lin_similarity(head_synset, ic) < 0.9:
                result.append(noun)
        except IndexError:
            continue
    return result
def test():
    col = nltk.TextCollection(nltk.corpus.brown)
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    sc = SimilarityCalculator(col, 'bp', brown_ic)
    sentence1 = preprocess("The jurors were taken into the courtroom in groups of 40 and asked to fill out a questionnaire.")
    sentence2 = preprocess("About 120 potential jurors were being asked to complete a lengthy questionnaire.")
    print(sc.similarity_bidirectional(sentence1, sentence2))
def __init__(self, sim_threshold=0.1, sim_weight=1, **kwds):
    global brown_ic
    super().__init__(**kwds)
    # Load the Brown IC table once and cache it at module level.
    if not brown_ic:
        brown_ic = wordnet_ic.ic('ic-brown.dat')
    self.__threshold = sim_threshold
    self.__weight = sim_weight
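# The lazy-loading pattern above assumes a module-level sentinel defined
# elsewhere in this file, e.g.:
# brown_ic = None  # populated on first instantiation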
def get_similarity(self, synsets1, synsets2):
    # NOTE: loading the IC file on every call is expensive; see the cached
    # variant sketched below.
    brown_ic = wordnet_ic.ic("ic-brown.dat")
    max_value = 0
    for synset1 in synsets1:
        for synset2 in synsets2:
            value = wn.res_similarity(synset1, synset2, brown_ic)
            if value > max_value:
                max_value = value
    return max_value
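# A sketch of the caching suggested above. `_BROWN_IC` and `get_brown_ic`
# are names introduced here for illustration; the only assumption is that
# `wordnet_ic` is already imported in this module.
_BROWN_IC = None

def get_brown_ic():
    """Load the Brown information-content table once and reuse it."""
    global _BROWN_IC
    if _BROWN_IC is None:
        _BROWN_IC = wordnet_ic.ic("ic-brown.dat")
    return _BROWN_IC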
def sensesim(ss1, ss2, metric):
    if metric == 'path':
        sim = ss1.path_similarity(ss2)
    elif metric == 'lin':
        sim = ss1.lin_similarity(ss2, wn_ic.ic('ic-brown.dat'))
    elif metric == 'jcn':
        sim = ss1.jcn_similarity(ss2, wn_ic.ic('ic-brown.dat'))
    elif metric == 'res':
        sim = ss1.res_similarity(ss2, wn_ic.ic('ic-brown.dat'))
    elif metric == 'lch':
        sim = ss1.lch_similarity(ss2)
    elif metric == 'wup':
        sim = ss1.wup_similarity(ss2)
    else:
        print("Unknown metric", metric)
        sim = 0
    return sim
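# Usage sketch for the dispatcher above (assumes `from nltk.corpus import
# wordnet as wn` alongside the existing `wn_ic` import):
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
for m in ('path', 'lch', 'wup', 'res', 'jcn', 'lin'):
    print(m, sensesim(dog, cat, m))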
def __init__(self, parameters):
    self.parameters = parameters
    self.wn_sim = self.parameters.get("wn_sim", Analyser.simmetric)
    self.ic = wn_ic.ic('ic-semcor.dat')
    self.candidates = {}
    self.synsetthresh = self.parameters.get("synset_thresh", Analyser.synsetthresh)
    self.totalthresh = self.parameters.get("total_thresh", Analyser.totalthresh)
    self.propthresh = self.parameters.get("prop_thresh", Analyser.propthresh)
    self.simthresh = self.parameters.get("sim_thresh", Analyser.simthresh)
def get_lin_distance(self, word1, word2):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    if len(wn.synsets(word1)) == 0 or len(wn.synsets(word2)) == 0:
        return 0
    target1 = wn.synsets(word1)[0]
    target2 = wn.synsets(word2)[0]
    try:
        return target1.lin_similarity(target2, brown_ic)
    except Exception:
        # lin_similarity raises when the synsets have different POS
        # or are missing from the IC table.
        return 0
def similarity_by_path(sense1, sense2, option="path"): """ Returns maximum path similarity between two senses. """ if option.lower() in ["path", "path_similarity"]: # Path similaritys return max(wn.path_similarity(sense1,sense2), wn.path_similarity(sense1,sense2)) elif option.lower() in ["wup", "wupa", "wu-palmer", "wu-palmer"]: # Wu-Palmer return wn.wup_similarity(sense1, sense2) elif option.lower() in ['lch', "leacock-chordorow"]: # Leacock-Chodorow if sense1.pos != sense2.pos: # lch can't do diff POS return 0 return wn.lch_similarity(sense1, sense2) return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
def single_jiang_conrath(cast_no1, cast_no2, syn_dict):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    synsets1 = syn_dict[cast_no1]
    synsets2 = syn_dict[cast_no2]
    total_sim = 0.0
    no_of_comparisons = 0.0
    for syn1 in synsets1:
        if syn1 is None:
            continue
        for syn2 in synsets2:
            # Jiang-Conrath is only defined for synsets with matching POS,
            # and the IC file only covers nouns and verbs.
            if syn2 is not None and syn1.pos() == syn2.pos() and syn1.pos() in ("n", "v"):
                sim = syn1.jcn_similarity(syn2, brown_ic)
                total_sim += sim
                no_of_comparisons += 1
    if no_of_comparisons == 0.0:
        return 0.0
    return total_sim / no_of_comparisons
def main(): brown_ic = wordnet_ic.ic('ic-brown.dat') human_sims = parseFile("input.txt") lin_sims = linSimilarities(human_sims.keys(), brown_ic) res_sims = resSimilarities(human_sims.keys(), brown_ic) #print "Initializing Model" model = None model = gensim.models.Word2Vec() model = model.load_word2vec_format(RESOURCES+'glove_model.txt', binary=False) #print "Model created calling vec Sim" vec_sims = vecSimilarities(human_sims.keys(), model) #print "AFter call to vec Sim" lin_score = 0 res_score = 0 vec_score = 0 print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format('word1','word2', 'human', 'Lin', 'Resnik', 'Word2Vec') for key, human in human_sims.items(): try: lin = lin_sims[key] except: lin = 0 lin_score += (lin - human) ** 2 try: res = res_sims[key] except: res = 0 res_score += (res - human) ** 2 try: vec = vec_sims[key] except: vec = 0 vec_score += (vec - human) ** 2 firstword=key.partition('(')[-1].rpartition(',')[0] secondword=key.partition(',')[-1].rpartition(')')[0] secondword=secondword.strip() print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format(firstword,secondword, human, lin, res, vec) num_examples = len(human_sims) print "\nMean Squared Errors" print "Lin method error: %0.2f" % (lin_score/num_examples) print "Resnick method error: %0.2f" % (res_score/num_examples) print "Vector-based method error: %0.2f" % (vec_score/num_examples)
def lexical_compare(lemma_text, lemma_hypothesis):
    similarity_score = 0
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    if re.search(lemma_text, lemma_hypothesis, re.M | re.I):
        return 50
    hypo_synset = wn.synsets(lemma_hypothesis)
    text_synset = wn.synsets(lemma_text)
    synset_index = get_index(hypo_synset, text_synset)
    if synset_index == -1:
        return 0
    if len(hypo_synset) > 0 and len(text_synset) > 0:
        # path and wup similarities do not take an IC argument; only the
        # information-content measures (lin, res) do.
        similarity_score = hypo_synset[synset_index].path_similarity(text_synset[0])
        similarity_score += hypo_synset[synset_index].wup_similarity(text_synset[0])
        similarity_score += hypo_synset[synset_index].lin_similarity(text_synset[0], brown_ic)
        similarity_score += hypo_synset[synset_index].res_similarity(text_synset[0], brown_ic)
    return similarity_score
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets) ** 2
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {}
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out, syn_in),
                                                       wn.path_similarity(syn_in, syn_out))
            if i % 10000 == 0:
                print(i, 'synsets processed out of', t, '(', float(i) / t, '%)')
            i += 1

    tuples = [(item[0], list(item[1].values())) for item in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]

    # Rule of thumb: n = sqrt(len(global_synsets) / 2)
    n = int((len(global_synsets) / 2) ** 0.5)
    print("Number of clusters", n)
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
    pprint.pprint(dict(clustering), width=1)

    feature_vector = np.zeros([len(corpus), n])
    for i, comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print(i, 'comments processed')
    print(feature_vector)
def wnsim(word1, word2, ps=wn.NOUN, metric='path', ic=wn_ic.ic('ic-brown.dat')):
    # Calculate the WordNet similarity of two words, maximised over all
    # senses of the given part of speech (noun by default).
    # Note: the default ic is loaded once, when the function is defined.
    ss1 = wn.synsets(word1, pos=ps)
    ss2 = wn.synsets(word2, pos=ps)
    maxsim = 0
    for s1 in ss1:
        for s2 in ss2:
            thissim = sssim(s1, s2, metric, ic)
            if thissim > maxsim:
                maxsim = thissim
    return maxsim
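# Usage sketch (assumes `sssim` is the sense-level similarity helper this
# module defines elsewhere):
print(wnsim('car', 'truck', metric='wup'))
print(wnsim('car', 'truck', metric='lin'))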
def computeLinSimilarity(term1, term2):
    global ic
    if not ic:
        # ic = wordnet_ic.ic('ic-semcor.dat')
        ic = wordnet_ic.ic('ic-brown.dat')
    w1_syns = wn.synsets(term1)
    w2_syns = wn.synsets(term2)
    maxsim = 0
    for w1s in w1_syns:
        for w2s in w2_syns:
            try:
                sim = wn.lin_similarity(w1s, w2s, ic)
                if sim > maxsim:
                    maxsim = sim
            except Exception:
                pass
    return maxsim
def lin_truth():
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    content = [word.strip() for word in open(Input2)]
    truth_arr = []
    for i in content:
        similarity = []
        synA = wordnet.synset(i + ".n.01")
        for j in content:
            synB = wordnet.synset(j + ".n.01")
            sim = synA.lin_similarity(synB, semcor_ic)
            similarity.append(sim)
        truth_arr.append(similarity)
    D = ss.csr_matrix(np.array(truth_arr, dtype=np.float64))
    return D
def main(): brown_ic = wordnet_ic.ic('ic-brown.dat') human_sims = parseFile("input.txt") lin_sims = linSimilarities(human_sims.keys(), brown_ic) res_sims = resSimilarities(human_sims.keys(), brown_ic) model = None model = gensim.models.Word2Vec() model = model.load_word2vec_format(RESOURCES+'glove_model.txt', binary=False) vec_sims = vecSimilarities(human_sims.keys(), model) lin_score = 0 res_score = 0 vec_score = 0 print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format('word1','word2', 'human', 'Lin', 'Resnik', 'Word2Vec') for key, human in human_sims.items(): try: lin = lin_sims[key] except: lin = 0 lin_score += (lin - human) ** 2 try: res = res_sims[key] except: res = 0 res_score += (res - human) ** 2 try: vec = vec_sims[key] except: vec = 0 vec_score += (vec - human) ** 2 print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format(key[0], key[1], human, lin, res, vec) num_examples = len(human_sims) print "\nMean Squared Errors" print "Lin method error: %0.2f" % (lin_score/num_examples) print "Resnick method error: %0.2f" % (res_score/num_examples) print "Vector-based method error: %0.2f" % (vec_score/num_examples)
def main(fname):
    lyrics = preprocess_lyrics(fname)
    collection = nltk.TextCollection(nltk.corpus.brown)
    ic = wordnet_ic.ic('ic-brown.dat')
    thresh_counts = {}
    for similarity in list(SimilarityCalculator.SIMILARITIES.keys()) + ['bp_adj']:
        scores = []
        output_fname = os.path.join('output', similarity + '.txt')
        pickled_fname = output_fname + '.pickled'
        img_fname = os.path.join('output', similarity + '_hist.png')
        if os.path.exists(output_fname):
            continue
        now = datetime.datetime.now()
        print('[{}] Starting calculation on {}'.format(str(now), similarity))
        if similarity == 'bp':
            adjust_bp()
        if os.path.exists(pickled_fname):
            scores = [score for couplet, score in pickle.load(open(pickled_fname, 'rb'))]
        else:
            sc = SimilarityCalculator(collection, similarity, ic)
            for lyric1, lyric2 in pairwise(lyrics):
                scores.append(sc.similarity_bidirectional(lyric1, lyric2))
        thresh_counts[similarity] = print_report(open(output_fname, 'w'), scores,
                                                 open(fname, 'r').read().split('\n'),
                                                 open(pickled_fname, 'wb'), img_fname)
        now = datetime.datetime.now()
        print('[{}] Finished calculation on {}'.format(str(now), similarity))

    plt.clf()
    for similarity in thresh_counts.keys():
        res = sorted(thresh_counts[similarity].items())
        res = list(zip(*res))
        plt.plot(res[0], res[1], label=similarity, zorder=1)
        plt.scatter(res[0], res[1], zorder=2)
    plt.legend()
    plt.xlabel("threshold")
    plt.ylabel("no. lyrics selected")
    plt.savefig(os.path.join("output", "thresholds.png"))
def check_robustpca(trainCollection, testCollection, feature):
    ready = True
    # check matlab
    if not check_matlab():
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'Matlab is not available or incorrectly configured.')
        ready = False
    # check if knn is available
    if not check_knn(trainCollection, testCollection, feature):
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'KNN is not available.')
        ready = False
    # check data files
    datafiles = [os.path.join(ROOT_PATH, trainCollection, 'TextData', 'id.userid.lemmtags.txt'),
                 os.path.join(ROOT_PATH, trainCollection, 'FeatureData', feature)]
    res = find_missing_files(datafiles)
    if res:
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'the following files or folders are missing:\n%s' % res)
        return False
    # check external dependencies
    try:
        import h5py
        import numpy
        import scipy.io
        import scipy.sparse
        from nltk.corpus import wordnet as wn
        from nltk.corpus import wordnet_ic
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        wn.morphy('cat')
        wn.synsets('cat', pos=wn.NOUN)
    except Exception:
        # NLTK data may be missing; try to fetch it once before giving up.
        try:
            import nltk
            nltk.download('brown')
            nltk.download('wordnet')
            nltk.download('wordnet_ic')
        except Exception as e:
            print(e)
            ready = False
    return ready
def jiang_conrath(syn_dict, cast_no):
    this_cast_syns = syn_dict[cast_no]
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    jc_sims = OrderedDict()
    i = 0
    while i < len(syn_dict):
        total_sim = 0.0
        key = 'cast' + str(i)
        no_of_comparisons = 0.0
        for original_syn in this_cast_syns:
            if original_syn is None:
                continue
            for comparison_syn in syn_dict[key]:
                # Jiang-Conrath needs matching POS, and the IC file only
                # covers nouns and verbs.
                if (comparison_syn is not None
                        and original_syn.pos() == comparison_syn.pos()
                        and original_syn.pos() in ("n", "v")):
                    sim = original_syn.jcn_similarity(comparison_syn, brown_ic)
                    total_sim += sim
                    no_of_comparisons += 1
        i += 1
        if no_of_comparisons != 0.0:
            jc_sims[key] = total_sim / no_of_comparisons
        else:
            jc_sims[key] = total_sim
    return jc_sims
def jcn(self):
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    content_a = [word.strip() for word in open(self.wordset_a)]
    content_b = [word.strip() for word in open(self.wordset_b)]
    truth_mat = np.zeros(shape=(len(content_a), len(content_b)))
    for x, i in enumerate(content_a):
        synA = wordnet.synset(i + ".n.01")
        for y, j in enumerate(content_b):
            synB = wordnet.synset(j + ".n.01")
            truth_mat[x, y] = synA.jcn_similarity(synB, semcor_ic)
    return truth_mat
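# Caveat for matrices like the one above: jcn_similarity returns a huge
# sentinel value (on the order of 1e+300) for identical or near-identical
# synsets, so downstream code often clamps it. A minimal sketch, assuming
# a cap of 1.0 is acceptable for the application:
def clamped_jcn(syn_a, syn_b, ic, cap=1.0):
    return min(syn_a.jcn_similarity(syn_b, ic), cap)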
def writeTranslatedWN(mapping, output, balanced=True):
    from nltk.corpus import wordnet_ic
    from nltk.corpus import wordnet as wn
    ic = wordnet_ic.ic("ic-bnc-resnik-add1.dat")
    removed = set([])
    # We gave them hyponym counts, so we don't need to propagate counts
    o = OntologyWriter(output, propagate_counts=False)
    for ii in orderedTraversal(wn, pos='n', reverse_depth=True):
        children = [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()
                    if x.offset not in removed]
        words = mapSynsetWords(mapping, ii, balanced)
        if len(children) == 0 and len(words) == 0:
            removed.add(ii.offset)
    print("%i synsets removed" % len(removed))
    for ii in orderedTraversal(wn, pos='n'):
        children = [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()
                    if x.offset not in removed]
        hyponym_count = sum(ic['n'][x] for x in children)
        information_contribution = ic['n'][ii.offset] - hyponym_count
        words = mapSynsetWords(mapping, ii, balanced)
        assert information_contribution > 0.0 or \
            (information_contribution == 0.0 and len(ii.lemmas) == 0), \
            "Synset %i had no information" % ii.offset
        if len(words) > 0:
            per_word_contribution = information_contribution / float(len(words))
            words = [x + (per_word_contribution,) for x in words]
        # Add synsets if they're not vestigial leaves
        if len(children) > 0 or len(words) > 0:
            o.AddSynset(ii.offset, ii.name, children, words, hyponym_count)
    o.Finalize()
def wordnet_similarity(words, sim_measure, wnlabels):
    sims = {}
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    for word in words:
        w = word[:word.find(":")]
        senses = wn.synsets(w, wn.NOUN)
        if len(senses) < 1:
            continue  # TODO
        right_sense = wnlabels._correct_sense(senses, w)
        targets = []
        for othword in words:
            if word == othword:
                continue
            othw = othword[:othword.find(":")]
            othw_senses = wn.synsets(othw, wn.NOUN)
            if len(othw_senses) < 1:
                continue
            othw_right_sense = wnlabels._correct_sense(othw_senses, othw)
            if sim_measure == "jcn":
                sim = senses[right_sense].jcn_similarity(othw_senses[othw_right_sense], brown_ic)
            elif sim_measure == "wup":
                sim = senses[right_sense].wup_similarity(othw_senses[othw_right_sense])
            elif sim_measure == "path":
                sim = senses[right_sense].path_similarity(othw_senses[othw_right_sense])
            targets.append([othword, sim])
        targets = sorted(targets, reverse=True, key=itemgetter(1))
        sims[word] = targets
    return sims
def __init__(self):
    self.brown_ic = wordnet_ic.ic('ic-brown.dat')
def __init__(self, utterance_sep, path_output_lu_parses, path_output_parses, parser_path,
             cfg_rules_path, pos_tagger_path=None, path_to_freq_norms=None,
             path_to_image_norms=None, path_to_dictionary=None, lu_analyzer_path=None,
             path_to_anew=None, path_to_warringer=None, do_wnic=False,
             path_to_rst_python=None, path_to_rst=None, path_output_rst=None,
             path_to_stanford_cp=None, path_to_mpqa_lexicon=None, path_to_lda_model=None,
             path_to_lda_wordids=None, do_lexical=True, do_syntactic=True,
             do_semantic=True, do_pragmatic=False, lexical_list=None, syntactic_list=None,
             semantic_list=None, pragmatic_list=None):
    '''Parameters:
    source_transcript : list of strings. Full paths to directories containing transcripts (with no filler annotations)
    source_transcript_fillers : list of strings. Full paths to directories containing transcripts with filler annotations
    utterance_sep : string. The string that delimits utterance boundaries in the transcript
    path_output_lu_parses : string. The absolute path to a directory that will store the Lu features and parses.
    path_output_parses : string. The absolute path to a directory that will store the parse trees produced for the data.
    parser_path : string. The absolute path to a directory containing a Stanford lexparser
    cfg_rules_path : string. The absolute path to a file containing cfg productions to be extracted (one per line)
    path_output_lda_topics : string. The absolute path to the csv file where key-value topics will be stored.
    pos_tagger_path : optional, string. Full path to a directory containing a Stanford POS tagger
    path_to_freq_norms : optional, string. Full path to a file containing frequency norms
    path_to_image_norms : optional, string. Full path to a file containing imageability norms
    path_to_dictionary : optional, string. Full path to a file containing valid words for the language
    lu_analyzer_path : optional
    path_to_rst_python : optional, string. Full path to virtualenv python, for RST
    path_to_rst : optional, string. Full path to folder with RST's 'parse.py'
    path_output_rst : optional, string. Full path to where RST stores its results
    path_to_lda_model : string. Full path to trained LDA model.
    path_to_lda_wordids : string. Full path to word IDs used in trained LDA model.
    '''
    self.utterance_sep = utterance_sep
    self.output_rst_dir = os.path.abspath(path_output_rst)
    self.output_parse_dir = os.path.abspath(path_output_parses)
    self.output_lu_parse_dir = os.path.abspath(path_output_lu_parses)
    self.pos_tagger_path = pos_tagger_path
    self.parser_path = parser_path
    self.cfg_rules_path = cfg_rules_path
    self.path_to_mpqa_lexicon = path_to_mpqa_lexicon
    self.path_to_rst_python = path_to_rst_python
    self.path_to_rst = path_to_rst
    self.path_to_stanford_cp = path_to_stanford_cp
    self.path_to_lda_model = path_to_lda_model
    self.path_to_lda_wordids = path_to_lda_wordids
    self.do_lexical = do_lexical
    self.do_syntactic = do_syntactic
    self.do_semantic = do_semantic
    self.do_pragmatic = do_pragmatic
    self.lexical_list = lexical_list
    self.syntactic_list = syntactic_list
    self.semantic_list = semantic_list
    self.pragmatic_list = pragmatic_list

    file_utils.ensure_dir(self.output_parse_dir)
    file_utils.ensure_dir(self.output_lu_parse_dir)
    file_utils.ensure_dir(self.output_rst_dir)
    # self.transcript_set = transcript.TranscriptSet(dataset=[])

    # Get lexical norms
    if path_to_freq_norms is not None:
        self.norms_freq = functions.get_frequency_norms(path_to_freq_norms)
    else:  # default
        self.norms_freq = functions.get_frequency_norms()
    if path_to_image_norms is not None:
        self.norms_image = functions.get_imageability_norms(path_to_image_norms)
    else:  # default
        self.norms_image = functions.get_imageability_norms()
    if path_to_anew is not None:
        self.norms_anew = functions.get_anew_norms(path_to_anew)
    else:  # default
        self.norms_anew = None

    # Warringer
    if path_to_warringer is not None:
        self.norms_warringer = functions.get_warringer_norms(path_to_warringer)
    else:  # default
        self.norms_warringer = functions.get_warringer_norms()

    # MPQA
    if path_to_mpqa_lexicon is not None:
        [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon(path_to_mpqa_lexicon)
    else:  # default
        [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon()

    # Set up the dictionary of valid words for the language
    if path_to_dictionary is not None:
        source_dict = path_to_dictionary
    else:  # default
        source_dict = os.path.abspath("../feature_extraction/text/american-english")
    with open(source_dict, 'r') as fin_dict:
        words = fin_dict.readlines()
    self.dictionary_words = set(word.strip().lower() for word in words)
    self.prondict = cmudict.dict()

    if lu_analyzer_path is not None:
        self.lu_analyzer_path = lu_analyzer_path
    else:
        self.lu_analyzer_path = os.path.abspath('../L2SCA-2011-10-10/')

    # semantics
    if do_wnic:
        # FR: it would be nice to have a dat based on normative data
        self.brown_ic = wnic.ic('ic-brown.dat')
        self.semcor_ic = wnic.ic('ic-semcor.dat')
    else:
        self.brown_ic = []
        self.semcor_ic = []
from nltk.corpus import wordnet_ic

def wm_subjects(subjects):
    wm_subjects = []
    for subject in subjects:
        wm_subjects.append(wn.synsets(subject, pos=wn.NOUN)[0])
    return wm_subjects

def match_subjects(wm_subjects):
    match_subjects = []
    aux = []
    for subi in wm_subjects:
        for subj in wm_subjects:
            aux.append(subi.res_similarity(subj, brown_ic) / subi.res_similarity(subi, brown_ic))
        match_subjects.append(aux)
        aux = []
    return match_subjects

subjects = []
with open("subjects.txt", "rt") as fin:
    for line in fin:
        subjects.append(line.replace('\n', ''))

wm_subjects = wm_subjects(subjects)
brown_ic = wordnet_ic.ic('ic-brown.dat')  # load the Brown corpus to compute the IC
match_subjects = match_subjects(wm_subjects)
print(match_subjects)
def sim_wordnet(wordpairs, filename):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    WORDNET_DIR = os.path.join(RESULT_DIR, "wordnet")

    pathfile = open(os.path.join(WORDNET_DIR, "path_" + filename), 'w', newline='')
    pathwriter = csv.writer(pathfile)
    wupfile = open(os.path.join(WORDNET_DIR, "wup_" + filename), 'w', newline='')
    wupwriter = csv.writer(wupfile)
    lchfile = open(os.path.join(WORDNET_DIR, "lch_" + filename), 'w', newline='')
    lchwriter = csv.writer(lchfile)
    resfile = open(os.path.join(WORDNET_DIR, "res_" + filename), 'w', newline='')
    reswriter = csv.writer(resfile)
    jcnfile = open(os.path.join(WORDNET_DIR, "jcn_" + filename), 'w', newline='')
    jcnwriter = csv.writer(jcnfile)
    linfile = open(os.path.join(WORDNET_DIR, "lin_" + filename), 'w', newline='')
    linwriter = csv.writer(linfile)
    resultfiles = [pathfile, wupfile, lchfile, resfile, jcnfile, linfile]
    resultwriters = [pathwriter, wupwriter, lchwriter, reswriter, jcnwriter, linwriter]

    for wordpair in wordpairs:
        synsets1 = wordnet.synsets(wordpair[0])
        synsets2 = wordnet.synsets(wordpair[1])
        path_sim = -100
        wup_sim = -100
        lch_sim = -100
        res_sim = -100
        jcn_sim = -100
        lin_sim = -100
        for tmpword1 in synsets1:
            for tmpword2 in synsets2:
                if tmpword1.pos() == tmpword2.pos():
                    try:
                        path_sim = max(path_sim, tmpword1.path_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("path: " + str(e))
                    try:
                        wup_sim = max(wup_sim, tmpword1.wup_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("wup: " + str(e))
                    try:
                        lch_sim = max(lch_sim, tmpword1.lch_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("lch: " + str(e))
                    try:
                        res_sim = max(res_sim, tmpword1.res_similarity(tmpword2, brown_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("res: " + str(e))
                    try:
                        jcn_sim = max(jcn_sim, tmpword1.jcn_similarity(tmpword2, brown_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("jcn: " + str(e))
                    try:
                        lin_sim = max(lin_sim, tmpword1.lin_similarity(tmpword2, semcor_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("lin: " + str(e))

        path_result = (wordpair[0], wordpair[1], path_sim)
        wup_result = (wordpair[0], wordpair[1], wup_sim)
        lch_result = (wordpair[0], wordpair[1], lch_sim)
        res_result = (wordpair[0], wordpair[1], res_sim)
        jcn_result = (wordpair[0], wordpair[1], jcn_sim)
        lin_result = (wordpair[0], wordpair[1], lin_sim)
        results = [path_result, wup_result, lch_result, res_result, jcn_result, lin_result]
        for i in range(len(resultwriters)):
            resultwriters[i].writerow(results[i])

    for resultfile in resultfiles:
        resultfile.close()
def __init__(self):
    parser = argparse.ArgumentParser(
        description="Run the Codenames AI competition game.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("codemaster", help="Path to codemaster package or 'human'")
    parser.add_argument("guesser", help="Path to guesser package or 'human'")
    parser.add_argument("--seed",
                        help="Random seed value for board state -- integer or 'time'",
                        default='time')
    parser.add_argument("--w2v", help="Path to w2v file or 'none'", default='none')
    parser.add_argument("--glove_cm", help="Path to glove file or 'none'", default='none')
    parser.add_argument("--glove_guesser", help="Path to glove file or 'none'", default='none')
    parser.add_argument("--wordnet",
                        help="Name of wordnet file or 'none', most likely ic-brown.dat",
                        default='none')
    args = parser.parse_args()

    # if the game is going to have an ai, load up word vectors
    if sys.argv[1] != "human" or sys.argv[2] != "human":
        brown_ic = None
        if args.wordnet != 'none':
            brown_ic = wordnet_ic.ic(args.wordnet)
        glove_vecs_cm = {}
        if args.glove_cm != 'none':
            with open(args.glove_cm, encoding="utf-8") as infile:
                for line in infile:
                    line = line.rstrip().split(' ')
                    glove_vecs_cm[line[0]] = np.array([float(n) for n in line[1:]])
            print('loaded glove cm vectors')
        glove_vecs_guesser = {}
        if args.glove_guesser != 'none':
            with open(args.glove_guesser, encoding="utf-8") as infile:
                for line in infile:
                    line = line.rstrip().split(' ')
                    glove_vecs_guesser[line[0]] = np.array([float(n) for n in line[1:]])
            print('loaded glove guesser vectors')
        word_vectors = {}
        if args.w2v != 'none':
            word_vectors = word2vec.KeyedVectors.load_word2vec_format(
                args.w2v, binary=True, unicode_errors='ignore')
            print('loaded word vectors')

    if args.codemaster == "human":
        self.codemaster = human_codemaster()
        print('human codemaster')
    else:
        codemaster_module = importlib.import_module(args.codemaster)
        self.codemaster = codemaster_module.ai_codemaster(brown_ic, glove_vecs_cm, word_vectors)
        print('loaded codemaster')

    if args.guesser == "human":
        self.guesser = human_guesser()
        print('human guesser')
    else:
        guesser_module = importlib.import_module(args.guesser)
        self.guesser = guesser_module.ai_guesser(brown_ic, glove_vecs_guesser, word_vectors)
        print('loaded guesser')

    self.seed = 'time'
    if args.seed != 'time':
        self.seed = args.seed
        random.seed(int(args.seed))

    f = open("game_wordpool.txt", "r")
    if f.mode == 'r':
        temp_array = f.read().splitlines()
        self.words = set([])
        # if duplicates were detected and the set length is not 25 then restart
        while len(self.words) != 25:
            self.words = set([])
            for x in range(0, 25):
                random.shuffle(temp_array)
                self.words.add(temp_array.pop())
        self.words = list(sorted(self.words))
        random.shuffle(self.words)
    self.maps = ["Red"] * 8 + ["Blue"] * 7 + ["Civilian"] * 9 + ["Assassin"]
    random.shuffle(self.maps)
def calculate_sim_matrix_from_list(word_list, methods_list, word_pos='n',
                                   full_synsets=False, all_matrix=False):
    print('calculate_sim_matrix_from_list started')
    content_dict = {}
    noun_to_noun_sim_matrices = {}
    unknown_words = {}
    word_list_size = len(word_list)
    for method in methods_list:
        noun_to_noun_sim_matrices[method] = np.add(
            np.zeros((word_list_size, word_list_size), dtype=float), 0.001)
    noun_to_noun_sim_matrices['average_of_methods'] = np.add(
        np.zeros((word_list_size, word_list_size), dtype=float), 0.001)
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    i = 0
    if all_matrix:
        bigger_loop_limit = word_list_size
    else:
        bigger_loop_limit = word_list_size - 1
    while i < bigger_loop_limit:
        if all_matrix:
            j = 0
        else:
            j = i + 1
        if full_synsets:
            w1 = wordnet.synset(word_list[i])
        else:
            w1 = wordnet.synsets(word_list[i], word_pos)
        if not w1:
            print('Not able to find this noun: ' + word_list[i])
            unknown_words[word_list[i]] = False
            i += 1
            continue
        if not full_synsets:
            w1 = w1[0]
        while j < word_list_size:
            if full_synsets:
                w2 = wordnet.synset(word_list[j])
            else:
                w2 = wordnet.synsets(word_list[j], word_pos)
            if not w2:
                j += 1
                continue
            if not full_synsets:
                w2 = w2[0]
            if 'wup' in noun_to_noun_sim_matrices:
                value = w1.wup_similarity(w2)
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['wup'][i][j] = value
            if 'jcn' in noun_to_noun_sim_matrices:
                value = w1.jcn_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.001, 1.0, True)
                noun_to_noun_sim_matrices['jcn'][i][j] = value
            if 'lin' in noun_to_noun_sim_matrices:
                value = w1.lin_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['lin'][i][j] = value
            if 'lch' in noun_to_noun_sim_matrices:
                value = w1.lch_similarity(w2)
                # normalise by the maximum lch value for the POS
                if word_pos == 'n':
                    value = value / 3.6375861597263857
                else:
                    value = value / 3.258096538021482
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['lch'][i][j] = value
            value = 0.0
            for method in methods_list:
                value += noun_to_noun_sim_matrices[method][i][j]
            value = value / len(methods_list)
            value = utils.limit_value(value, 0.001, 1.0)
            noun_to_noun_sim_matrices['average_of_methods'][i][j] = value
            j += 1
        i += 1
        print('calculate_sim_matrix_from_list: ' + str(i) + '/' + str(word_list_size - 1))
    print('calculate_sim_matrix_from_list ended')
    content_dict['noun_to_noun_sim_matrices'] = noun_to_noun_sim_matrices
    content_dict['unknown_words'] = unknown_words
    return content_dict
            return None
    return similarity


if __name__ == '__main__':
    method = sys.argv[1]   # jcn or lch
    corpus = sys.argv[2]   # semcor or brown
    preserve = False       # Preserve SimLex similarities?
    if len(sys.argv) > 3:
        preserve = True
    maxval = 1000.0  # This value will be assigned to extremely high-similarity pairs (like 1e+300)
    ic = wordnet_ic.ic('ic-%s.dat' % corpus)
    for line in sys.stdin:
        if line.strip().startswith('#'):
            continue
        res = line.strip().split('\t')
        (word0, word1, simlex_sim) = res
        simlex_sim = float(simlex_sim)
        synsets0 = reversed(wn.synsets(word0.strip(), 'n'))
        synsets1 = reversed(wn.synsets(word1.strip(), 'n'))
        best_pair = None
        best_sim = 0.0
        for pair in product(synsets0, synsets1):
            if pair[0] == pair[1]:
                continue
            wordnet_sim = calc_similarity(pair, method, ic)
import matplotlib.pyplot as plt
import scipy
from nltk.corpus import wordnet as wn, wordnet_ic as wn_ic, lin_thesaurus as lin

brown_ic = wn_ic.ic("ic-brown.dat")


def noun_path_similarity(noun_1, noun_2):
    """
    Returns path similarity of two nouns
    :param noun_1:
    :param noun_2:
    :return: path similarity
    """
    synsets_1 = wn.synsets(noun_1, wn.NOUN)
    synsets_2 = wn.synsets(noun_2, wn.NOUN)
    return round(
        max([synset_1.path_similarity(synset_2)
             for synset_1 in synsets_1
             for synset_2 in synsets_2]), 4)


def noun_similarity(noun_1, noun_2, sim_measure=None):
    """
    Returns similarity between two nouns using the similarity measure defined.
    :param noun_1:
    :param noun_2:
    :param sim_measure: 'path_similarity', 'res_similarity', 'lin_similarity'
    :return: similarity measure
    """
def main(): " Function to write calling of all the above functions " str_dir_path = os.getcwd() # Question 1.1 1 directory = "/home1/c/cis530/hw4/dev_input" process_ts_file(directory) print "1.1 1 Computed" # Question 1.1 2 topic_file = str_dir_path + "/ts_files/" + "aakritis_dev_00.ts" n = 20 tup_list = load_topic_words(topic_file, n) print "1.1 2 Computed" # Question 1.2 # to get list of all .ts file ts_directory = str_dir_path + "/ts_files" list_all_files = get_all_files(ts_directory) n = 20 text_directory = str_dir_path + "/expanded_topic_words_files" if not os.path.exists(text_directory): os.makedirs(text_directory) # computing information content ( ic ) brown_ic = wordnet_ic.ic('ic-brown.dat') for each_file in list_all_files: # run load_topic_words to get key list and candidate list tup_list = load_topic_words(each_file, n) # Question 1.1 2 keylist = tup_list[0] candidatelist = tup_list[1] # to extract file name list_name = each_file.split("/") len_list = len(list_name) name_ext = list_name[len_list - 1] name_split = name_ext.split(".", 1) name = name_split[0] + ".txt" outputfile = str_dir_path + "/expanded_topic_words_files/" + name expand_keywords(keylist, candidatelist, brown_ic, outputfile) print name + " Written" print "1.2 Computed" # Question 2.1 directory = "/home1/c/cis530/hw4/dev_input" list_sub_dirs = get_list_subdirs(directory) sum_directory = str_dir_path + "/summarize_baseline" if not os.path.exists(sum_directory): os.makedirs(sum_directory) for each_subdir in list_sub_dirs: # extract dir name from dir path list_name = each_subdir.split("/") len_list = len(list_name) name = "sum_" + list_name[len_list - 1] + ".txt" outputfile = str_dir_path + "/summarize_baseline/" + name summarize_baseline(each_subdir, outputfile) print name + " Written" print "2.1 Computed" # Question 2.2 directory = "/home1/c/cis530/hw4/dev_input" list_sub_dirs = get_list_subdirs(directory) sum_directory = str_dir_path + "/summarize_kl" if not os.path.exists(sum_directory): os.makedirs(sum_directory) for each_subdir in list_sub_dirs: # extract dir name from dir path list_name = each_subdir.split("/") len_list = len(list_name) name = "sum_" + list_name[len_list - 1] + ".txt" outputfile = str_dir_path + "/summarize_kl/" + name summarize_kl(each_subdir, outputfile) print name + " Written" print "2.2 Computed" # Question 2.3 result_file = str_dir_path + "/results.txt" write_rouge_results(result_file) print "2.3 Computed" return
def __init__(self, ic_corpus='brown'):
    self._ic_corpus = wordnet_ic.ic('ic-brown.dat') if ic_corpus == 'brown' else wordnet_ic.ic('ic-semcor.dat')
    self._wn_max_depth = 19
    self._default_metrics = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath', 'zhou']
    self._wn_lemma = WordNetLemmatizer()
import sys
import json
import jsonrpclib
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
import pickle

brown_ic = wordnet_ic.ic('ic-brown.dat')

import unitConversion as uc


class aset:
    def __init__(self, num=None, entity=None, surface=None, idx=None):
        self.num = num
        self.entity = entity
        self.surface = surface
        self.idx = idx
        self.widx = (idx % 1000) + 1 if idx is not None else None
        self.container = None
        self.verbs = None
        self.adjs = None
        self.location = None
        self.contains = None
        self.compound = 0
        self.subtypes = []
        self.type_failure = 0

    def details(self, sf=True):
        string = "_____________\n"
        ordrd = sorted(self.__dict__.items())
        for x, y in ordrd:
def compress_isa_graph(self, verbose=True):
    """
    Compress the graph extracted from WordNet by removing some of the nodes.
    The compression strategy follows the paper 'Nearly-Automated Metadata
    Hierarchy Creation'.
    :param verbose: whether to show compression steps for debugging
    :return:
    """
    print("\n\nCompressing WordNet object hierarchy...")
    graph1 = copy.deepcopy(self.graph)

    # Rule 1 - Remove all nodes with low information content
    brown = wnic.ic('ic-brown.dat')
    for node in list(self.graph.nodes()):
        if self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
            if rwn.information_content(wn.synset(node), brown) < 3.0:
                self.graph.remove_node(node)
    if verbose:
        diff = set(graph1.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 1: {}".format(list(diff)))

    # Rule 2 - Remove all nodes with only a single child, except the root
    if verbose:
        graph2 = copy.deepcopy(self.graph)
    # starting from leaf nodes
    nodes_sort = [node for node in self.graph if len(list(self.graph.predecessors(node))) == 0]
    while len(nodes_sort) > 0:
        node = nodes_sort.pop(0)
        if node not in self.graph:
            continue
        parents = list(self.graph.successors(node))
        children = list(self.graph.predecessors(node))
        for parent in parents:
            nodes_sort.append(parent)
        if len(children) == 1 and len(parents) != 0 \
                and self.graph.nodes[node]["type"] != "object_id" \
                and self.graph.nodes[node]["type"] != "wordnet_synset":
            self.graph.remove_node(node)
            for parent in parents:
                for child in children:
                    self.graph.add_edge(child, parent, relation='IsA')
    if verbose:
        diff = set(graph2.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 2: {}".format(list(diff)))

    # Rule 3 - Remove all nodes whose name contains the name of the parent
    # (except seed)
    if verbose:
        graph3 = copy.deepcopy(self.graph)
    for node in list(self.graph.nodes()):
        if len(list(self.graph.predecessors(node))) == 0:
            continue
        if self.graph.nodes[node]["type"] == "object_id" or self.graph.nodes[node]["type"] == "wordnet_synset":
            continue
        parents = list(self.graph.successors(node))
        children = list(self.graph.predecessors(node))
        should_remove = len(parents) > 0
        for parent in parents:
            pname = parent.split('.')[0]
            cname = node.split('.')[0]
            if pname not in cname:
                should_remove = False
                break
        if should_remove:
            self.graph.remove_node(node)
            for child in children:
                for parent in parents:
                    self.graph.add_edge(child, parent, relation='IsA')
    if verbose:
        diff = set(graph3.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 3: {}".format(list(diff)))

    # sanity check: make sure no initial object nodes are removed
    for n in list(graph1.nodes()):
        if graph1.nodes[n]["type"] == "wordnet_synset" or graph1.nodes[n]["type"] == "object_id":
            assert n in self.graph.nodes

    # add a common parent to combine the isolated graphs created by compression
    root_nodes = [(node, "entity.n.01") for node in self.graph
                  if len(list(self.graph.successors(node))) == 0]
    self.graph.add_node("entity.n.01", color="orange", type="extracted_wordnet_synset")
    self.graph.add_edges_from(root_nodes, relation="IsA")
def computeInfContSimilarity():
    # Load an information content file from the wordnet_ic corpus
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    print("computing Information Content Similarity...")
    tStart = time.time()

    def max_jcn_sims(words1, words2, suffix):
        # For each word of the longer sentence, keep the highest
        # Jiang-Conrath similarity against any word of the other sentence.
        if len(words1) <= len(words2):
            words1, words2 = words2, words1
        sims = np.zeros(len(words1))
        for i, word1 in enumerate(words1):
            for word2 in words2:
                try:
                    w1 = wn.synset(word1 + suffix)
                    w2 = wn.synset(word2 + suffix)
                    sim = wn.jcn_similarity(w1, w2, brown_ic)
                    if sim > sims[i]:
                        sims[i] = sim
                except Exception:
                    continue
        return sims

    # Similarity between the nouns, verbs and adjectives of each sentence pair.
    ALLnouns_sim = [max_jcn_sims(s1, s2, ".n.01")
                    for s1, s2 in zip(nouns_text1, nouns_text2)]
    ALLverbs_sim = [max_jcn_sims(s1, s2, ".v.01")
                    for s1, s2 in zip(verbs_text1, verbs_text2)]
    ALLadjs_sim = [max_jcn_sims(s1, s2, ".a.01")
                   for s1, s2 in zip(adj_text1, adj_text2)]

    tEnd = time.time()
    print("..done. Time taken (InformationContentSimilarity): ", tEnd - tStart)
    return ALLnouns_sim, ALLverbs_sim, ALLadjs_sim
args = parser.parse_args()
wn_embedding_fpath = args.model
threshold = args.threshold
dataset = args.test_set
senseval_fpath = '../data/senseval/' + dataset + '/' + dataset + '.data.xml'
gold_tags_fpath = '../data/senseval/' + dataset + '/' + dataset + '.gold.key.txt'
AVG_METHOD = args.averaging
VECTORIZED_SIMILARITY = args.vectorized
USE_POS_INFO = args.pos
MAX_DEPTH = args.depth
USE_RANDOM = args.random
USE_JCN = True  # if False, lch is used
USE_PAGERANK = False
info_content = wordnet_ic.ic('ic-semcor.dat')

ids, sents, poslist = load_senseval_data(senseval_fpath)
disambiguated = sentence_wsd(ids, sents, poslist)

# load the gold results
with codecs.open(gold_tags_fpath, 'r', 'utf-8') as f:
    lines = f.readlines()
wsd_output = []
gold_output = []
for line in lines:
    id_key_pair = line.split()
    predicted_keys = disambiguated[id_key_pair[0]].split(';')
    gold_keys_set = set(id_key_pair[1:])
    predicted_keys_set = set(predicted_keys)
    if len(predicted_keys_set.intersection(gold_keys_set)) > 0:
        wsd_output.append(predicted_keys[0])
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

threshold = 0.6       # threshold for wup
jcnTreshold = 0.09    # jcn
pathTeshold = 0.1     # path
brown_ic = wordnet_ic.ic('ic-brown.dat')  # load the Brown corpus IC table
lexical_chains = []   # empty list to hold all the chains
dictionary = {}       # dictionary holding the count of each word encountered


# class Chain
class Chain():
    def __init__(self, words, senses, count=0):
        self.words = set(words)
        self.senses = set(senses)
        dictionary[words[0]] = 1  # initialize counter

    def addWord(self, word):
        if len(self.words.intersection([word])) > 0:
            dictionary[word] += 1
        else:
            dictionary[word] = 1
        self.words.add(word)

    def addSense(self, sense):
        self.senses.add(sense)
# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet=True, raise_on_error=True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL.
            # Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet=True, raise_on_error=True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
    "i": ("í", "ï", "î", "ì"),
    "o": ("ó", "ö", "ô", "ò", "ō", "ø"),
    "u": ("ú", "ü", "û", "ù", "ů"),
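# For reference (a sanity-check sketch, not part of the original module):
# the table returned by wordnet_ic.ic() maps a POS tag to a dict from
# synset offsets to IC values, so IC_MAX above ends up holding the largest
# IC observed per POS.
# print(sorted(IC_CORPUS.keys()))  # e.g. ['n', 'v']
# print(IC_MAX['n'])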
                (probe, c, res_sim[0]))
        # creates a vector of all hypernyms
        probe_sense_vec = wn.synsets(probe)
        # Since these values correspond they have to be the same size
        prob_senses = [0 for i in probe_sense_vec]
        # for each sense of the probe
        for sense in range(len(probe_sense_vec)):
            # look at all the most informative senses
            for mi in range(0, len(mis_vec)):
                hyper = lambda s: s.hypernyms()
                ancestors = set(probe_sense_vec[sense].closure(hyper))
                # if an MIS is an ancestor of the probe
                if mis_vec[mi] in ancestors:
                    # increment the probability by the MIS val
                    prob_senses[sense] += sim_vec[mi]
        index = returnMaxIndex(prob_senses)
        try:
            o.write('%s PREFERRED SENSE: %s\n' % (line_num, probe_sense_vec[index]))
        except IndexError:
            o.write('ERROR: NO SIMILARITY\n')


brown_ic = wordnet_ic.ic('/home/jake/nltk_data/corpora/wordnet_ic/ic-brown-add1.dat')
wsd(sys.argv[1], sys.argv[2], sys.argv[3])
def LexicalChain(fileName="amazon.txt", verbose=0):

    def findWholeWord(w):
        return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

    # class Chain
    class Chain():
        def __init__(self, words, senses, count=0):
            self.words = set(words)
            self.senses = set(senses)
            dictionary[words[0]] = 1  # initialize counter

        def addWord(self, word):
            if len(self.words.intersection([word])) > 0:
                dictionary[word] += 1
            else:
                dictionary[word] = 1
            self.words.add(word)

        def addSense(self, sense):
            self.senses.add(sense)

        def getWords(self):
            return self.words

        def getSenses(self):
            return self.getSenses

        def incCount(self):
            self.count += 1

        def setScore(self, sc):
            self.score = sc

        def mfword(self):
            # most frequent word of the chain
            maxfreq = 0
            for word in self.getWords():
                if dictionary[word] > maxfreq:
                    maxword = word
                    maxfreq = dictionary[word]
            return maxword

    def add_word(word):
        maximum = 0
        maxJCN = 0
        flag = 0
        for chain in lexical_chains:  # for all chains that are present
            for synset in wn.synsets(word):  # for all synsets of the current word
                for sense in chain.senses:  # for all senses in the current chain
                    similarity = sense.wup_similarity(synset)  # using wup_similarity
                    if similarity >= maximum:
                        if similarity >= threshold:
                            JCN = sense.jcn_similarity(synset, brown_ic)  # using jcn_similarity
                            if JCN >= jcnTreshold:
                                if sense.path_similarity(synset) >= 0.2:  # using path similarity
                                    if JCN >= maxJCN:
                                        maximum = similarity
                                        maxJCN = JCN
                                        maxChain = chain
                                        flag = 1
        if flag == 1:
            maxChain.addWord(word)
            maxChain.addSense(synset)
            return
        lexical_chains.append(Chain([word], wn.synsets(word)))

    def count_words(summary):
        count = 0
        for line in summary:
            count = count + len(line.split(' '))
        return count

    threshold = 0.6      # threshold for wup
    jcnTreshold = 0.09   # jcn
    pathTeshold = 0.1    # path
    brown_ic = wordnet_ic.ic('ic-brown.dat')  # load the Brown corpus IC table
    lexical_chains = []  # empty list to hold all the chains
    dictionary = {}      # dictionary holding the count of each word encountered
    word_count = 50

    File = open(fileName)  # open file
    lines = File.read()    # read all lines
    clean_lines = clean(lines)
    line_list = [clean_line.text for clean_line in clean_lines]

    is_noun = lambda pos: pos in ('NN', 'NNP', 'NNS', 'NNPS')
    nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines))
             if is_noun(pos)]  # extract all nouns
    for word in nouns:
        add_word(word)

    # score all chains
    for chain in lexical_chains:
        chain_length = 0
        dis_word = 0
        for word in chain.getWords():
            chain_length = chain_length + dictionary[word]
            dis_word = dis_word + 1
        hom = 1 - (dis_word * 1.0 / chain_length)  # homogeneity
        score = 1.0 * chain_length * hom
        chain.setScore(score)

    lexical_chains.sort(key=lambda x: x.score, reverse=True)

    verbose = 1
    if verbose == 1:
        for chain in lexical_chains:
            if chain.score > 0.0:
                for word in chain.getWords():
                    print(str(word + "(" + str(dictionary[word]) + ")") + ',', end=' ')
                print('Score=' + str(chain.score))

    summary = []
    line_flags = []
    line_score = []
    for line in line_list:
        line_flags.append(0)
        line_score.append(0)

    for chain in lexical_chains:
        bigword = chain.mfword()
        chain_score = chain.score
        for i in range(len(line_list)):
            line = line_list[i]
            try:
                x = findWholeWord(bigword)(line)
            except Exception:
                x = None
            if x is not None:
                if line_flags[i] == 0:
                    line_flags[i] = 1
                    line_score[i] = chain_score
                    break

    bias = 20
    tot_score = 0
    for i in range(len(line_score)):
        line_score[i] = (line_score[i] * bias) + 1
    for score in line_score:
        tot_score = tot_score + score
    for i in range(len(line_score)):
        line_score[i] = line_score[i] / tot_score
    print(line_score)

    namscores = dict(zip([sentence.token for sentence in clean_lines], line_score))
    return namscores
def calculate_sim_matrix(self):
    print('calculate_sim_matrix started')
    # One matrix per measure (wup, lch, jcn, lin) plus their average,
    # initialised to a small positive floor value.
    for _ in range(5):
        self.noun_to_noun_sim_matrices.append(
            np.add(np.zeros((self.noun_rows_size, self.noun_rows_size), dtype=float), 0.01))
    inverted_noun_dict = utils.invert_dictionary(self.noun_rows)
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    for key in inverted_noun_dict:
        print(str(key) + ': ' + inverted_noun_dict[key])
    i = 0
    while i < (self.noun_rows_size - 1):
        j = i + 1
        w1 = wordnet.synsets(inverted_noun_dict[i], pos=wordnet.NOUN)
        if not w1:
            print('Not able to find this noun: ' + inverted_noun_dict[i])
            i += 1
            continue
        w1 = w1[0]
        while j < self.noun_rows_size:
            w2 = wordnet.synsets(inverted_noun_dict[j], pos=wordnet.NOUN)
            if not w2:
                j += 1
                continue
            w2 = w2[0]
            value = w1.wup_similarity(w2)
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[0][i][j] = value
            value = w1.lch_similarity(w2) / lch_maximum_obtained_value
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[1][i][j] = value
            value = w1.jcn_similarity(w2, brown_ic)
            value = utils.limit_value(value, 0.01, 1.0, True)
            self.noun_to_noun_sim_matrices[2][i][j] = value
            value = w1.lin_similarity(w2, brown_ic)
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[3][i][j] = value
            value = (self.noun_to_noun_sim_matrices[0][i][j] +
                     self.noun_to_noun_sim_matrices[1][i][j] +
                     self.noun_to_noun_sim_matrices[2][i][j] +
                     self.noun_to_noun_sim_matrices[3][i][j]) / 4.0
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[4][i][j] = value
            j += 1
        print('sim_matrix: ' + str(i) + '\n')
        i += 1
    print('calculate_sim_matrix ended')
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

comb1 = "LOGA-IGFF-COSN"
comb2 = "FREQ-IDFB-COSN"
minRatio = 0.25
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

'''
Given the similarities between several pairs of words, it selects the highest
similarity values for all the words of the dataset that has fewer relevant
words. After that, it assigns a percentage of importance depending on whether
the similarity is total or not, and based on the similarities and the
percentages it calculates a single final value.
'''
def processSimilarities(similarities):
    values = []
    valuesEqualsToOne = 0
    keys1 = len(similarities.keys())
    if keys1 > 0:
        keys2 = len(similarities[list(similarities.keys())[0]].keys())
    else:
        keys2 = 0
def __init__(self, ic_path: str = "ic-brown.dat"):
    self.brown_ic = wordnet_ic.ic(ic_path)
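# Hypothetical usage of the constructor above (the enclosing class is not
# shown in this snippet, so the name `WordnetSimilarity` is an assumption):
# sim = WordnetSimilarity()                 # Brown IC by default
# sim = WordnetSimilarity("ic-semcor.dat")  # or any other IC file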
def similarityWordNet(word1, word2):
    """
    Similarity between two words with NLTK.
    Input:  word1, word2 (String)
    Return: similarity (float)
    """
    word1 = wn.synset(str(word1) + '.n.01')
    word2 = wn.synset(str(word2) + '.n.01')

    # Path similarity: a score in [0, 1] based on the shortest path that
    # connects the senses in the is-a (hypernym/hyponym) taxonomy.
    # A score of 1 represents identity, i.e. comparing a sense with itself
    # returns 1. By default a fake root node is added for verbs, so cases
    # where previously no path could be found (and None was returned) now
    # return a value; set simulate_root=False for the old behavior.
    similarity1 = wn.path_similarity(word1, word2)

    # Leacock-Chodorow similarity: based on the shortest path that connects
    # the senses (as above) and the maximum depth of the taxonomy in which
    # the senses occur; given as -log(p/2d) where p is the shortest path
    # length and d the taxonomy depth. Range up to about 3.6.
    similarity2 = wn.lch_similarity(word1, word2)

    # Wu-Palmer similarity: based on the depths of the two senses and of
    # their Least Common Subsumer (most specific ancestor node). Range up
    # to about 0.92. Note the scores do not always agree with Pedersen's
    # Perl implementation of WordNet Similarity: the LCS is by definition
    # the common ancestor deepest in the taxonomy, not the one closest to
    # the two senses, although typically it does feature in the shortest
    # path. Where multiple LCS candidates exist, the one whose shortest
    # path to the root is longest is selected; where the LCS has multiple
    # paths to the root, the longer path is used.
    similarity3 = wn.wup_similarity(word1, word2)

    # Resnik similarity: based on the Information Content (IC) of the
    # Least Common Subsumer. The result depends on the corpus used to
    # generate the IC and on how the IC was computed. Range roughly 0-8.43.
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    similarity4 = word1.res_similarity(word2, brown_ic)

    print("similarity1: ", similarity1)
    print("similarity2 Leacock-Chodorow: ", similarity2)
    print("similarity3 Wu-Palmer: ", similarity3)
    print("similarity4 Resnik: ", similarity4)
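# Example call (assumes both words have a first noun sense in WordNet):
similarityWordNet("dog", "cat")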
import logging
import sys

import gensim
from nltk.corpus import wordnet_ic

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# algorithm parameters
USE_POS_INFO = True
USE_JCN = True  # if False, lch is used
VECTORIZED_SIMILARITY = True
USE_PAGERANK = False
AVG_METHOD = 'micro'
MAX_DEPTH = 3

senseval_fpath = 'data/senseval/senseval2/senseval2.data.xml'
gold_tags_fpath = 'data/senseval/senseval2/senseval2.gold.key.txt'
wn_embedding_fpath = sys.argv[1]

info_content = wordnet_ic.ic('ic-brown.dat')


def load_fse(path):
    model = {}
    for f_line in gensim.utils.smart_open(path):
        f_line = gensim.utils.to_unicode(f_line)
        res = f_line.strip().split('\t')
        (synset, vector) = res
        model[synset] = vector
    return model


def hamming_distance(pair, vec_dic):
    s0 = vec_dic[pair[0]]
    s1 = vec_dic[pair[1]]
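# Hedged note on the file format load_fse expects, inferred from the
# tab-split unpacking above (the path below is hypothetical): one
# "<synset>\t<vector>" pair per line, e.g.
#
#   dog.n.01<TAB>0.12 -0.03 0.88 ...
#
# model = load_fse('wn_synset_embeddings.tsv')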
if __name__ == "__main__":
    if len(sys.argv) >= 5:  # program name plus the four required arguments
        information_content_file_type = sys.argv[1]
        wsd_test_filename = sys.argv[2]
        judgment_file = sys.argv[3]
        output_filename = sys.argv[4]
    else:
        print("Incorrect number of arguments")
        sys.exit(1)

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    # select the right ic file based on the input parameters
    if information_content_file_type == "nltk":
        wnic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
    else:
        wnic = create_ic(wsd_test_filename, judgment_file, 'hw8_myic.txt')

    with open(output_filename, 'w') as op_file:
        # list to store the answers obtained by the algorithm
        wsd_answers_obtained = []
        with open(wsd_test_filename, 'r') as wsd_file:
            for line in wsd_file:
                line = line.strip('\n')
                line = line.split('\t')
                probe_word = line[0]               # extract probe word
                noun_groups = line[1].split(',')   # extract noun groups
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Preprocessor is assumed to be defined/importable elsewhere in this project.


class WordnetSimilarityEvaluator:
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    def __init__(self):
        # self.fn_docs = prior_case_directory
        # self.file_contents = dict()
        self.preprocessor = Preprocessor()
        # self.populate_file_contents()

    # def populate_file_contents(self):
    #     self.file_contents[self.fn_docs] = dict()
    #     for file in sorted(os.listdir(self.fn_docs),
    #                        key=lambda item: (int(item.partition('_')[2])
    #                        if item[0].isdigit() else float('inf'), item)):
    #         filename = os.fsdecode(file)
    #         if filename.endswith(".txt"):
    #             # print(os.path.join(directory), str(filename))
    #             with open(os.path.join(self.fn_docs, str(filename)), 'r') as f:
    #                 content = self.preprocessor.preprocess(f.read().lower())
    #                 self.file_contents[filename] = self.doc_to_synsets(content)

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None

    def doc_to_synsets(self, doc):
        """
        Returns a list of synsets in document.

        Tokenizes and tags the words in the document doc, then finds the
        first synset for each word/tag combination. If a synset is not
        found for that combination, it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets

        Example:
            doc_to_synsets('Fish are nvqjp friends.')
            Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
        """
        token = nltk.word_tokenize(doc)
        # add parts of speech to tokens
        tag = nltk.pos_tag(token)
        # convert nltk pos tags into wordnet pos tags
        nltk2wordnet = [(i[0], self.convert_tag(i[1])) for i in tag]
        # if a word has no synsets, skip it; otherwise keep its first synset
        output = [wn.synsets(i, z)[0] for i, z in nltk2wordnet
                  if len(wn.synsets(i, z)) > 0]
        return output

    def similarity_score(self, s1, s2):
        """
        Calculate the normalized similarity score of s1 onto s2.

        For each synset in s1, find the synset in s2 with the largest
        similarity value. Sum all of those largest similarity values and
        normalize by dividing by the number of values found.

        Args:
            s1, s2: lists of synsets from doc_to_synsets

        Returns:
            normalized similarity score of s1 onto s2

        Example:
            synsets1 = doc_to_synsets('I like cats')
            synsets2 = doc_to_synsets('I like dogs')
            similarity_score(synsets1, synsets2)
            Out: 0.73333333333333339
        """
        list1 = []
        # For each synset in s1, find the synset in s2 with the largest
        # similarity value. Alternative measures, kept for reference:
        # l = [i.jcn_similarity(a, brown_ic) for i in s2 if i.pos() == a.pos() and
        #      i.jcn_similarity(a, brown_ic) is not None]
        # l = [wn.jcn_similarity(i, a, self.brown_ic) for i in s2 if i.pos() == a.pos() and
        #      i.pos() in self.brown_ic.keys() and a.pos() in self.brown_ic.keys() and
        #      wn.jcn_similarity(i, a, self.brown_ic) is not None]
        for a in s1:
            # Path similarity
            l = [i.path_similarity(a) for i in s2 if i.path_similarity(a) is not None]
            # Wu-Palmer Similarity
            # l = [i.wup_similarity(a) for i in s2 if i.wup_similarity(a) is not None]
            # Leacock-Chodorow Similarity
            # l = [i.lch_similarity(a) for i in s2 if i.pos() == a.pos() and
            #      i.lch_similarity(a) is not None]
            if len(l) > 0:
                list1.append(max(l))
        if len(list1) > 0:
            output = sum(list1) / len(list1)
        else:
            output = 0
        return output

    def document_path_similarity(self, doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2."""
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
        return self.synset_path_similarity(synsets1, synsets2)

    def synset_path_similarity(self, synsets1, synsets2):
        # symmetric average of the two directed similarity scores
        return (self.similarity_score(synsets1, synsets2) +
                self.similarity_score(synsets2, synsets1)) / 2
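# A minimal usage sketch for the evaluator above; it assumes the
# Preprocessor class referenced in __init__ is importable here.
evaluator = WordnetSimilarityEvaluator()
score = evaluator.document_path_similarity('I like cats', 'I like dogs')
print(score)  # roughly 0.73 with path similarity, per the docstring example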
import ast  # safer literal parsing than eval
import collections
import heapq

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

import prepare_results

lemmatizer = WordNetLemmatizer()
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')


def generate_test_dictionary():
    with open('data.txt', 'r', encoding='utf-8') as f:
        lines = [line for line in f.read().splitlines() if line]
    lines_dict = collections.defaultdict()
    for line in lines:
        # rewrite a "(key, value)" line into "{key: value}" and parse it
        string1 = str(line).replace('(', '{', 1).replace(',', ':', 1)
        string2 = string1[::-1].replace(')', '}', 1)
        string3 = string2[::-1]
        # literal_eval avoids the arbitrary code execution risk of eval
        dictionary = ast.literal_eval(string3)
        lines_dict.update(dictionary)
    return lines_dict


def lemmatize_translations(dictionary):
    """Normalizes translations by lemmatizing and rendering in lower case."""
    lemmatizer = WordNetLemmatizer()
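# A small, self-contained illustration of the line rewriting performed in
# generate_test_dictionary above; the sample data is made up:
line = "('dog', ['hound', 'pup'])"
s = line.replace('(', '{', 1).replace(',', ':', 1)
s = s[::-1].replace(')', '}', 1)[::-1]
print(ast.literal_eval(s))  # {'dog': ['hound', 'pup']}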
def __init__(self):
    self.scaler = StandardScaler()
    self.brown_ic = wordnet_ic.ic('ic-brown.dat')
                continue
            sent.append(define)
            continue
        else:
            sent.append(word)
    sentence = ' '.join(sent)
    return sentence

replace_word_with_def(text, tags)

# import corpora for the similarity measures
from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet   # needed by jcn_sim below
import numpy as np                # needed by jcn_sim below
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

# http://www.nltk.org/howto/wordnet.html
# similarity based on the Jiang-Conrath measure
def jcn_sim(word1, word2):
    word1 = str(word1)
    word2 = str(word2)
    try:
        w1 = wordnet.synset(wordnet.synsets(word1)[0].name())
    except IndexError:
        return np.nan
    try:
        w2 = wordnet.synset(wordnet.synsets(word2)[0].name())
    except IndexError:
        return np.nan
    # assumed completion (the original snippet is cut off here):
    # Jiang-Conrath similarity using the Brown IC loaded above
    return w1.jcn_similarity(w2, brown_ic)
def __max_similarity(synsets1, synsets2, similarity_function):
    # signature assumed from the body and the call sites below
    # (the def line is cut off in the source)
    similarities = [
        similarity_function(ss1, ss2)
        for ss1 in synsets1
        for ss2 in synsets2
    ]
    return max(similarities) if len(similarities) != 0 else 0.0


def path_similarity(synsets1, synsets2):
    return __max_similarity(synsets1, synsets2, wn.path_similarity)


def lch_similarity(synsets1, synsets2):
    return __max_similarity(synsets1, synsets2, wn.lch_similarity)


from nltk.corpus import wordnet_ic
corpus = wordnet_ic.ic('ic-brown.dat')


def lin_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.lin_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)


def jcn_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.jcn_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)


def res_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.res_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)
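# A minimal usage sketch for the wrappers above: compare all noun senses
# of two words and keep the best-scoring pair.
dogs = wn.synsets('dog', pos=wn.NOUN)
cats = wn.synsets('cat', pos=wn.NOUN)
print(path_similarity(dogs, cats))  # best path similarity over all sense pairs
print(lin_similarity(dogs, cats))   # best Lin similarity under the Brown IC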
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

threshold = 0.6      # threshold for wup
jcnTreshold = 0.09   # jcn
pathTeshold = 0.1    # path
brown_ic = wordnet_ic.ic('ic-brown.dat')  # load the Brown information content

lexical_chains = []  # empty list to hold all the chains
dictionary = {}      # empty dictionary to hold the count of each word encountered


# class Chain
class Chain():
    def __init__(self, words, senses, count=0):
        self.words = set(words)
        self.senses = set(senses)
        dictionary[words[0]] = 1  # initialize counter

    def addWord(self, word):
        if len(self.words.intersection([word])) > 0:
            dictionary[word] += 1
        else:
            dictionary[word] = 1
            self.words.add(word)

    def addSense(self, sense):
        self.senses.add(sense)
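# A minimal usage sketch for the Chain class above (the example words are
# made up): seed a chain, extend it, and inspect the global counter.
chain = Chain(['machine'], [wn.synsets('machine')[0]])
chain.addWord('machine')  # word already in the chain: count goes to 2
chain.addWord('engine')   # new word: added to the chain with count 1
chain.addSense(wn.synsets('engine')[0])
print(dictionary)         # {'machine': 2, 'engine': 1}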
def bind_kernel(
        features=None,  # Must be provided if syntax_feature_types is True
        syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
        semantic_similarity='res',
        include_suffix=True,
        syntactic_multiplier=0.33,
        semantic_multiplier=0.33,
        suffix_multiplier=0.33):
    '''
    Returns a kernel function that has a given dictionary and features
    lookup bound to its scope.
    '''
    # Validate that a sensible value for semantic similarity was provided
    semantic_similarity_is_valid = (
        semantic_similarity in LEGAL_SIMILARITIES
        or semantic_similarity is None)
    if not semantic_similarity_is_valid:
        raise ValueError(
            'semantic_similarity must be one of the following: '
            + ', '.join(LEGAL_SIMILARITIES)
            + '. Got %s.' % repr(semantic_similarity))

    # Validate that a sensible value for syntactic similarity was provided
    syntactic_similarity_is_valid = syntax_feature_types is None or all(
        feature_type in LEGAL_SYNTACTIC_SIMILARITIES
        for feature_type in syntax_feature_types)
    if not syntactic_similarity_is_valid:
        raise ValueError(
            'syntax_feature_types must be a list with any of the following: '
            + ', '.join(LEGAL_SYNTACTIC_SIMILARITIES)
            + '. Got %s.' % repr(syntax_feature_types))

    # Semantic similarity functions need an "information content" file
    # to calculate similarity values.
    if semantic_similarity is not None:
        information_content = wordnet_ic.ic(INFORMATION_CONTENT_FILE)

    def kernel(A, B):
        '''
        Custom kernel function. This counts how often the links incident
        on two different words within their respective dependency trees
        are the same, up to the dependency relation and the POS of the
        neighbour. Note that A references one set of words' dependency
        trees and B references another, so this function ends up making
        len(A) * len(B) such comparisons and returns the result as a
        len(A) by len(B) matrix.
        '''
        result = []
        for a in A:
            token_a = u.ensure_unicode(features.get_token(int(a[0])))

            # Get token_a's dependency tree features
            if syntax_feature_types is not None:
                syntax_features_a = features.get_features_idx(
                    int(a[0]), syntax_feature_types)

            # Get token_a's synsets if semantic similarity is being used
            if semantic_similarity is not None:
                semantic_features_a = nouns_only(wordnet.synsets(token_a))

            if include_suffix:
                suffix_a = features.get_suffix(token_a)

            result_row = []
            result.append(result_row)
            for b in B:
                kernel_score = 0
                token_b = u.ensure_unicode(features.get_token(int(b[0])))

                # Calculate the dependency tree kernel
                if syntax_feature_types is not None:
                    syntax_features_b = features.get_features_idx(
                        int(b[0]), syntax_feature_types)
                    kernel_score += syntactic_multiplier * dict_dot(
                        syntax_features_a, syntax_features_b)

                # Calculate semantic similarity if it is being used
                if semantic_similarity is not None:
                    semantic_features_b = nouns_only(wordnet.synsets(token_b))
                    kernel_score += semantic_multiplier * max_similarity(
                        semantic_similarity, semantic_features_a,
                        semantic_features_b, information_content)

                # Determine if the suffixes match
                if include_suffix:
                    suffix_b = features.get_suffix(token_b)
                    if suffix_a is not None and suffix_a == suffix_b:
                        kernel_score += suffix_multiplier

                result_row.append(kernel_score)
        return result

    return kernel
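# max_similarity is called inside the kernel above but not defined in this
# snippet. Below is a plausible sketch under the best-pair convention used
# elsewhere in this file, not the author's actual implementation: score
# every synset pair with the chosen measure and keep the maximum.
def max_similarity(metric, synsets_a, synsets_b, information_content):
    def score(ss1, ss2):
        if metric == 'res':
            return ss1.res_similarity(ss2, information_content)
        if metric == 'lin':
            return ss1.lin_similarity(ss2, information_content)
        if metric == 'jcn':
            return ss1.jcn_similarity(ss2, information_content)
        return ss1.path_similarity(ss2) or 0
    scores = [score(a, b) for a in synsets_a for b in synsets_b]
    return max(scores) if scores else 0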
import sys
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
import unitConversion as uc
import utils

FOLD = None

with open("names.txt") as f:
    NAMES = [x.strip() for x in f.readlines()]


class aset:
    def __init__(self, num=None, entity=None, surface=None, idx=None):
        self.num = num
        self.entity = entity
        self.surface = surface
        self.idx = idx
        self.widx = (idx % 1000) + 1 if idx is not None else None
        self.container = None
        self.verbs = None
        self.adjs = None
        self.location = None
        self.contains = None
        self.compound = 0
        self.subtypes = []
        self.type_failure = 0
        self.origs = idx // 1001 if idx is not None else None
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger   # needed for the NER tagger below
from nltk.corpus import stopwords
from nltk.corpus import wordnet_ic       # needed for the IC files below
from gensim import corpora, models, similarities
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
import re
import itertools
import codecs
import pprint as pp
import operator
from collections import OrderedDict
from collections import Counter
import json

brown_ic = wordnet_ic.ic('ic-brown.dat')
ic_bnc_plus1 = wordnet_ic.ic('ic-bnc-add1.dat')

NERModelPath = "C:/StanfordNER/nlp/models/ner/"
NERModel = "english.conll.4class.caseless.distsim.crf.ser.gz"
# NOTE: the 4 classes are Person, Location, Organization, Misc
NER = StanfordNERTagger(NERModelPath + NERModel)

# FOR POS Tagger:
POSJar = "C:/StanfordPOS/stanford-postagger.jar"
POSTaggerPath = "C:/StanfordPOS/models/"
POSTagger = 'english-bidirectional-distsim.tagger'
POSModel = POSTaggerPath + POSTagger
st = StanfordPOSTagger(POSModel, POSJar)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
def similarity_by_infocontent(sense1, sense2, option):
    """ Returns similarity scores by information content. """
    if sense1.pos() != sense2.pos():  # IC similarity can't compare different POS.
        return 0

    info_contents = ['ic-bnc-add1.dat', 'ic-bnc-resnik-add1.dat',
                     'ic-bnc-resnik.dat', 'ic-bnc.dat',
                     'ic-brown-add1.dat', 'ic-brown-resnik-add1.dat',
                     'ic-brown-resnik.dat', 'ic-brown.dat',
                     'ic-semcor-add1.dat', 'ic-semcor.dat',
                     'ic-semcorraw-add1.dat', 'ic-semcorraw-resnik-add1.dat',
                     'ic-semcorraw-resnik.dat', 'ic-semcorraw.dat',
                     'ic-shaks-add1.dat', 'ic-shaks-resnik.dat',
                     'ic-shaks-resnink-add1.dat', 'ic-shaks.dat',
                     'ic-treebank-add1.dat', 'ic-treebank-resnik-add1.dat',
                     'ic-treebank-resnik.dat', 'ic-treebank.dat']

    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2, wnic.ic('ic-bnc-resnik-add1.dat'))
        #return min(wn.res_similarity(sense1, sense2, wnic.ic(ic)) \
        #           for ic in info_contents)
    elif option in ['jcn', "jiang-conrath"]:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
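# A minimal usage sketch for similarity_by_infocontent, assuming the wn
# and wnic aliases used above (nltk.corpus.wordnet, nltk.corpus.wordnet_ic).
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print(similarity_by_infocontent(dog, cat, 'res'))  # Resnik with the BNC resnik-add1 IC
print(similarity_by_infocontent(dog, cat, 'lin'))  # Lin with the BNC add1 IC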