Example #1
def similarity(word1, word2, tag):
    obj1 = wn.synset(word1 + "."+ tag+".01")
    obj2 = wn.synset(word2 + "."+ tag+".01")
    #print(obj1)
    brown_ic = wordnet_ic.ic('ic-brown.dat')    # Information content from the Brown corpus
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')  # SemCor-based IC (loaded but unused below)
    value = obj1.res_similarity(obj2, brown_ic)
    return value
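A hedged usage sketch for the helper above; the imports are assumptions (they are not shown in the snippet):

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Resnik similarity between dog.n.01 and cat.n.01, scored with Brown-corpus IC
print(similarity("dog", "cat", "n"))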
Example #2
 def test_wordnet_similarities(self):
     # Path based similarities.
     self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
     self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
     self.assertAlmostEqual(S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3)
     self.assertAlmostEqual(S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3)
     # Information Content similarities.
     brown_ic = wnic.ic('ic-brown.dat')
     self.assertAlmostEqual(S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3)
     semcor_ic = wnic.ic('ic-semcor.dat')
     self.assertAlmostEqual(S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3)
Example #3
File: answer.py Project: danigarabato/qa
    def _other_recognition(self, tagged_sentences, all_entities, question):
        # Nouns retrieval
        nouns = []
        for sentence in tagged_sentences:
            nouns += filter(lambda x: x[1] == "NN", sentence)
        nouns = [noun for (noun, tag) in nouns]

        # Nouns filtering
        # Remove all entities that are nouns
        all_entities = set(itertools.chain(*map(str.split, all_entities)))
        nouns = [noun for noun in nouns if noun not in all_entities]

        features = QuestionClassifier.get_features(question.text, "hn")
        head = features["head"]
        if head == "":
            return nouns

        # Filter nouns with WordNet synsets
        try:
            threshold = float(MyConfig.get("answer_extraction", "other_threshold"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            threshold = 0.6

        try:
            ic = wordnet_ic.ic(MyConfig.get("answer_extraction", "ic"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            ic = wordnet_ic.ic("ic-bnc.dat")

        result = []

        head_synsets = wn.synsets(head, pos=wn.NOUN)
        if len(head_synsets) == 0:
            noun_synsets = wn.synsets(features["noun"], pos=wn.NOUN)
            if len(noun_synsets) == 0:
                return nouns
            else:
                head_synset = noun_synsets[0]
        else:
            head_synset = head_synsets[0]

        for noun in nouns:
            try:
                noun_synset = wn.synsets(noun, pos=wn.NOUN)[0]
                if threshold < noun_synset.lin_similarity(head_synset, ic) < 0.9:
                    result.append(noun)
            except IndexError:
                continue

        return result
Example #4
def test():
    col = nltk.TextCollection(nltk.corpus.brown)
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    sc = SimilarityCalculator(col, 'bp', brown_ic)
    sentence1 = preprocess("The jurors were taken into the courtroom in groups of 40 and asked to fill out a questionnaire.")
    sentence2 = preprocess("About 120 potential jurors were being asked to complete a lengthy questionnaire.")
    print sc.similarity_bidirectional(sentence1, sentence2)
Example #5
File: sim_link.py Project: agarsev/grafeno
 def __init__ (self, sim_threshold = 0.1, sim_weight = 1, **kwds):
     global brown_ic
     super().__init__(**kwds)
     if not brown_ic:
         brown_ic = wordnet_ic.ic('ic-brown.dat')
     self.__threshold = sim_threshold
     self.__weight = sim_weight
 def get_similarity(self, synsets1, synsets2):
     brown_ic = wordnet_ic.ic("ic-brown.dat")
     max_value = 0
     for synset1 in synsets1:
         for synset2 in synsets2:
             value = wn.res_similarity(synset1, synset2, brown_ic)
             if value > max_value:
                 max_value = value
     return max_value
Example #7
def sensesim(ss1,ss2,metric):

    if metric=='path':
        sim=ss1.path_similarity(ss2)
    elif metric=='lin':
        sim=ss1.lin_similarity(ss2,wn_ic.ic('ic-brown.dat'))
    elif metric=='jcn':
        sim=ss1.jcn_similarity(ss2,wn_ic.ic('ic-brown.dat'))
    elif metric=='res':
        sim=ss1.res_similarity(ss2,wn_ic.ic('ic-brown.dat'))
    elif metric=='lch':
        sim=ss1.lch_similarity(ss2)
    elif metric=='wup':
        sim=ss1.wup_similarity(ss2)
    else:
        print "Unknown metric", metric
        sim=0
    return sim
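A hedged usage sketch; it assumes the module imports wordnet as wn and wordnet_ic as wn_ic from nltk.corpus, which this snippet does not show:

dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print sensesim(dog, cat, 'res')   # Resnik similarity using Brown-corpus IC
print sensesim(dog, cat, 'wup')   # Wu-Palmer similarity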
    def __init__(self,parameters):

        self.parameters=parameters
        self.wn_sim=self.parameters.get("wn_sim",Analyser.simmetric)
        self.ic=wn_ic.ic('ic-semcor.dat')
        self.candidates={}
        self.synsetthresh=self.parameters.get("synset_thresh",Analyser.synsetthresh)
        self.totalthresh=self.parameters.get("total_thresh",Analyser.totalthresh)
        self.propthresh=self.parameters.get("prop_thresh",Analyser.propthresh)
        self.simthresh=self.parameters.get("sim_thresh",Analyser.simthresh)
    def get_lin_distance(self, word1, word2):
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        if len(wn.synsets(word1)) == 0 or len(wn.synsets(word2)) == 0:
            return 0

        target1 = wn.synsets(word1)[0]
        target2 = wn.synsets(word2)[0]

        try:
            result = target1.lin_similarity(target2, brown_ic)
            return result
        except:
            return 0
Example #10
File: semanticsim.py Project: alee101/wsd
def similarity_by_path(sense1, sense2, option="path"):
  """ Returns maximum path similarity between two senses. """
  if option.lower() in ["path", "path_similarity"]: # Path similarity
    return max(wn.path_similarity(sense1, sense2),
               wn.path_similarity(sense2, sense1))
  elif option.lower() in ["wup", "wupa", "wu-palmer"]: # Wu-Palmer
    return wn.wup_similarity(sense1, sense2)
  elif option.lower() in ['lch', "leacock-chordorow"]: # Leacock-Chodorow
    if sense1.pos != sense2.pos: # lch can't do diff POS
      return 0
    return wn.lch_similarity(sense1, sense2)
  elif option.lower() in ["lin"]: # Lin, using BNC information content
    return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
def single_jiang_conrath(cast_no1, cast_no2, syn_dict):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    synsets1 = syn_dict[cast_no1]
    synsets2 = syn_dict[cast_no2]
    total_sim = 0.0
    no_of_comparisons = 0.0
    for original_syn in synsets1:
        for syn1 in synsets1:
            if len(synsets1) != 0 and syn1 is not None:
                for syn2 in synsets2:
                    if len(synsets2) != 0 and syn2 is not None and syn1.pos() == syn2.pos() and ((syn1.pos() == "n") or (syn2.pos() == "v")):
                        # Jiang-Conrath similarity (lch_similarity does not take an information-content argument)
                        sim = syn1.jcn_similarity(syn2, brown_ic)
                        total_sim = total_sim + sim
                        no_of_comparisons += 1
    if no_of_comparisons == 0.0:
        return 0.0
    return total_sim/no_of_comparisons
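A hedged usage sketch for single_jiang_conrath; the syn_dict entries below are purely illustrative and the wn / wordnet_ic imports are assumed:

syn_dict = {
    "cast0": wn.synsets("dog", pos=wn.NOUN),   # hypothetical cast entries
    "cast1": wn.synsets("cat", pos=wn.NOUN),
}
print(single_jiang_conrath("cast0", "cast1", syn_dict))   # mean pairwise JCN score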
Example #12
def main():
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    human_sims = parseFile("input.txt")

    lin_sims = linSimilarities(human_sims.keys(), brown_ic)
    res_sims = resSimilarities(human_sims.keys(), brown_ic)
    #print "Initializing Model"
    model = None
    model = gensim.models.Word2Vec()
    model = model.load_word2vec_format(RESOURCES+'glove_model.txt', binary=False)
    #print "Model created calling vec Sim"
    vec_sims = vecSimilarities(human_sims.keys(), model)
    #print "AFter call to vec Sim"
    lin_score = 0
    res_score = 0
    vec_score = 0

    print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format('word1','word2', 
                                                             'human', 'Lin', 
                                                             'Resnik', 'Word2Vec')
    for key, human in human_sims.items():
        try:
            lin = lin_sims[key]
        except:
            lin = 0
        lin_score += (lin - human) ** 2
        try:
            res = res_sims[key]
        except:
            res = 0
        res_score += (res - human) ** 2
        try:
            vec = vec_sims[key]
        except:
            vec = 0
        vec_score += (vec - human) ** 2
        firstword = key.partition('(')[-1].rpartition(',')[0]
        secondword = key.partition(',')[-1].rpartition(')')[0]
        secondword = secondword.strip()
        print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format(firstword, secondword, human,
                                                                 lin, res, vec)

    num_examples = len(human_sims)
    print "\nMean Squared Errors"
    print "Lin method error: %0.2f" % (lin_score/num_examples) 
    print "Resnick method error: %0.2f" % (res_score/num_examples)
    print "Vector-based method error: %0.2f" % (vec_score/num_examples)
Example #13
def lexical_compare(lemma_text,lemma_hypothesis):
	similarity_score = 0
	brown_ic = wordnet_ic.ic('ic-brown.dat')
	if re.search(lemma_text,lemma_hypothesis,re.M|re.I):
		return 50
	hypo_synset = wn.synsets(lemma_hypothesis)
	text_synset = wn.synsets(lemma_text)
	synset_index = get_index(hypo_synset, text_synset)
	if synset_index == -1:
		return 0	
	if len(hypo_synset) > 0 and len(text_synset) > 0:
		# path_similarity and wup_similarity do not take an information-content argument
		similarity_score = hypo_synset[synset_index].path_similarity(text_synset[0])
		similarity_score += hypo_synset[synset_index].wup_similarity(text_synset[0])
		similarity_score += hypo_synset[synset_index].lin_similarity(text_synset[0], brown_ic)
		similarity_score += hypo_synset[synset_index].res_similarity(text_synset[0], brown_ic)
	return similarity_score
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets)**2
    
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {} 
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out,syn_in), wn.path_similarity(syn_in,syn_out))
        
            if i % 10000 == 0:
                print i, 'synsets processed out of',len(global_synsets)**2, '(',float(i)/(t),'%)'
            i += 1

    tuples = [(i[0], i[1].values()) for i in similarity_dict.items()] 
    vectors = [np.array(tup[1]) for tup in tuples]

    
    # Rule of thumb for the number of clusters; KMeans requires an integer
    n = int(sqrt(len(global_synsets)/2))
    print "Number of clusters", n
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
        
    pprint.pprint(dict(clustering), width=1)
    
    feature_vector = np.zeros([len(corpus),n])
    
    for i,comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print i, 'comments processed'
        
    print feature_vector
Example #15
File: wnsim.py Project: julieweeds/ANLE
def wnsim(word1,word2,ps = wn.NOUN, metric='path',ic=wn_ic.ic('ic-brown.dat')):

    #function to calculate wn similarity of two words
    #maximises similarity over all senses of the given part of speech (by default noun)

    ss1=wn.synsets(word1,pos=ps)
    ss2=wn.synsets(word2,pos=ps)

    maxsim = 0
    for s1 in ss1:
        for s2 in ss2:
            thissim = sssim(s1,s2,metric,ic)
            if thissim>maxsim:
                maxsim=thissim

    #print maxsim
    return maxsim
Example #16
def computeLinSimilarity(term1, term2):
    global ic
    if not ic:
        #ic = wordnet_ic.ic('ic-semcor.dat')
        ic = wordnet_ic.ic('ic-brown.dat')
    w1_syns = wn.synsets(term1)
    w2_syns = wn.synsets(term2)
    maxsim = 0
    for w1s in w1_syns:
        for w2s in w2_syns:
            try:
                sim = wn.lin_similarity(w1s, w2s, ic)
                if sim > maxsim:
                    maxsim = sim
            except Exception:
                pass
    return maxsim
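A hedged usage sketch; it assumes the module also imports wordnet as wn and wordnet_ic from nltk.corpus and declares a module-level ic that the global statement above refers to:

ic = None   # module-level IC cache assumed by computeLinSimilarity
print(computeLinSimilarity("dog", "cat"))   # maximum Lin similarity over all synset pairs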
Example #17
def lin_truth():
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    content = [word.strip() for word in open(Input2)]
    truth_arr = []

    for i in content:
        similarity = []
        synA = wordnet.synset(i + ".n.01")

        for j in content:
            synB = wordnet.synset(j + ".n.01")
            sim = synA.lin_similarity(synB, semcor_ic)
            similarity.append(sim)

        truth_arr.append(similarity)

    D = ss.csr_matrix(np.array(truth_arr, dtype=np.float64))
    return D
Example #18
File: B.py Project: saniaarif22/NLP
def main():
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    human_sims = parseFile("input.txt")

    lin_sims = linSimilarities(human_sims.keys(), brown_ic)
    res_sims = resSimilarities(human_sims.keys(), brown_ic)

    model = None
    model = gensim.models.Word2Vec()
    model = model.load_word2vec_format(RESOURCES+'glove_model.txt', binary=False)
    vec_sims = vecSimilarities(human_sims.keys(), model)
    
    lin_score = 0
    res_score = 0
    vec_score = 0

    print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format('word1','word2', 
                                                             'human', 'Lin', 
                                                             'Resnik', 'Word2Vec')
    for key, human in human_sims.items():
        try:
            lin = lin_sims[key]
        except:
            lin = 0
        lin_score += (lin - human) ** 2
        try:
            res = res_sims[key]
        except:
            res = 0
        res_score += (res - human) ** 2
        try:
            vec = vec_sims[key]
        except:
            vec = 0
        vec_score += (vec - human) ** 2
        print '{0:15} {1:15} {2:10} {3:20} {4:20} {5:20}'.format(key[0], key[1], human, 
                                                                 lin, res, vec)

    num_examples = len(human_sims)
    print "\nMean Squared Errors"
    print "Lin method error: %0.2f" % (lin_score/num_examples) 
    print "Resnick method error: %0.2f" % (res_score/num_examples)
    print "Vector-based method error: %0.2f" % (vec_score/num_examples)
Example #19
def main(fname):
    lyrics = preprocess_lyrics(fname)
    collection = nltk.TextCollection(nltk.corpus.brown)
    ic = wordnet_ic.ic('ic-brown.dat')
    thresh_counts = {}
    for similarity in SimilarityCalculator.SIMILARITIES.keys() + ['bp_adj']:
        scores = []
        output_fname = os.path.join('output', similarity + '.txt')
        pickled_fname = output_fname + '.pickled'
        img_fname = os.path.join('output', similarity + '_hist.png')
        if os.path.exists(output_fname):
            continue
        now = datetime.datetime.now()
        print '[{}] Starting calculation on {}'.format(str(now), similarity)
        if similarity == 'bp':
            adjust_bp()
        if os.path.exists(pickled_fname):
            scores = [score for couplet, score in
                      pickle.load(open(pickled_fname, 'r'))]
        else:              
            sc = SimilarityCalculator(collection, similarity, ic)
            for lyric1, lyric2 in pairwise(lyrics):
                scores.append(sc.similarity_bidirectional(lyric1, lyric2))
        thresh_counts[similarity] = print_report(open(output_fname, 'w'),
                                                 scores,
                                                 open(fname, 'r').read().
                                                 split('\n'),
                                                 open(pickled_fname, 'w'),
                                                 img_fname)
        now = datetime.datetime.now()
        print '[{}] Finished calculation on {}'.format(str(now), similarity)
    plt.clf()
    for similarity in thresh_counts.keys():
        res = list(thresh_counts[similarity].iteritems())
        res.sort()
        res = zip(*res)
        plt.plot(res[0], res[1], label=similarity, zorder=1)
        plt.scatter(res[0], res[1], zorder=2)
    plt.legend()
    plt.xlabel("threshold")
    plt.ylabel("no. lyrics selected")
    plt.savefig(os.path.join("output", "thresholds.png"))
Example #20
def check_robustpca(trainCollection, testCollection, feature):
    ready = True
    
    # check matlab    
    if not check_matlab():
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'Matlab is not available or incorrectly configured.')
        ready = False
    
    # check if knn is available
    if not check_knn(trainCollection, testCollection, feature):
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'KNN is not available.')        
        ready = False

    # check data files
    datafiles = [ os.path.join(ROOT_PATH, trainCollection, 'TextData', 'id.userid.lemmtags.txt'),
                  os.path.join(ROOT_PATH, trainCollection, 'FeatureData', feature)]
    res = find_missing_files(datafiles)
    if res:
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'the following files or folders are missing:\n%s' % res)
        return False    
              
    # check external dependencies  
    try:
        import h5py
        import numpy
        import scipy.io
        import scipy.sparse
        from nltk.corpus import wordnet as wn
        from nltk.corpus import wordnet_ic
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        wn.morphy('cat')
        wn.synsets('cat', pos=wn.NOUN)
    except Exception, e:
        try:
            import nltk
            nltk.download('brown')
            nltk.download('wordnet')
            nltk.download('wordnet_ic')
        except Exception, e:
            print e
            ready = False

    return ready
def jiang_conrath(syn_dict, cast_no):
    this_cast_syns = syn_dict[cast_no]
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    jc_sims = OrderedDict()
    i = 0
    while i < len(syn_dict):
        total_sim = 0.0
        key = 'cast' + str(i)
        no_of_comparisons = 0.0
        for original_syn in this_cast_syns:
            if len(this_cast_syns) != 0 and original_syn is not None:
                for comparison_syn in syn_dict[key]:
                    if len(syn_dict[key]) != 0 and comparison_syn is not None and original_syn.pos() == comparison_syn.pos() and ((original_syn.pos() == "n") or (original_syn.pos() == "v")):
                        # Jiang-Conrath similarity (lch_similarity does not take an information-content argument)
                        sim = original_syn.jcn_similarity(comparison_syn, brown_ic)
                        total_sim = total_sim + sim
                        no_of_comparisons += 1
        i += 1
        if no_of_comparisons != 0.0:
            jc_sims[key] = total_sim/no_of_comparisons
        else:
            jc_sims[key] = total_sim
    return jc_sims
Example #22
File: dst.py Project: f00barin/distrib
    def jcn(self):
        semcor_ic = wordnet_ic.ic('ic-semcor.dat')
        content_a = [word.strip() for word in open(self.wordset_a)]
        content_b = [word.strip() for word in open(self.wordset_b)]

        truth_mat = np.zeros(shape=(len(content_a), len(content_b)))

        x = 0

        for i in content_a:
            y = 0
            synA = wordnet.synset(i + ".n.01")

            for j in content_b:

                synB = wordnet.synset(j + ".n.01")
                sim = synA.jcn_similarity(synB, semcor_ic)
                truth_mat[x, y] = sim
                y += 1
            x += 1

        return truth_mat
Example #23
def writeTranslatedWN(mapping, output, balanced=True):
  from nltk.corpus import wordnet_ic
  from nltk.corpus import wordnet as wn
  ic = wordnet_ic.ic("ic-bnc-resnik-add1.dat")

  removed = set([])

  # We gave them hyponym counts, so we don't need to propagate counts
  o = OntologyWriter(output, propagate_counts=False)

  for ii in orderedTraversal(wn, pos='n', reverse_depth=True):
    children = [x.offset for x in ii.hyponyms() + ii.instance_hyponyms() \
                if not x.offset in removed]
    words = mapSynsetWords(mapping, ii, balanced)
    if len(children) == 0 and len(words) == 0:
      removed.add(ii.offset)

  print ("%i synsets removed" % len(removed))
  for ii in orderedTraversal(wn, pos='n'):
    children = [x.offset for x in ii.hyponyms() + ii.instance_hyponyms() \
                if not x.offset in removed]
    hyponym_count = sum(ic['n'][x] for x in children)
    information_contribution = ic['n'][ii.offset] - hyponym_count

    words = mapSynsetWords(mapping, ii, balanced)

    assert information_contribution > 0.0 or information_contribution == 0.0 \
           and len(ii.lemmas) == 0, "Synset %i had no information" % ii.offset
    if len(words) > 0:
      per_word_contribution = information_contribution / float(len(words))
      words = [x + (per_word_contribution,) for x in words]

    # Add synsets if they're not vestigial leaves
    if len(children) > 0 or len(words) > 0:
      o.AddSynset(ii.offset, ii.name, children, words, hyponym_count)
  o.Finalize()
Example #24
def wordnet_similarity(words, sim_measure, wnlabels):
    sims = {}
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    for word in words:
        w = word[:word.find(":")]
        senses = wn.synsets(w, wn.NOUN)
        if len(senses) < 1: continue #TODO
        right_sense = wnlabels._correct_sense(senses, w)

        targets = []

        for othword in words:
            if word == othword: continue

            othw = othword[:othword.find(":")]
            othw_senses = wn.synsets(othw, wn.NOUN)
            if len(othw_senses) < 1: continue
            othw_right_sense = wnlabels._correct_sense(othw_senses, othw)

            #print w, othw, senses[right_sense], othw_senses[othw_right_sense]

            if sim_measure == "jcn":
                sim = senses[right_sense].jcn_similarity(othw_senses[othw_right_sense], brown_ic)
            elif sim_measure ==  "wup":
                sim = senses[right_sense].wup_similarity(othw_senses[othw_right_sense])
            elif sim_measure == "path":
                sim = senses[right_sense].path_similarity(othw_senses[othw_right_sense])


            targets.append([othword, sim])

        targets = sorted(targets, reverse=True, key=itemgetter(1))
        #print word, targets
        sims[word] = targets
    return sims
Example #25
 def __init__(self):
     self.brown_ic = wordnet_ic.ic('ic-brown.dat')
Example #26
    def __init__(self, utterance_sep, path_output_lu_parses, path_output_parses,
                 parser_path, cfg_rules_path, pos_tagger_path=None, path_to_freq_norms=None, path_to_image_norms=None,
                 path_to_dictionary=None, lu_analyzer_path=None, path_to_anew=None, path_to_warringer=None, do_wnic=False,
                 path_to_rst_python=None, path_to_rst=None, path_output_rst=None, path_to_stanford_cp=None,
                 path_to_mpqa_lexicon=None, path_to_lda_model=None, path_to_lda_wordids=None, do_lexical=True,
                 do_syntactic=True, do_semantic=True, do_pragmatic=False, lexical_list=None, syntactic_list=None,
                 semantic_list=None, pragmatic_list=None):
        '''Parameters:
        source_transcript : list of strings. Full paths to directories containing transcripts (with no filler annotations)
        source_transcript_fillers : list of string. Full paths to a directories containing transcripts with filler annotations
        utterance_sep : string. The string that delimits utterance boundaries in the transcript
        path_lu_output_parses : string. The absolute path to a directory that will store the Lu features and parses.
        path_output_parses : string. The absolute path to a directory that will store the parse trees produced for the data.
        parser_path : string. The absolute path to a directory containing a Stanford lexparser
        cfg_rules_path : string. The absolute path to a file containing cfg productions to be extracted (one per line)
        path_output_lda_topics: string. The absolute path to the csv file where key-value topics will be stored.
        pos_tagger_path : optional, string. Full path to a directory containing a Stanford POS tagger
        path_to_freq_norms : optional, string. Full path to a file containing frequency norms
        path_to_image_norms : optional, string. Full path to a file containing imageability norms
        path_to_dictionary : optional, string. Full path to a file containing valid words for the language
        lu_analyzer_path : optional
        path_to_rst_python : optional, string. Full path to virtualenv python, for RST
        path_to_rst : optional, string. Full path to folder with RST's 'parse.py'
        path_output_rst: optional, string. Full path to where RST stores its results
        path_to_lda_model : string. Full path to trained LDA model.
        path_to_lda_wordids : string. Full path to word IDs used in trained LDA model.
        '''

        self.utterance_sep = utterance_sep

        self.output_rst_dir = os.path.abspath(path_output_rst)
        self.output_parse_dir = os.path.abspath(path_output_parses)
        self.output_lu_parse_dir = os.path.abspath(path_output_lu_parses)

        self.pos_tagger_path = pos_tagger_path
        self.parser_path = parser_path
        self.cfg_rules_path = cfg_rules_path
        self.path_to_mpqa_lexicon = path_to_mpqa_lexicon
        self.path_to_rst_python = path_to_rst_python
        self.path_to_rst = path_to_rst
        self.path_to_stanford_cp = path_to_stanford_cp
        self.path_to_lda_model = path_to_lda_model
        self.path_to_lda_wordids = path_to_lda_wordids

        self.do_lexical = do_lexical
        self.do_syntactic = do_syntactic
        self.do_semantic = do_semantic
        self.do_pragmatic = do_pragmatic
        self.lexical_list = lexical_list
        self.syntactic_list = syntactic_list
        self.semantic_list = semantic_list
        self.pragmatic_list = pragmatic_list

        file_utils.ensure_dir(self.output_parse_dir)
        file_utils.ensure_dir(self.output_lu_parse_dir)
        file_utils.ensure_dir(self.output_rst_dir)

        # self.transcript_set = transcript.TranscriptSet(dataset=[])

        # Get lexical norms
        if path_to_freq_norms is not None:
            self.norms_freq = functions.get_frequency_norms(path_to_freq_norms)
        else: # default
            self.norms_freq = functions.get_frequency_norms()

        if path_to_image_norms is not None:
            self.norms_image = functions.get_imageability_norms(path_to_image_norms)
        else: # default
            self.norms_image = functions.get_imageability_norms()

        if path_to_anew is not None:
            self.norms_anew = functions.get_anew_norms(path_to_anew)
        else: # default
            self.norms_anew = None

        # Warringer
        if path_to_warringer is not None:
            self.norms_warringer = functions.get_warringer_norms(path_to_warringer)
        else: # default
            self.norms_warringer = functions.get_warringer_norms()

        # MPQA
        if path_to_mpqa_lexicon is not None:
            [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon(path_to_mpqa_lexicon)
        else: # default
            [self.mpqa_words, self.mpqa_types, self.mpqa_polarities] = functions.get_mpqa_lexicon()

        # Set up the dictionary of valid words for the language
        if path_to_dictionary is not None:
            source_dict = path_to_dictionary
        else:
            source_dict = os.path.abspath("../feature_extraction/text/american-english") # default
        with open(source_dict, 'r') as fin_dict:
            words = fin_dict.readlines()
            self.dictionary_words = set(word.strip().lower() for word in words)
        self.prondict = cmudict.dict()

        if lu_analyzer_path is not None:
            self.lu_analyzer_path = lu_analyzer_path
        else:
            self.lu_analyzer_path = os.path.abspath('../L2SCA-2011-10-10/')

        # semantics
        if do_wnic:
            self.brown_ic = wnic.ic('ic-brown.dat')      # FR: it would be nice to have a dat based on normative data, baby
            self.semcor_ic = wnic.ic('ic-semcor.dat')
        else:
            self.brown_ic = []
            self.semcor_ic = []
Example #27
def sim_wordnet(wordpairs, filename):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')

    WORDNET_DIR = os.path.join(RESULT_DIR, "wordnet")

    pathfile = file(os.path.join(WORDNET_DIR, "path_" + filename), 'wb')
    pathwriter = csv.writer(pathfile)

    wupfile = file(os.path.join(WORDNET_DIR, "wup_" + filename), 'wb')
    wupwriter = csv.writer(wupfile)

    lchfile = file(os.path.join(WORDNET_DIR, "lch_" + filename), 'wb')
    lchwriter = csv.writer(lchfile)

    resfile = file(os.path.join(WORDNET_DIR, "res_" + filename), 'wb')
    reswriter = csv.writer(resfile)

    jcnfile = file(os.path.join(WORDNET_DIR, "jcn_" + filename), 'wb')
    jcnwriter = csv.writer(jcnfile)

    linfile = file(os.path.join(WORDNET_DIR, "lin_" + filename), 'wb')
    linwriter = csv.writer(linfile)

    resultfiles = [pathfile, wupfile, lchfile, resfile, jcnfile, linfile]
    resultwriters = [
        pathwriter, wupwriter, lchwriter, reswriter, jcnwriter, linwriter
    ]

    for wordpair in wordpairs:
        synsets1 = wordnet.synsets(wordpair[0])
        synsets2 = wordnet.synsets(wordpair[1])

        path_sim = -100
        wup_sim = -100
        lch_sim = -100

        res_sim = -100
        jcn_sim = -100
        lin_sim = -100

        for tmpword1 in synsets1:
            for tmpword2 in synsets2:
                if tmpword1.pos() == tmpword2.pos():
                    try:
                        path_sim = max(path_sim,
                                       tmpword1.path_similarity(tmpword2))
                    except Exception, e:
                        print tmpword1, tmpword2
                        print "path: " + str(e)

                    try:
                        wup_sim = max(wup_sim,
                                      tmpword1.wup_similarity(tmpword2))
                    except Exception, e:
                        print tmpword1, tmpword2
                        print "wup: " + str(e)

                    try:
                        lch_sim = max(lch_sim,
                                      tmpword1.lch_similarity(tmpword2))
                    except Exception, e:
                        print tmpword1, tmpword2
                        print "lch: " + str(e)

                    try:
                        res_sim = max(
                            res_sim,
                            tmpword1.res_similarity(tmpword2, brown_ic))
                    except Exception, e:
                        print tmpword1, tmpword2
                        print "res: " + str(e)

                    try:
                        jcn_sim = max(
                            jcn_sim,
                            tmpword1.jcn_similarity(tmpword2, brown_ic))
                    except Exception, e:
                        print tmpword1, tmpword2
                        print "jcn: " + str(e)
Example #28
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

def wm_subjects(subjects):
    wm_subjects = []
    for subject in subjects:
        wm_subjects.append(wn.synsets(subject, pos=wn.NOUN)[0])
    return wm_subjects

def match_subjects(wm_subjects):
    match_subjects = []
    aux = []
    for subi in wm_subjects:
        for subj in wm_subjects:
            aux.append(subi.res_similarity(subj, brown_ic)/subi.res_similarity(subi, brown_ic))
        match_subjects.append(aux)
        aux = []
    return match_subjects

subjects = []
with open("subjects.txt", "rt") as fin:
    for line in fin:
        subjects.append(line.replace('\n', ''))

wm_subjects = wm_subjects(subjects)

brown_ic = wordnet_ic.ic('ic-brown.dat') #load the brown corpus to compute the IC

match_subjects = match_subjects(wm_subjects)

print(match_subjects)
Example #29
def sim_wordnet(wordpairs, filename):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')

    WORDNET_DIR = os.path.join(RESULT_DIR, "wordnet")

    pathfile = open(os.path.join(WORDNET_DIR, "path_" + filename),
                    'w',
                    newline='')
    pathwriter = csv.writer(pathfile)

    wupfile = open(os.path.join(WORDNET_DIR, "wup_" + filename),
                   'w',
                   newline='')
    wupwriter = csv.writer(wupfile)

    lchfile = open(os.path.join(WORDNET_DIR, "lch_" + filename),
                   'w',
                   newline='')
    lchwriter = csv.writer(lchfile)

    resfile = open(os.path.join(WORDNET_DIR, "res_" + filename),
                   'w',
                   newline='')
    reswriter = csv.writer(resfile)

    jcnfile = open(os.path.join(WORDNET_DIR, "jcn_" + filename),
                   'w',
                   newline='')
    jcnwriter = csv.writer(jcnfile)

    linfile = open(os.path.join(WORDNET_DIR, "lin_" + filename),
                   'w',
                   newline='')
    linwriter = csv.writer(linfile)

    resultfiles = [pathfile, wupfile, lchfile, resfile, jcnfile, linfile]
    resultwriters = [
        pathwriter, wupwriter, lchwriter, reswriter, jcnwriter, linwriter
    ]

    for wordpair in wordpairs:
        synsets1 = wordnet.synsets(wordpair[0])
        synsets2 = wordnet.synsets(wordpair[1])

        path_sim = -100
        wup_sim = -100
        lch_sim = -100

        res_sim = -100
        jcn_sim = -100
        lin_sim = -100

        for tmpword1 in synsets1:
            for tmpword2 in synsets2:
                if tmpword1.pos() == tmpword2.pos():
                    try:
                        path_sim = max(path_sim,
                                       tmpword1.path_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("path: " + str(e))

                    try:
                        wup_sim = max(wup_sim,
                                      tmpword1.wup_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("wup: " + str(e))

                    try:
                        lch_sim = max(lch_sim,
                                      tmpword1.lch_similarity(tmpword2))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("lch: " + str(e))

                    try:
                        res_sim = max(
                            res_sim,
                            tmpword1.res_similarity(tmpword2, brown_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("res: " + str(e))

                    try:
                        jcn_sim = max(
                            jcn_sim,
                            tmpword1.jcn_similarity(tmpword2, brown_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("jcn: " + str(e))

                    try:
                        lin_sim = max(
                            lin_sim,
                            tmpword1.lin_similarity(tmpword2, semcor_ic))
                    except Exception as e:
                        print(tmpword1, tmpword2)
                        print("lin: " + str(e))

        path_result = (wordpair[0], wordpair[1], path_sim)
        wup_result = (wordpair[0], wordpair[1], wup_sim)
        lch_result = (wordpair[0], wordpair[1], lch_sim)
        res_result = (wordpair[0], wordpair[1], res_sim)
        jcn_result = (wordpair[0], wordpair[1], jcn_sim)
        lin_result = (wordpair[0], wordpair[1], lin_sim)

        results = [
            path_result, wup_result, lch_result, res_result, jcn_result,
            lin_result
        ]

        for i in range(len(resultwriters)):
            writer = resultwriters[i]
            writer.writerow(results[i])

    for resultfile in resultfiles:
        resultfile.close()
Example #30
    def __init__(self):
        parser = argparse.ArgumentParser(
            description="Run the Codenames AI competition game.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("codemaster",
                            help="Path to codemaster package or 'human'")
        parser.add_argument("guesser",
                            help="Path to guesser package or 'human'")
        parser.add_argument(
            "--seed",
            help="Random seed value for board state -- integer or 'time'",
            default='time')
        parser.add_argument("--w2v",
                            help="Path to w2v file or 'none'",
                            default='none')
        parser.add_argument("--glove_cm",
                            help="Path to glove file or 'none'",
                            default='none')
        parser.add_argument("--glove_guesser",
                            help="Path to glove file or 'none'",
                            default='none')
        parser.add_argument(
            "--wordnet",
            help="Name of wordnet file or 'none', most like ic-brown.dat",
            default='none')

        args = parser.parse_args()

        # if the game is going to have an ai, load up word vectors
        if sys.argv[1] != "human" or sys.argv[2] != "human":
            brown_ic = None
            if args.wordnet != 'none':
                brown_ic = wordnet_ic.ic(args.wordnet)
            glove_vecs_cm = {}
            if args.glove_cm != 'none':
                with open(args.glove_cm, encoding="utf-8") as infile:
                    for line in infile:
                        line = line.rstrip().split(' ')
                        glove_vecs_cm[line[0]] = np.array(
                            [float(n) for n in line[1:]])
                print('loaded glove cm vectors')

            glove_vecs_guesser = {}
            if args.glove_guesser != 'none':
                with open(args.glove_guesser, encoding="utf-8") as infile:
                    for line in infile:
                        line = line.rstrip().split(' ')
                        glove_vecs_guesser[line[0]] = np.array(
                            [float(n) for n in line[1:]])
                print('loaded glove guesser vectors')

            word_vectors = {}
            if args.w2v != 'none':
                word_vectors = word2vec.KeyedVectors.load_word2vec_format(
                    args.w2v, binary=True, unicode_errors='ignore')
                print('loaded word vectors')

        if args.codemaster == "human":
            self.codemaster = human_codemaster()
            print('human codemaster')
        else:
            codemaster_module = importlib.import_module(args.codemaster)
            self.codemaster = codemaster_module.ai_codemaster(
                brown_ic, glove_vecs_cm, word_vectors)
            print('loaded codemaster')

        if args.guesser == "human":
            self.guesser = human_guesser()
            print('human guesser')
        else:
            guesser_module = importlib.import_module(args.guesser)
            self.guesser = guesser_module.ai_guesser(brown_ic,
                                                     glove_vecs_guesser,
                                                     word_vectors)
            print('loaded guesser')

        self.seed = 'time'
        if args.seed != 'time':
            self.seed = args.seed
            random.seed(int(args.seed))

        f = open("game_wordpool.txt", "r")

        if f.mode == 'r':
            temp_array = f.read().splitlines()
            self.words = set([])
            # if duplicates were detected and the set length is not 25 then restart
            while len(self.words) != 25:
                self.words = set([])
                for x in range(0, 25):
                    random.shuffle(temp_array)
                    self.words.add(temp_array.pop())
            self.words = list(sorted(self.words))
            random.shuffle(self.words)

        self.maps = ["Red"] * 8 + ["Blue"] * 7 + ["Civilian"] * 9 + [
            "Assassin"
        ]
        random.shuffle(self.maps)
Example #31
def calculate_sim_matrix_from_list(word_list,
                                   methods_list,
                                   word_pos='n',
                                   full_synsets=False,
                                   all_matrix=False):

    print('calculate_sim_matrix_from_list started')

    content_dict = {}
    noun_to_noun_sim_matrices = {}
    unknown_words = {}

    word_list_size = len(word_list)
    for method in methods_list:
        noun_to_noun_sim_matrices[method] = np.add(
            np.zeros((word_list_size, word_list_size), dtype=float), 0.001)

    noun_to_noun_sim_matrices['average_of_methods'] = np.add(
        np.zeros((word_list_size, word_list_size), dtype=float), 0.001)

    brown_ic = wordnet_ic.ic('ic-brown.dat')

    i = 0
    if all_matrix:
        bigger_loop_limit = word_list_size
    else:
        bigger_loop_limit = (word_list_size - 1)

    while i < bigger_loop_limit:

        if all_matrix:
            j = 0
        else:
            j = i + 1

        if full_synsets:
            w1 = wordnet.synset(word_list[i])
        else:
            w1 = wordnet.synsets(word_list[i], word_pos)

            if not w1:
                print('Not able to find this noun: ' + word_list[i])
                unknown_words[word_list[i]] = False
                i += 1
                continue

            w1 = w1[0]

        while j < word_list_size:

            if full_synsets:
                w2 = wordnet.synset(word_list[j])
            else:
                w2 = wordnet.synsets(word_list[j], word_pos)

                if not w2:
                    j += 1
                    continue

                w2 = w2[0]

            if 'wup' in noun_to_noun_sim_matrices:
                value = w1.wup_similarity(w2)
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['wup'][i][j] = value

            if 'jcn' in noun_to_noun_sim_matrices:
                value = w1.jcn_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.001, 1.0, True)
                noun_to_noun_sim_matrices['jcn'][i][j] = value

            if 'lin' in noun_to_noun_sim_matrices:
                value = w1.lin_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['lin'][i][j] = value

            if 'lch' in noun_to_noun_sim_matrices:
                value = w1.lch_similarity(w2)
                # Normalise by the maximum possible LCH score for this part of speech
                if word_pos == 'n':
                    value = value / 3.6375861597263857
                else:
                    value = value / 3.258096538021482
                value = utils.limit_value(value, 0.001, 1.0)
                noun_to_noun_sim_matrices['lch'][i][j] = value

            value = 0.0
            for method in methods_list:
                value += noun_to_noun_sim_matrices[method][i][j]

            value = value / len(methods_list)

            value = utils.limit_value(value, 0.001, 1.0)
            noun_to_noun_sim_matrices['average_of_methods'][i][j] = value

            j += 1

        i += 1
        print('calculate_sim_matrix_from_list: ' + str(i) + '/' +
              str(word_list_size - 1))

    print('calculate_sim_matrix_from_list ended')

    content_dict['noun_to_noun_sim_matrices'] = noun_to_noun_sim_matrices
    content_dict['unknown_words'] = unknown_words

    return content_dict
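A hedged example call for calculate_sim_matrix_from_list; the word list and method names are illustrative, and the function relies on the project's utils.limit_value helper plus the wordnet / wordnet_ic imports, all assumed to be available:

matrices = calculate_sim_matrix_from_list(['dog', 'cat', 'car'], ['wup', 'lin', 'lch'])
print(matrices['noun_to_noun_sim_matrices']['average_of_methods'])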
Example #32
        return None
    return similarity


if __name__ == '__main__':
    method = sys.argv[1]  # jcn or lch
    corpus = sys.argv[2]  # semcor or brown

    preserve = False  # Preserve SimLex similarities?

    if len(sys.argv) > 3:
        preserve = True

    maxval = 1000.0  # This value will be assigned to extremely high-similarity pairs (like 1e+300)

    ic = wordnet_ic.ic('ic-%s.dat' % corpus)

    for line in sys.stdin:
        if line.strip().startswith('#'):
            continue
        res = line.strip().split('\t')
        (word0, word1, simlex_sim) = res
        simlex_sim = float(simlex_sim)
        synsets0 = reversed(wn.synsets(word0.strip(), 'n'))
        synsets1 = reversed(wn.synsets(word1.strip(), 'n'))
        best_pair = None
        best_sim = 0.0
        for pair in product(synsets0, synsets1):
            if pair[0] == pair[1]:
                continue
            wordnet_sim = calc_similarity(pair, method, ic)
Example #33
import matplotlib.pyplot as plt
import scipy
from nltk.corpus import wordnet as wn, wordnet_ic as wn_ic, lin_thesaurus as lin

brown_ic = wn_ic.ic("ic-brown.dat")


def noun_path_similarity(noun_1, noun_2):
    """
    Returns path similarity of two nouns
    :param noun_1:
    :param noun_2:
    :return: path similarity
    """
    synsets_1 = wn.synsets(noun_1, wn.NOUN)
    synsets_2 = wn.synsets(noun_2, wn.NOUN)
    return round(
        max([
            synset_1.path_similarity(synset_2) for synset_1 in synsets_1
            for synset_2 in synsets_2
        ]), 4)
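An illustrative call of the helper above (the module already imports wn and builds brown_ic):

print(noun_path_similarity("dog", "cat"))   # maximum path similarity over all noun senses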


def noun_similarity(noun_1, noun_2, sim_measure=None):
    """
    Returns similarity between two nouns using the similarity measure defined.
    :param noun_1:
    :param noun_2:
    :param sim_measure: 'path_similarity', 'res_similarity', 'lin_similarity'
    :return: similarity measure
    """
def main():
    " Function to write calling of all the above functions "

    str_dir_path = os.getcwd()

    # Question 1.1 1

    directory = "/home1/c/cis530/hw4/dev_input"
    process_ts_file(directory)

    print "1.1 1 Computed"

    # Question 1.1 2

    topic_file = str_dir_path + "/ts_files/" + "aakritis_dev_00.ts"
    n = 20
    tup_list = load_topic_words(topic_file, n)

    print "1.1 2 Computed"

    # Question 1.2

    # to get list of all .ts file
    ts_directory = str_dir_path + "/ts_files"
    list_all_files = get_all_files(ts_directory)
    n = 20

    text_directory = str_dir_path + "/expanded_topic_words_files"

    if not os.path.exists(text_directory):
        os.makedirs(text_directory)

    # computing information content ( ic )
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    for each_file in list_all_files:

        # run load_topic_words to get key list and candidate list
        tup_list = load_topic_words(each_file, n)  # Question 1.1 2
        keylist = tup_list[0]
        candidatelist = tup_list[1]

        # to extract file name
        list_name = each_file.split("/")
        len_list = len(list_name)
        name_ext = list_name[len_list - 1]

        name_split = name_ext.split(".", 1)
        name = name_split[0] + ".txt"

        outputfile = str_dir_path + "/expanded_topic_words_files/" + name

        expand_keywords(keylist, candidatelist, brown_ic, outputfile)

        print name + " Written"
    print "1.2 Computed"

    # Question 2.1

    directory = "/home1/c/cis530/hw4/dev_input"

    list_sub_dirs = get_list_subdirs(directory)

    sum_directory = str_dir_path + "/summarize_baseline"

    if not os.path.exists(sum_directory):
        os.makedirs(sum_directory)

    for each_subdir in list_sub_dirs:

        # extract dir name from dir path
        list_name = each_subdir.split("/")
        len_list = len(list_name)
        name = "sum_" + list_name[len_list - 1] + ".txt"
        outputfile = str_dir_path + "/summarize_baseline/" + name

        summarize_baseline(each_subdir, outputfile)

        print name + " Written"

    print "2.1 Computed"

    # Question 2.2

    directory = "/home1/c/cis530/hw4/dev_input"

    list_sub_dirs = get_list_subdirs(directory)

    sum_directory = str_dir_path + "/summarize_kl"

    if not os.path.exists(sum_directory):
        os.makedirs(sum_directory)

    for each_subdir in list_sub_dirs:

        # extract dir name from dir path
        list_name = each_subdir.split("/")
        len_list = len(list_name)
        name = "sum_" + list_name[len_list - 1] + ".txt"
        outputfile = str_dir_path + "/summarize_kl/" + name

        summarize_kl(each_subdir, outputfile)

        print name + " Written"

    print "2.2 Computed"

    # Question 2.3

    result_file = str_dir_path + "/results.txt"
    write_rouge_results(result_file)

    print "2.3 Computed"

    return
Example #35
 def __init__(self, ic_corpus='brown'):
     self._ic_corpus = wordnet_ic.ic('ic-brown.dat') if ic_corpus == 'brown' else wordnet_ic.ic('ic-semcor.dat')
     self._wn_max_depth = 19
     self._default_metrics = ['path','lch','wup','li','res','lin','jcn','wpath','zhou']
     self._wn_lemma = WordNetLemmatizer()
Example #36
import sys
import json
import jsonrpclib
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
import pickle
brown_ic = wordnet_ic.ic('ic-brown.dat')
import unitConversion as uc


class aset:
    def __init__(self, num=None, entity=None, surface=None, idx=None):
        self.num = num
        self.entity = entity
        self.surface = surface
        self.idx = idx
        self.widx = (idx % 1000) + 1 if idx is not None else None
        self.container = None
        self.verbs = None
        self.adjs = None
        self.location = None
        self.contains = None
        self.compound = 0
        self.subtypes = []
        self.type_failure = 0

    def details(self, sf=True):

        string = "_____________\n"
        ordrd = sorted(self.__dict__.items())
        for x, y in ordrd:
Example #37
    def compress_isa_graph(self, verbose=True):
        """
        This function is used to compress the extracted graph from WordNet by removing some of the nodes.
        The compression strategy follows paper 'Nearly-Automated Metadata Hierarchy Creation'

        :param verbose: whether to show compression steps for debugging
        :return:
        """
        print("\n\nCompressing WordNet object hierarchy...")

        graph1 = copy.deepcopy(self.graph)

        # Rule 1 - Remove all nodes with low information content
        brown = wnic.ic('ic-brown.dat')
        for node in list(self.graph.nodes()):
            if self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
                if rwn.information_content(wn.synset(node), brown) < 3.0:
                    self.graph.remove_node(node)
        if verbose:
            diff = set(graph1.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 1: {}".format(list(diff)))

        # Rule 2 - Remove all nodes with only a single child except the root
        if verbose:
            graph2 = copy.deepcopy(self.graph)
        # starting from leaf nodes
        nodes_sort = [node for node in self.graph if len(
            list(self.graph.predecessors(node))) == 0]
        while len(nodes_sort) > 0:
            node = nodes_sort.pop(0)
            if node not in self.graph:
                continue

            parents = list(self.graph.successors(node))
            children = list(self.graph.predecessors(node))
            for parent in parents:
                nodes_sort.append(parent)

            if len(children) == 1 and len(
                    parents) != 0 and self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
                self.graph.remove_node(node)
                for parent in parents:
                    for child in children:
                        self.graph.add_edge(child, parent, relation='IsA')
        if verbose:
            diff = set(graph2.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 2: {}".format(list(diff)))

        # Rule 3 - Remove all nodes whose name contains the name of the parent
        # (except seed)
        if verbose:
            graph3 = copy.deepcopy(self.graph)
        for node in list(self.graph.nodes()):
            if len(list(self.graph.predecessors(node))) == 0:
                continue
            if self.graph.nodes[node]["type"] == "object_id" or self.graph.nodes[node]["type"] == "wordnet_synset":
                continue
            parents = list(self.graph.successors(node))
            children = list(self.graph.predecessors(node))
            should_remove = True if len(parents) > 0 else False
            for parent in parents:
                pname = parent.split('.')[0]
                cname = node.split('.')[0]
                if pname not in cname:
                    should_remove = False
                    break
            if should_remove:
                self.graph.remove_node(node)
                for child in children:
                    for parent in parents:
                        self.graph.add_edge(child, parent, relation='IsA')
        if verbose:
            diff = set(graph3.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 3: {}".format(list(diff)))

        # sanity check: make sure no initial object nodes are removed
        current_seeds = []
        for n in list(graph1.nodes()):
            if graph1.nodes[n]["type"] == "wordnet_synset" or graph1.nodes[n]["type"] == "object_id":
                assert n in self.graph.nodes

        # add a common parent to combine the isolated graphs created by
        # compression
        root_nodes = [
            (node,
             "entity.n.01") for node in self.graph if len(
                list(
                    self.graph.successors(node))) == 0]
        self.graph.add_node(
            "entity.n.01",
            color="orange",
            type="extracted_wordnet_synset")
        self.graph.add_edges_from(root_nodes, relation="IsA")
Example #38
def computeInfContSimilarity():
	## Load an information content file from the wordnet_ic corpus
	brown_ic = wordnet_ic.ic('ic-brown.dat')

	print "computing Information Content Similarity..."
	tStart = time.time()
	## Compute the similarity between nouns
	ALLnouns_sim = []
	for subSent1, subSent2 in zip(nouns_text1, nouns_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			nounSim = np.zeros(len(subSent1)) 
			for i, noun1 in enumerate(subSent1):
				for noun2 in subSent2:
					try:
						w1 = noun1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = noun2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > nounSim[i]:
							nounSim[i] = sim
					except:
						continue
			# print nounSim
		else:
			nounSim = np.zeros(len(subSent2))
			for i, noun2 in enumerate(subSent2):
				for noun1 in subSent1:
					try:
						w1 = noun1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = noun2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > nounSim[i]:
							nounSim[i] = sim
					except:
						continue	
		
		ALLnouns_sim.append(nounSim)


	## Compute the similarity between verbs
	ALLverbs_sim = []
	for subSent1, subSent2 in zip(verbs_text1, verbs_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			verbSim = np.zeros(len(subSent1)) 
			for i, verb1 in enumerate(subSent1):
				for verb2 in subSent2:
					try:
						w1 = verb1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = verb2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > verbSim[i]:
							verbSim[i] = sim
					except:
						continue
		else:
			verbSim = np.zeros(len(subSent2))
			for i, verb2 in enumerate(subSent2):
				for verb1 in subSent1:
					try:
						w1 = verb1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = verb2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > verbSim[i]:
							verbSim[i] = sim
					except:
						continue	
		
		ALLverbs_sim.append(verbSim)


	## Compute the similarity between adjectives
	ALLadjs_sim = []
	for subSent1, subSent2 in zip(adj_text1, adj_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			adjSim = np.zeros(len(subSent1)) 
			for i, adj1 in enumerate(subSent1):
				for adj2 in subSent2:
					try:
						w1 = adj1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = adj2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > adjSim[i]:
							adjSim[i] = sim
					except:
						continue
			# print nounSim
		else:
			adjSim = np.zeros(len(subSent2))
			for i, adj2 in enumerate(subSent2):
				for adj1 in subSent1:
					try:
						w1 = adj1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = adj2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > adjSim[i]:
							adjSim[i] = sim
					except:
						continue	
		
		ALLadjs_sim.append(adjSim)

	tEnd = time.time()
	print "..done. Time taken (InformationContentSimilarity): ", tEnd-tStart
	return ALLnouns_sim, ALLverbs_sim, ALLadjs_sim
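The three blocks above are nearly identical; below is a hedged sketch of a single helper that could cover them (the helper name and the 'n'/'v' POS arguments are assumptions; IC-based measures such as Jiang-Conrath are only defined for noun and verb synsets, so adjectives cannot be scored this way with brown_ic):

import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

def max_jcn_per_word(words1, words2, pos='n'):
    # For each word of the longer list, keep its best JCN score against the other list.
    longer, shorter = (words1, words2) if len(words1) > len(words2) else (words2, words1)
    sims = np.zeros(len(longer))
    for i, w1 in enumerate(longer):
        for w2 in shorter:
            try:
                s1 = wn.synset("{0}.{1}.01".format(w1, pos))
                s2 = wn.synset("{0}.{1}.01".format(w2, pos))
                sims[i] = max(sims[i], wn.jcn_similarity(s1, s2, brown_ic))
            except Exception:
                continue
    return sims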
예제 #39
0
    args = parser.parse_args()
    wn_embedding_fpath = args.model
    threshold = args.threshold
    dataset = args.test_set
    senseval_fpath = '../data/senseval/' + dataset + '/' + dataset + '.data.xml'
    gold_tags_fpath = '../data/senseval/' + dataset + '/' + dataset + '.gold.key.txt'
    AVG_METHOD = args.averaging
    VECTORIZED_SIMILARITY = args.vectorized
    USE_POS_INFO = args.pos
    MAX_DEPTH = args.depth
    USE_RANDOM = args.random

    USE_JCN = True  # if False, lch is used
    USE_PAGERANK = False
    info_content = wordnet_ic.ic('ic-semcor.dat')

    ids, sents, poslist = load_senseval_data(senseval_fpath)
    disambiguated = sentence_wsd(ids, sents, poslist)
    # load the gold results
    with codecs.open(gold_tags_fpath, 'r', 'utf-8') as f:
        lines = f.readlines()
    wsd_output = []
    gold_output = []
    for line in lines:
        id_key_pair = line.split()
        predicted_keys = disambiguated[id_key_pair[0]].split(';')
        gold_keys_set = set(id_key_pair[1:])
        predicted_keys_set = set(predicted_keys)
        if len(predicted_keys_set.intersection(gold_keys_set)) > 0:
            wsd_output.append(predicted_keys[0])
예제 #40
0
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

threshold = 0.6  #threshold for wup similarity
jcnTreshold = 0.09  #threshold for jcn similarity
pathTeshold = 0.1  #threshold for path similarity
brown_ic = wordnet_ic.ic('ic-brown.dat')  #load information content computed from the Brown corpus
lexical_chains = []  #empty list to hold all the chains
dictionary = {}  #empty dictionary to hold the count of each word encountered


#class Chain
class Chain():
    def __init__(self, words, senses, count=0):
        self.words = set(words)
        self.senses = set(senses)
        dictionary[words[0]] = 1  #initialize counter

    def addWord(self, word):

        if (len(self.words.intersection([word])) > 0):
            dictionary[word] += 1
        else:
            dictionary[word] = 1

        self.words.add(word)

    def addSense(self, sense):
        self.senses.add(sense)
예제 #41
0
파일: __init__.py 프로젝트: clips/pattern
# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
    "i": ("í", "ï", "î", "ì"),
    "o": ("ó", "ö", "ô", "ò", "ō", "ø"),
    "u": ("ú", "ü", "û", "ù", "ů"),
예제 #42
0
                        (probe, c, res_sim[0]))
            # creates a vector of all hypernyms
            probe_sense_vec = wn.synsets(probe)
            # Since these values correspond they have to be the
            # same size
            prob_senses = [0 for i in probe_sense_vec]
            # for each sense of the probe
            for sense in range(len(probe_sense_vec)):
                # look at all the most informative senses
                for mi in range(0, len(mis_vec)):
                    hyper = lambda s: s.hypernyms()
                    ancestors = set(probe_sense_vec[sense].closure(hyper))
                    # if an MIS is an ancestor of the probe
                    if mis_vec[mi] in ancestors:
                        # increment the probability by the MIS val
                        prob_senses[sense] += sim_vec[mi]
            index = returnMaxIndex(prob_senses)
            try:
                #print(probe_sense_vec)
                o.write('%s PREFERRED SENSE: %s\n' %
                        (line_num, probe_sense_vec[index]))
                pass
            except IndexError:
                o.write('ERROR: NO SIMILARITY\n')


brown_ic = wordnet_ic.ic(
    '/home/jake/nltk_data/corpora/wordnet_ic/ic-brown-add1.dat')

wsd(sys.argv[1], sys.argv[2], sys.argv[3])
예제 #43
0
def LexicalChain(fileName="amazon.txt", verbose=0):
    def findWholeWord(w):
        return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

    #class Chain
    class Chain():
        def __init__(self, words, senses, count=0):
            self.words = set(words)
            self.senses = set(senses)
            self.count = count  #store the count so incCount() below works
            dictionary[words[0]] = 1  #initialize counter

        def addWord(self, word):

            if (len(self.words.intersection([word])) > 0):
                dictionary[word] += 1
            else:
                dictionary[word] = 1

            self.words.add(word)

        def addSense(self, sense):
            self.senses.add(sense)

        def getWords(self):
            return self.words

        def getSenses(self):
            return self.senses

        def incCount(self):
            self.count += 1

        def setScore(self, sc):
            self.score = sc

        def mfword(self):
            maxfreq = 0
            for word in self.getWords():
                if dictionary[word] > maxfreq:
                    maxword = word
                    maxfreq = dictionary[word]
            return maxword

    def add_word(word):
        maximum = 0
        maxJCN = 0
        flag = 0
        for chain in lexical_chains:  #for all chains that are present
            for synset in wn.synsets(word):  #for all synsets of current word
                for sense in chain.senses:  #for all senses of the current word in current element of the current chain
                    similarity = sense.wup_similarity(
                        synset)  #using wup_similarity

                    if (similarity >= maximum):
                        if similarity >= threshold:
                            #print word, synset, sense, sense.jcn_similarity(synset, brown_ic)
                            JCN = sense.jcn_similarity(
                                synset, brown_ic)  #using jcn_similarity
                            if JCN >= jcnTreshold:
                                if sense.path_similarity(
                                        synset) >= 0.2:  #using path similarity
                                    if JCN >= maxJCN:
                                        maximum = similarity
                                        maxJCN = JCN
                                        maxChain = chain
                                        maxSynset = synset  #remember the best-matching synset
                                        flag = 1
        if flag == 1:
            maxChain.addWord(word)
            maxChain.addSense(maxSynset)
            return

        lexical_chains.append(Chain([word], wn.synsets(word)))

    def count_words(summary):
        count = 0
        for line in summary:
            count = count + len(line.split(' '))
        return count

    #fileName = raw_input("Enter file path + name, if file name is 'nlp.txt', type 'nlp' \n \n")
    #n = raw_input("Enter number of sentences in summary.\n")

    #fileName = "nlp.txt"
    threshold = 0.6  #threshold for wup similarity
    jcnTreshold = 0.09  #threshold for jcn similarity
    pathTeshold = 0.1  #threshold for path similarity
    brown_ic = wordnet_ic.ic('ic-brown.dat')  #load information content computed from the Brown corpus
    lexical_chains = []  #empty list to hold all the chains
    dictionary = {}  #empty dictionary to hold the count of each word encountered
    word_count = 50
    File = open(fileName)  #open file
    lines = File.read()  #read all lines
    #dec_lines =  [line.decode('utf-8') for line in lines]
    #print [clean_line.token for clean_line in clean_lines]

    clean_lines = clean(lines)
    line_list = [clean_line.text for clean_line in clean_lines]
    is_noun = lambda pos: pos in ('NN', 'NNP', 'NNS', 'NNPS')
    nouns = [
        word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines))
        if is_noun(pos)
    ]  #extract all nouns

    for word in nouns:
        add_word(word)

    #print all chains
    for chain in lexical_chains:
        chain_length = 0
        dis_word = 0
        for word in chain.getWords():
            #print str(word + "(" + str(dictionary[word]) + ")") + ',',
            chain_length = chain_length + dictionary[word]
            dis_word = dis_word + 1
        #print 'Length =' + str(chain_length)
        hom = 1 - (dis_word * 1.0 / chain_length)
        #print 'Homogeneity =' + str(hom)
        score = 1.0 * chain_length * hom
        #print 'Score =' + str(score)
        chain.setScore(score)

    #print 'Sorted start '
    lexical_chains.sort(key=lambda x: x.score, reverse=True)
    verbose = 1
    if verbose == 1:
        for chain in lexical_chains:
            if (chain.score > 0.0):
                for word in chain.getWords():
                    print str(word + "(" + str(dictionary[word]) + ")") + ',',
                print 'Score=' + str(chain.score)

    summary = []
    line_flags = []
    line_score = []

    for line in line_list:
        line_flags.append(0)
        line_score.append(0)

    for chain in lexical_chains:

        bigword = chain.mfword()
        chain_score = chain.score
        #print '\nMF word ', bigword
        for i in range(len(line_list)):
            line = line_list[i]
            try:
                x = findWholeWord(bigword)(line)
            except:
                #print 'Exception : Error in finding word'
                x = None
            if x != None:
                #((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
                if line_flags[i] == 0:
                    #summary.append(line)
                    #print 'i  ', count_words(summary)
                    line_flags[i] = 1
                    line_score[i] = chain_score
                    #print 'line_score ', line_score
                    #print 'line_flags ', line_flags

                    break
                #elif line_flags[i]==1:
                #line_score[i] = line_score[i] + chain.score
                #print '\nline_score ', line_score
                #print 'line_flags ', line_flags
    '''
		if(count_words(summary)>word_count):
			break			

	'''

    bias = 20
    tot_score = 0
    for i in range(len(line_score)):
        line_score[i] = (line_score[i] * bias) + 1

    for score in line_score:
        tot_score = tot_score + score

    for i in range(len(line_score)):
        line_score[i] = (line_score[i] / tot_score)

    print line_score

    namscores = dict(
        zip([sentence.token for sentence in clean_lines], line_score))

    #print namscores
    #print len(summary)
    #print line_score

    #final_summary = ' '.join(summary)
    #print final_summary
    return namscores


#print LexicalChain(verbose=1)
예제 #44
0
    def calculate_sim_matrix(self):

        print('calculate_sim_matrix started')

        # five matrices: wup, lch, jcn, lin and their average, all initialised to 0.01
        for _ in range(5):
            self.noun_to_noun_sim_matrices.append(
                np.full((self.noun_rows_size, self.noun_rows_size), 0.01,
                        dtype=float))

        inverted_noun_dict = utils.invert_dictionary(self.noun_rows)

        brown_ic = wordnet_ic.ic('ic-brown.dat')

        for key in inverted_noun_dict:
            print(str(key) + ': ' + inverted_noun_dict[key])

        i = 0
        while i < (self.noun_rows_size - 1):
            j = i + 1
            w1 = wordnet.synsets(inverted_noun_dict[i], pos=wordnet.NOUN)
            if not w1:
                print('Not able to find this noun: ' + inverted_noun_dict[i])
                i += 1
                continue

            w1 = w1[0]

            while j < self.noun_rows_size:
                w2 = wordnet.synsets(inverted_noun_dict[j], pos=wordnet.NOUN)
                if not w2:
                    j += 1
                    continue

                w2 = w2[0]

                value = w1.wup_similarity(w2)
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[0][i][j] = value

                value = w1.lch_similarity(w2) / lch_maximum_obtained_value
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[1][i][j] = value

                value = w1.jcn_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.01, 1.0, True)
                self.noun_to_noun_sim_matrices[2][i][j] = value

                value = w1.lin_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[3][i][j] = value

                value = (self.noun_to_noun_sim_matrices[0][i][j] +
                         self.noun_to_noun_sim_matrices[1][i][j] +
                         self.noun_to_noun_sim_matrices[2][i][j] +
                         self.noun_to_noun_sim_matrices[3][i][j]) / 4.0

                value = utils.limit_value(value, 0.01, 1.0)

                self.noun_to_noun_sim_matrices[4][i][j] = value

                j += 1

            print('sim_matrix: ' + str(i) + '\n')
            i += 1

        print('calculate_sim_matrix ended')
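`utils.limit_value` is not part of this excerpt; the sketch below is only a guess at its behavior (clamping a score into [low, high], with an extra flag that first squashes unbounded scores such as JCN, which can be arbitrarily large for near-identical synsets):

def limit_value(value, low=0.01, high=1.0, squash=False):
    # Hypothetical stand-in for utils.limit_value; the real helper is not shown above.
    if value is None:                   # some WordNet measures return None
        return low
    if squash:
        value = value / (1.0 + value)   # monotone map of [0, inf) into [0, 1)
    return max(low, min(high, value))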
예제 #45
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

comb1 = "LOGA-IGFF-COSN"
comb2 = "FREQ-IDFB-COSN"

minRatio = 0.25

semcor_ic = wordnet_ic.ic('ic-semcor.dat')
'''
Given the similarities between several pairs of words, it selects the highest similarity values for all the words
corresponding to the dataset that has fewer relevant words.

After that, it assigns a percentage of importance depending on whether the similarity is 1.0 or not and, based on
the similarities and the percentages, it computes a single final value.
'''


def processSimilarities(similarities):

    values = []
    valuesEqualsToOne = 0

    keys1 = len(similarities.keys())
    if keys1 > 0:
        keys2 = len(similarities[list(similarities.keys())[0]].keys())
    else:
        keys2 = 0
 def __init__(self, ic_path: str = "ic-brown.dat"):
     self.brown_ic = wordnet_ic.ic(ic_path)
def similarityWordNet(word1, word2):
    """
    Similarity
    Similarity between two words with nltk
    Input: word1, word2 (String)
    Return: similarity (float)
    """
    #print Word(word1).lemmatize()
    #print Word(word2).lemmatize()
    word1 = wn.synset(str(word1) + '.n.01')
    word2 = wn.synset(str(word2) + '.n.01')
    """
    Return a score denoting how similar two word senses are,
    based on the shortest path that connects the senses in the is-a
    (hypernym/hyponym) taxonomy. The score is in the range 0 to 1.
    
    By default, there is now a fake root node added to verbs so
    for cases where previously a path could not be found---and None
    was returned---it should return a value. The old behavior can be
    achieved by setting simulate_root to be False. A score of 1 represents
    identity i.e. comparing a sense with itself will return 1.
    """
    #similarity1 = word1.path_similarity(word2)
    similarity1 = wn.path_similarity(word1, word2)
    """
    Leacock-Chodorow Similarity: Return a score denoting how similar
    two word senses are, based on the shortest path that connects
    the senses (as above) and the maximum depth of the taxonomy in
    which the senses occur. range 3.6
    
    The relationship is given as -log(p/2d) where p is the
    shortest path length and d the taxonomy depth.
    """
    similarity2 = wn.lch_similarity(word1, word2)
    """
    Wu-Palmer Similarity: Return a score denoting how similar
    two word senses are, based on the depth of the two senses in
    the taxonomy and that of their Least Common Subsumer (most specific ancestor node).
    range 0.92
    
    Note that at this time the scores given do _not_ always agree with those given by Pedersen's
    Perl implementation of Wordnet Similarity.
    The LCS does not necessarily feature in the shortest path connecting the two senses,
    as it is by definition the common ancestor deepest in the taxonomy, not closest to
    the two senses. Typically, however, it will so feature. Where multiple candidates for
    the LCS exist, that whose shortest path to the root node is the longest will be selected.
    Where the LCS has multiple paths to the root, the longer path is used for the purposes
    of the calculation.
    """
    similarity3 = wn.wup_similarity(word1, word2)
    """
    Resnik Similarity: Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific ancestor node).
    Note that for any similarity measure that uses information content, the result is dependent on
    the corpus used to generate the information content and the specifics of how the information
    content was created. 0-8.43
    """
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    similarity4 = word1.res_similarity(word2, brown_ic)

    print("similarity1: ", similarity1)
    print("similarity2 Leacock-Chodorow: ", similarity2)
    print("similarity3 Wu-Palmer: ", similarity3)
    print("similarity4 Resnik: ", similarity4)
예제 #48
0
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# algorithm parameters
USE_POS_INFO = True
USE_JCN = True  # if False, lch is used
VECTORIZED_SIMILARITY = True
USE_PAGERANK = False
AVG_METHOD = 'micro'
MAX_DEPTH = 3
senseval_fpath = 'data/senseval/senseval2/senseval2.data.xml'
gold_tags_fpath = 'data/senseval/senseval2/senseval2.gold.key.txt'
wn_embedding_fpath = sys.argv[1]

info_content = wordnet_ic.ic('ic-brown.dat')


def load_fse(path):
    model = {}
    for f_line in gensim.utils.smart_open(path):
        f_line = gensim.utils.to_unicode(f_line)
        res = f_line.strip().split('\t')
        (synset, vector) = res
        model[synset] = vector
    return model


def hamming_distance(pair, vec_dic):
    s0 = vec_dic[pair[0]]
    s1 = vec_dic[pair[1]]
if __name__ == "__main__":

    if len(sys.argv) >= 5:
        information_content_file_type = sys.argv[1]
        wsd_test_filename = sys.argv[2]
        judgment_file = sys.argv[3]
        output_filename = sys.argv[4]
    else:
        print("Incorrect number of arguments")
        sys.exit(1)

    start = time.clock()

    #select the right ic file based on the input parameters
    if information_content_file_type == "nltk":
        wnic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
    else:
        wnic = create_ic(wsd_test_filename, judgment_file, 'hw8_myic.txt')

    with open(output_filename, 'w') as op_file:

        #creating a list to store the answers obtained by the algorithm
        wsd_answers_obtained = []

        with open(wsd_test_filename, 'r') as wsd_file:

            for line in wsd_file:
                line = line.strip('\n')
                line = line.split('\t')
                probe_word = line[0]  #extract probe word
                noun_groups = line[1].split(',')  #extract noun groups
예제 #50
0
파일: __init__.py 프로젝트: jhpyle/pattern
# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
    "i": ("í", "ï", "î", "ì"),
    "o": ("ó", "ö", "ô", "ò", "ō", "ø"),
    "u": ("ú", "ü", "û", "ù", "ů"),
class WordnetSimilarityEvaluator:
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    def __init__(self ):
        # self.fn_docs = prior_case_directory
        # self.file_contents = dict()
        self.preprocessor = Preprocessor()
        # self.populate_file_contents()

    # def populate_file_contents(self):
    #     # self.file_contents[self.fn_docs] = dict()
    #     for file in sorted(os.listdir(self.fn_docs),
    #                        key=lambda item: (int(item.partition('_')[2])
    #                             if item[0].isdigit() else float('inf'), item)):
    #         filename = os.fsdecode(file)
    #         if filename.endswith(".txt"):
    #             # print(os.path.join(directory), str(filename))
    #             with open(os.path.join(self.fn_docs, str(filename)), 'r') as f:
    #                 content = self.preprocessor.preprocess(f.read().lower())
    #                 self.file_contents[filename] = self.doc_to_synsets(content)

    def convert_tag(self,tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""

        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None


    def doc_to_synsets(self,doc):
        """
        Returns a list of synsets in document.

        Tokenizes and tags the words in the document doc.
        Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets

        Example:
            doc_to_synsets('Fish are nvqjp friends.')
            Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
        """

        # Your Code Here
        token = nltk.word_tokenize(doc)
        # add parts of speech to token
        tag = nltk.pos_tag(token)
        # convert nltk pos into wordnet pos
        nltk2wordnet = [(i[0], self.convert_tag(i[1])) for i in tag]
        # if there are no synsets in token, ignore, else put in a list
        output = [wn.synsets(i, z)[0] for i, z in nltk2wordnet if len(wn.synsets(i, z))>0]

        return output


    def similarity_score(self,s1, s2):
        """
        Calculate the normalized similarity score of s1 onto s2

        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum of all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
            s1, s2: list of synsets from doc_to_synsets

        Returns:
            normalized similarity score of s1 onto s2

        Example:
            synsets1 = doc_to_synsets('I like cats')
            synsets2 = doc_to_synsets('I like dogs')
            similarity_score(synsets1, synsets2)
            Out: 0.73333333333333339
        """


        # Your Code Here
        list1 = []
        # For each synset in s1
        for a in s1:
            # finds the synset in s2 with the largest similarity value
            # l = [i.jcn_similarity(a, brown_ic) for i in s2 if i.pos() == a.pos() and
            #      i.jcn_similarity(a, brown_ic) is not None]
            # l = [i.path_similarity(a) for i in s2 if i.path_similarity(a) is not None]
            # l = [wn.jcn_similarity(i, a, self.brown_ic) for i in s2 if i.pos() == a.pos() and
            #      i.pos() in self.brown_ic.keys() and a.pos() in self.brown_ic.keys() and
            #      wn.jcn_similarity(i, a, self.brown_ic) is not None ]

            # Path similarity
            l = [i.path_similarity(a) for i in s2 if i.path_similarity(a) is not None]

            # Wu Palmer Similarity
            # l = [i.wup_similarity(a) for i in s2 if i.wup_similarity(a) is not None]

            # Leacock-Chodorow Similarity
            # l = [i.lch_similarity(a) for i in s2 if i.pos() == a.pos() and i.lch_similarity(a) is not None]
            if len(l) > 0:
                list1.append(max(l))

        if len(list1) > 0:
            output = sum(list1)/len(list1)
        else:
            output = 0

        return output


    def document_path_similarity(self,doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2"""
                # first function u need to create
        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
                # 2nd function u need to create


        return self.sysnset_path_similarity(synsets1, synsets2)


    def sysnset_path_similarity(self, synsets1, synsets2):
        # return self.similarity_score(synsets1, synsets2)
        return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2
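A hedged usage sketch for the evaluator above; note that the constructor instantiates a `Preprocessor` class that is not part of this excerpt, so this assumes it is importable in the same module:

evaluator = WordnetSimilarityEvaluator()   # assumes Preprocessor is defined elsewhere
score = evaluator.document_path_similarity(
    "The court dismissed the appeal.",
    "The judges rejected the petition.")
print(score)   # symmetric path-based similarity of the two sentences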
import collections
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import prepare_results
import heapq

lemmatizer = WordNetLemmatizer()
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')


def generate_test_dictionary():
    with open('data.txt', 'r', encoding='utf-8') as f:
        lines = [line for line in f.read().splitlines() if line]
        lines_dict = collections.defaultdict()
        for line in lines:
            string1 = str(line).replace('(', '{', 1).replace(',', ':', 1)
            string2 = string1[::-1].replace(')', '}', 1)
            string3 = string2[::-1]
            dictionary = eval(string3)
            lines_dict.update(dictionary)

    return lines_dict


def lemmatize_translations(dictionary):
    """Normalizes translations by lemmatizing and rendering in lower case."""
    lemmatizer = WordNetLemmatizer()
예제 #53
0
 def __init__(self):
     self.scaler = StandardScaler()
     self.brown_ic = wordnet_ic.ic('ic-brown.dat')
                continue;
            sent.append(define)  
            continue;
        else:
            sent.append(word)
    
    sentence=' '.join(sent)
    return sentence

replace_word_with_def(text,tags)


# import corpuses for similarity measures
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

#http://www.nltk.org/howto/wordnet.html
# similarity based on path
def jcn_sim(word1,word2):

    word1=str(word1)
    word2=str(word2)
    try:    
        w1=wordnet.synset(wordnet.synsets(word1)[0].name())
    except:
        return np.nan
    try:    
        w2=wordnet.synset(wordnet.synsets(word2)[0].name())
    except:
예제 #55
0
    similarities = [
        similarity_function(ss1, ss2) for ss1 in synsets1 for ss2 in synsets2
    ]
    return max(similarities) if len(similarities) != 0 else .0


def path_similarity(synsets1, synsets2):
    return __max_similarity(synsets1, synsets2, wn.path_similarity)


def lch_similarity(synsets1, synsets2):
    return __max_similarity(synsets1, synsets2, wn.lch_similarity)


from nltk.corpus import wordnet_ic
corpus = wordnet_ic.ic('ic-brown.dat')


def lin_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.lin_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)


def jcn_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.jcn_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)


def res_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.res_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)
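A short usage sketch for the wrappers above (it assumes the truncated `__max_similarity` helper at the top of the snippet and the `wn` import shown earlier):

dogs = wn.synsets('dog', pos=wn.NOUN)
cats = wn.synsets('cat', pos=wn.NOUN)
print(path_similarity(dogs, cats))   # best path similarity over all noun sense pairs
print(lin_similarity(dogs, cats))    # best Lin similarity, using the Brown IC loaded above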
예제 #56
0
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
 
threshold = 0.6 #threshold for wup similarity
jcnTreshold = 0.09 #threshold for jcn similarity
pathTeshold = 0.1 #threshold for path similarity
brown_ic = wordnet_ic.ic('ic-brown.dat') #load information content computed from the Brown corpus
lexical_chains = [] #empty list to hold all the chains
dictionary = {} #empty dictionary to hold the count of each word encountered

#class Chain 
class Chain(): 
    def __init__(self, words, senses, count = 0):
        self.words = set(words)
        self.senses = set(senses)
        dictionary[words[0]] = 1 #initialize counter

    def addWord(self, word):

        if(len(self.words.intersection([word])) > 0):
            dictionary[word] += 1
        else:
            dictionary[word] = 1

        self.words.add(word)

    def addSense(self, sense):
        self.senses.add(sense)
예제 #57
0
def bind_kernel(
        features=None,  # Must be provided if syntax_feature_types is True
        syntax_feature_types=['baseline', 'dependency', 'hand_picked'],
        semantic_similarity='res',
        include_suffix=True,
        syntactic_multiplier=0.33,
        semantic_multiplier=0.33,
        suffix_multiplier=0.33):
    '''
    Returns a kernel function that has a given dictionary and features
    lookup bound to its scope.
    '''

    # Validate that a sensible value for semantic similarity was provided
    semantic_similarity_is_valid = (semantic_similarity in LEGAL_SIMILARITIES
                                    or semantic_similarity is None)
    if not semantic_similarity_is_valid:
        raise ValueError('semantic_similarity must be one of the following: ' +
                         ', '.join(LEGAL_SIMILARITIES) +
                         '.  Got %s.' % repr(semantic_similarity))

    # Validate that a sensible value for syntactic similarity was provided
    syntactic_similarity_is_valid = syntax_feature_types is None or all(
        feature_type in LEGAL_SYNTACTIC_SIMILARITIES
        for feature_type in syntax_feature_types)
    if not syntactic_similarity_is_valid:
        raise ValueError(
            'syntax_feature_types must be a list with any of the following: ' +
            ', '.join(LEGAL_SYNTACTIC_SIMILARITIES) +
            '.  Got %s.' % repr(syntax_feature_types))

    # Semantic similarity functions need an "information content" file
    # to calculate similarity values.
    if semantic_similarity is not None:
        information_content = wordnet_ic.ic(INFORMATION_CONTENT_FILE)

    def kernel(A, B):
        '''
        Custom kernel function.  This counts how often the links incident on
        two different words within their respective dependency trees are the 
        same, up to the dependency relation and the POS of the neighbour.

        Note that A references one set of words' dependency trees and B
        references another, so this function ends up making len(A) * len(B)
        such comparisons and returns the result as a len(A) by len(B) matrix.
        '''

        result = []
        for a in A:

            token_a = u.ensure_unicode(features.get_token(int(a[0])))

            # Get token_a's dependency tree features
            if syntax_feature_types is not None:
                syntax_features_a = features.get_features_idx(
                    int(a[0]), syntax_feature_types)

            # Get the token_a's synset if semantic similarity is being used
            if semantic_similarity is not None:
                semantic_features_a = nouns_only(wordnet.synsets(token_a))

            if include_suffix:
                suffix_a = features.get_suffix(token_a)

            result_row = []
            result.append(result_row)
            for b in B:

                kernel_score = 0
                token_b = u.ensure_unicode(features.get_token(int(b[0])))

                # Calculate the dependency tree kernel
                if syntax_feature_types is not None:
                    syntax_features_b = features.get_features_idx(
                        int(b[0]), syntax_feature_types)
                    kernel_score += syntactic_multiplier * dict_dot(
                        syntax_features_a, syntax_features_b)

                # Calculate semantic similarity, if it is being used
                if semantic_similarity is not None:
                    semantic_features_b = nouns_only(wordnet.synsets(token_b))
                    kernel_score += semantic_multiplier * max_similarity(
                        semantic_similarity, semantic_features_a,
                        semantic_features_b, information_content)

                # Determine if suffixes match
                if include_suffix:
                    suffix_b = features.get_suffix(token_b)
                    if suffix_a is not None and suffix_a == suffix_b:
                        kernel_score += suffix_multiplier

                result_row.append(kernel_score)

        return result

    return kernel
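A hedged sketch of how the returned callable might be wired into scikit-learn, which accepts a custom kernel function; `features`, `X_train` and `y_train` are assumptions (rows of X are the integer token indices the kernel expects), not objects from the original project:

from sklearn.svm import SVC

kernel = bind_kernel(features=features, semantic_similarity='res')
clf = SVC(kernel=kernel)     # SVC accepts a callable kernel(A, B) -> len(A) x len(B) matrix
clf.fit(X_train, y_train)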
예제 #58
0
import sys

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
import unitConversion as uc

import utils

FOLD = None
with open("names.txt") as f:
    NAMES = [x.strip() for x in f.readlines()]


class aset:

    def __init__(self, num=None, entity=None, surface=None, idx=None):
        self.num = num
        self.entity = entity
        self.surface = surface
        self.idx = idx
        self.widx = (idx % 1000) + 1 if idx is not None else None
        self.container = None
        self.verbs = None
        self.adjs = None
        self.location = None
        self.contains = None
        self.compound = 0
        self.subtypes = []
        self.type_failure = 0
        self.origs = idx // 1001 if idx is not None else None
예제 #59
0
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
from nltk.corpus import wordnet_ic
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
import re

import itertools
import codecs
import pprint as pp
import operator
from collections import OrderedDict
from collections import Counter
import json

brown_ic = wordnet_ic.ic('ic-brown.dat')
ic_bnc_plus1 = wordnet_ic.ic('ic-bnc-add1.dat')
NERModelPath = "C:/StanfordNER/nlp/models/ner/"
NERModel = "english.conll.4class.caseless.distsim.crf.ser.gz"
# NOTE: the 4 classes are Person, Location, Organization, Misc
NER = StanfordNERTagger(NERModelPath + NERModel)

# FOR POS Tagger:
POSJar = "C:/StanfordPOS/stanford-postagger.jar"
POSTaggerPath = "C:/StanfordPOS/models/"
POSTagger = 'english-bidirectional-distsim.tagger'
POSModel = POSTaggerPath+POSTagger
st = StanfordPOSTagger(POSModel, POSJar)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
예제 #60
-1
def similarity_by_infocontent(sense1, sense2, option):
    """ Returns similarity scores by information content. """
    if sense1.pos() != sense2.pos(): # infocontent sim can't do diff POS.
        return 0

    info_contents = ['ic-bnc-add1.dat', 'ic-bnc-resnik-add1.dat', 
                     'ic-bnc-resnik.dat', 'ic-bnc.dat', 
                     
                     'ic-brown-add1.dat', 'ic-brown-resnik-add1.dat', 
                     'ic-brown-resnik.dat', 'ic-brown.dat', 
                     
                     'ic-semcor-add1.dat', 'ic-semcor.dat',
                      
                     'ic-semcorraw-add1.dat', 'ic-semcorraw-resnik-add1.dat', 
                     'ic-semcorraw-resnik.dat', 'ic-semcorraw.dat', 
                     
                     'ic-shaks-add1.dat', 'ic-shaks-resnik.dat', 
                     'ic-shaks-resnik-add1.dat', 'ic-shaks.dat', 
                     
                     'ic-treebank-add1.dat', 'ic-treebank-resnik-add1.dat', 
                     'ic-treebank-resnik.dat', 'ic-treebank.dat']
  
    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2, wnic.ic('ic-bnc-resnik-add1.dat'))
    #return min(wn.res_similarity(sense1, sense2, wnic.ic(ic)) \
    #             for ic in info_contents)

    elif option in ['jcn', "jiang-conrath"]:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
  
    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))