Example #1
 def __init__(self):
     '''Load the data'''
     root = nltk.data.find('corpora/wordnet')
     cd = os.path.dirname(__file__)
     if cd == "":
         cd = "."
     filename = cd + '/wnjpn-ok.tab'
     WordNetCorpusReader.__init__(self, root)
     with codecs.open(filename, encoding="utf-8") as f:
         self._jword2offset = {}
         counter = 0
         for line in f:
             try:
                 _cells = line.strip().split('\t')
                 _offset_pos = _cells[0]
                 _word = _cells[1]
                 if len(_cells) > 2: _tag = _cells[2]
                 _offset, _pos = _offset_pos.split('-')
                 self._jword2offset[_word] = {
                     'offset': int(_offset),
                     'pos': _pos
                 }
                 counter += 1
             except Exception:
                 print("failed to load line %d" % counter)
Example #2
 def __init__(self, root, filename):
     WordNetCorpusReader.__init__(self, root)
     import codecs
     f = codecs.open(filename, encoding="utf-8")
     self._jword2offset = {}
     for line in f:
         _cells = line.strip().split('\t')
         _offset_pos = _cells[0]
         _word = _cells[1]
         if len(_cells) > 2: _tag = _cells[2]
         _offset, _pos = _offset_pos.split('-')
         try:
             self._jword2offset[_word].append({'offset': int(_offset), 'pos': _pos})
         except KeyError:
             self._jword2offset[_word] = [{'offset': int(_offset), 'pos': _pos}]
Example #3
 def synset(self, word):
     "synsetの取得"
     if word in self._jword2offset:
         return WordNetCorpusReader._synset_from_pos_and_offset(
             self, self._jword2offset[word]['pos'], self._jword2offset[word]['offset']
         )
     else:
         return None
Example #4
 def synset(self, word):
     '''Get the synset'''
     if word in self._jword2offset:
         return WordNetCorpusReader._synset_from_pos_and_offset(
             self, self._jword2offset[word]['pos'],
             self._jword2offset[word]['offset'])
     else:
         return None
Example #5
 def synsets(self, word):
     if word in self._jword2offset:
         results = []
         for offset in self._jword2offset[word]:
             results.append(WordNetCorpusReader._synset_from_pos_and_offset(
                 self, offset['pos'], offset['offset']))
         return results
     else:
         return None
Example #6
def load_wn(version="3.0", location="../../data/wordnet/", base="wn"):
    """
    I kept forgetting how to load WordNet, and this makes it easier to handle
    different versions of wordnet.  Assumes that in the nltk_data directory a
    directory called "alt_wordnets" exists, and the dict directory of every
    version is named "base-0.0" (e.g. "wn-1.6") inside that directory.

    Returns an initialized wn reader.  Defaults to the normal installation if
    it can't find the WN you're looking for (pay attention to the error
    message if that happens, as you might not be using the version you
    thought).
    """
    path = location + "%s-%s" % (base, version)
    print "Looking for ", path
    if os.path.exists(path):
        return WordNetCorpusReader(path)
    else:
        print("Failed to find WN - defaulting to NLTK's version")
        return WordNetCorpusReader(nltk.data.find("corpora/wordnet"))
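A sketch of how load_wn might be called, assuming a "wn-1.6" dict directory exists under the default location:

wn16 = load_wn(version="1.6")  # looks for ../../data/wordnet/wn-1.6
wn30 = load_wn()  # falls back to NLTK's bundled WordNet if the path is missing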
Example #7
 def synsets(self, word):
     #        results = [[ ], [ ]]
     if word in self._jword2offset:
         results = []
         for offset in self._jword2offset[word]:
             results.append(
                 WordNetCorpusReader._synset_from_pos_and_offset(
                     self, offset['pos'], offset['offset']))
         return results
     else:
         return []
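The only difference from Example #5 is the miss case: returning [] instead of None lets callers iterate without a guard. A small sketch, with reader standing in for an instance of the surrounding (unnamed) class:

for s in reader.synsets(u'犬'):  # safe even when the word is unknown
    print(s.name())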
Example #8
 def __init__(self, root, filename):
     WordNetCorpusReader.__init__(self, root)
     import codecs
     f = codecs.open(filename, encoding="utf-8")
     self._jword2offset = {}
     for line in f:
         _cells = line.strip().split('\t')
         _offset_pos = _cells[0]
         _word = _cells[1]
         if len(_cells) > 2: _tag = _cells[2]
         _offset, _pos = _offset_pos.split('-')
         try:
             self._jword2offset[_word].append({
                 'offset': int(_offset),
                 'pos': _pos
             })
         except KeyError:
             self._jword2offset[_word] = [{
                 'offset': int(_offset),
                 'pos': _pos
             }]
Example #9
class WordNetLookup(object):
    def __init__(self, path='corpora/wordnet'):
        self.path = path
        self.WN = None

    def wn(self):
        if not self.WN:
            self.WN = WordNetCorpusReader(nltk.data.find(self.path))

    def is_superclass_of(self, first, second):
        "Is the second noun the superclass of the first one?"
        self.wn()
        # We cannot guarantee it is a noun. By the time we deal with DRSs, this is just a condition, and could have easily
        # come from an adjective (if the user does not provide features for nouns, as we do in our grammar)
        try:
            num_of_senses_first = self._num_of_senses(first)
            num_of_senses_second = self._num_of_senses(second)
        except KeyError:
            return False
        # At first I wanted to take the first senses of both words, but the first sense is not always the basic meaning of the word, e.g.:
        # S('hammer.n.1').definition: 'the part of a gunlock that strikes the percussion cap when the trigger is pulled'
        # S('hammer.n.2').definition: 'a hand tool with a heavy rigid head and a handle; used to deliver an impulsive force by striking'
        for n in range(num_of_senses_second):
            synset_second = self._noun_synset(second, ind=n)
            for i in range(num_of_senses_first):
                #print synset_second, self._noun_synset(first, i).common_hypernyms(synset_second)
                if synset_second in self._noun_synset(
                        first, i).common_hypernyms(synset_second):
                    #print "+++ first", first, "second", second, True
                    return True
        return False

    def is_adjective(self, word):
        try:
            self._num_of_senses(word, 'a')
            return True
        except KeyError:
            return False

    def _noun_synset(self, noun, ind):
        self.wn()
        return self.WN.synset("%s.n.%s" % (noun, ind))

    def _num_of_senses(self, word, pos='n'):
        self.wn()
        return len(self.WN._lemma_pos_offset_map[word][pos])

    def is_person(self, word):
        return self.is_superclass_of(word, 'person')

    def is_animal(self, word):
        return self.is_superclass_of(word, 'animal')
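A usage sketch for WordNetLookup; it needs only nltk and the WordNet data:

lookup = WordNetLookup()
print(lookup.is_person('teacher'))  # True if a 'person' sense is a common hypernym
print(lookup.is_animal('dog'))
print(lookup.is_adjective('heavy'))

Note that synset names use 1-based sense numbers while the loops pass ind starting at 0; NLTK subtracts 1 internally, so ind=0 resolves to index -1 (the last sense) and every sense is still visited, just in rotated order.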
Example #10
 def __init__(self):
     "データのロード"
     root = nltk.data.find('corpora/wordnet')
     cd = os.path.dirname(__file__)
     if cd == "":
         cd = "."
     filename = cd + '/wnjpn-ok.tab'
     WordNetCorpusReader.__init__(self, root)
     import codecs
     with codecs.open(filename, encoding="utf-8") as f:
         self._jword2offset = {}
         counter = 0
         for line in f:
             try:
                 _cells = line.strip().split('\t')
                 _offset_pos = _cells[0]
                 _word = _cells[1]
                 if len(_cells) > 2: _tag = _cells[2]
                 _offset, _pos = _offset_pos.split('-')
                 self._jword2offset[_word] = {'offset': int(_offset), 'pos': _pos}
                 counter += 1
             except Exception:
                 print("failed to load line %d" % counter)
Example #11
class WordNetLookup(object):
    def __init__(self, path='corpora/wordnet'):
        self.path = path
        self.WN = None
    
    def wn(self):
        if not self.WN:
            self.WN = WordNetCorpusReader(nltk.data.find(self.path))
                    
    def is_superclass_of(self, first, second):
        "Is the second noun the superclass of the first one?"
        self.wn()
        # We cannot guarantee it is a noun. By the time we deal with DRSs, this is just a condition, and could have easily
        # come from an adjective (if the user does not provide features for nouns, as we do in our grammar)
        try:
            num_of_senses_first = self._num_of_senses(first)
            num_of_senses_second = self._num_of_senses(second)
        except KeyError:
            return False
        # At first I wanted to take the first senses of both words, but the first sense is not always the basic meaning of the word, e.g.:
        # S('hammer.n.1').definition: 'the part of a gunlock that strikes the percussion cap when the trigger is pulled'
        # S('hammer.n.2').definition: 'a hand tool with a heavy rigid head and a handle; used to deliver an impulsive force by striking'
        for n in range(num_of_senses_second):
            synset_second = self._noun_synset(second, ind=n)
            for i in range(num_of_senses_first):
                #print synset_second, self._noun_synset(first, i).common_hypernyms(synset_second)
                if synset_second in self._noun_synset(first, i).common_hypernyms(synset_second):
                    #print "+++ first", first, "second", second, True
                    return True
        return False
                
    def is_adjective(self, word):
        try: 
            self._num_of_senses(word, 'a')
            return True
        except KeyError:
            return False
    
    def _noun_synset(self, noun, ind):
        self.wn()
        return self.WN.synset("%s.n.%s" % (noun, ind))
    
    def _num_of_senses(self, word, pos='n'):
        self.wn()
        return len(self.WN._lemma_pos_offset_map[word][pos])
    
    def is_person(self, word):
        return self.is_superclass_of(word, 'person')
    
    def is_animal(self, word):
        return self.is_superclass_of(word, 'animal')
Example #12
class WordNetLookup(object):
    def __init__(self, path='corpora/wordnet'):
        self.path = path
        self.WN = None

    def wn(self):
        if not self.WN:
            self.WN = WordNetCorpusReader(nltk.data.find(self.path))

    def is_superclass_of(self, first, second):
        "Is the second noun the superclass of the first one?"
        self.wn()
        try:
            num_of_senses_first = self._num_of_senses(first)
            num_of_senses_second = self._num_of_senses(second)
        except KeyError:
            return False
        for n in range(num_of_senses_second):
            synset_second = self._noun_synset(second, ind=n)
            for i in range(num_of_senses_first):
                if synset_second in self._noun_synset(
                        first, i).common_hypernyms(synset_second):
                    return True
        return False

    def is_adjective(self, word):
        try:
            self._num_of_senses(word, 'a')
            return True
        except KeyError:
            return False

    def _noun_synset(self, noun, ind):
        self.wn()
        return self.WN.synset("%s.n.%s" % (noun, ind))

    def _num_of_senses(self, word, pos='n'):
        self.wn()
        return len(self.WN._lemma_pos_offset_map[word][pos])

    def is_person(self, word):
        return self.is_superclass_of(word, 'person')

    def is_animal(self, word):
        return self.is_superclass_of(word, 'animal')
Example #13
class WordNetLookup(object):
    def __init__(self, path='corpora/wordnet'):
        self.path = path
        self.WN = None
    
    def wn(self):
        if not self.WN:
            self.WN = WordNetCorpusReader(nltk.data.find(self.path))
                    
    def is_superclass_of(self, first, second):
        "Is the second noun the superclass of the first one?"
        self.wn()
        try:
            num_of_senses_first = self._num_of_senses(first)
            num_of_senses_second = self._num_of_senses(second)
        except KeyError:
            return False
        for n in range(num_of_senses_second):
            synset_second = self._noun_synset(second, ind=n)
            for i in range(num_of_senses_first):
                if synset_second in self._noun_synset(first, i).common_hypernyms(synset_second):
                    return True
        return False
                
    def is_adjective(self, word):
        try: 
            self._num_of_senses(word, 'a')
            return True
        except KeyError:
            return False
    
    def _noun_synset(self, noun, ind):
        self.wn()
        return self.WN.synset("%s.n.%s" % (noun, ind))
    
    def _num_of_senses(self, word, pos='n'):
        self.wn()
        return len(self.WN._lemma_pos_offset_map[word][pos])
    
    def is_person(self, word):
        return self.is_superclass_of(word, 'person')
    
    def is_animal(self, word):
        return self.is_superclass_of(word, 'animal')
Example #14
case_strategy = args.use_case_strategy == 'True'
number_strategy = args.use_number_strategy == 'True'
lp_strategy = args.use_lp == 'True'

case_freq = pickle.load(open(args.path_case_freq, 'rb'))
plural_freq = pickle.load(open(args.path_plural_freq, 'rb'))
lp_info = dict()

the_wn_version = '30'
# load relevant wordnet
if '171' in args.wsd_df_path:
    the_wn_version = '171'
    cwd = os.path.dirname(os.path.realpath(__file__))
    path_to_wn_dict_folder = os.path.join(cwd, 'scripts', 'wordnets', '171',
                                          'WordNet-1.7.1', 'dict')
    wn = WordNetCorpusReader(path_to_wn_dict_folder, None)

with open(args.sense_embeddings_path + '.freq', 'rb') as infile:
    meaning_freqs = pickle.load(infile)

with open(args.log_path, 'w') as outfile:
    json.dump(args.__dict__, outfile)


def lp_output(row, lp_info, candidate_synsets, debug=False):
    target_lemma = row['target_lemma']
    target_pos = row['pos']

    key = (target_lemma, target_pos)

    if key not in lp_info:
Example #15
 def wn(self):
     if not self.WN:
         self.WN = WordNetCorpusReader(nltk.data.find(self.path))
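The same lazy-initialization idea can be written as a property, so callers use self.wn instead of self.wn(); a sketch of that variant (an assumption, and it would require updating the call sites):

@property
def wn(self):
    # Load the corpus reader on first access and cache it.
    if self.WN is None:
        self.WN = WordNetCorpusReader(nltk.data.find(self.path))
    return self.WN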
Example #16
# encoding: utf-8
from nltk.corpus.reader.wordnet import WordNetCorpusReader

wn = WordNetCorpusReader(YOUR_WORDNET_PATH, '.*')  # constructing the reader this way enables function autocompletion
print('wordnet version %s: %s' % (wn.get_version(), YOUR_WORDNET_PATH))

print('get gloss from sensekey...')
key = 'dance%1:04:00::'
lemma = wn.lemma_from_key(key)
synset = lemma.synset()
print(synset.definition())
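For reference, a WordNet sense key has the form lemma%ss_type:lex_filenum:lex_id:head_word:head_id; in 'dance%1:04:00::' the ss_type 1 marks a noun and 04 is the noun.act lexicographer file. Recent NLTK versions can also go the other way (the exact method may vary by version):

print(lemma.key())  # should print 'dance%1:04:00::'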
Example #17
 def wn(self):
     if not self.WN:
         self.WN = WordNetCorpusReader(nltk.data.find(self.path))
Example #18
import lxml.etree as et
import math
import numpy as np
import collections
import re
import random
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pickle
from utils import path
from nltk.corpus.reader.wordnet import WordNetCorpusReader
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()  # download wordnet: import nltk; nltk.download("wordnet") in readme.txt

_path = path.WSD_path()
wn = WordNetCorpusReader(_path.WORDNET_PATH, '.*')
print('wordnet version %s: %s' % (wn.get_version(), _path.WORDNET_PATH))

path_words_notin_vocab = '../tmp/words_notin_vocab_{}.txt'

pos_dic = {
    'ADJ': u'a',
    'ADV': u'r',
    'NOUN': u'n',
    'VERB': u'v', }

POS_LIST = pos_dic.values()  # ['a', 'r', 'n', 'v']


def load_train_data(dataset):
    if dataset in _path.LS_DATASET:
Example #19
def demo():
#    print('loading wordnet')
#    wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'))
#    print('done loading')
#    S = wn.synset
#    L = wn.lemma
#
#    print('getting a synset for go')
#    move_synset = S('go.v.21')
#    print(move_synset.name, move_synset.pos, move_synset.lexname)
#    print(move_synset.lemma_names)
#    print(move_synset.definition)
#    print(move_synset.examples)
#
#    zap_n = ['zap.n.01']
#    zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01']
#
#    def _get_synsets(synset_strings):
#        return [S(synset) for synset in synset_strings]
#
#    zap_n_synsets = _get_synsets(zap_n)
#    zap_v_synsets = _get_synsets(zap_v)
#    zap_synsets = set(zap_n_synsets + zap_v_synsets)
#
#    print(zap_n_synsets)
#    print(zap_v_synsets)
#
#    print("Navigations:")
#    print(S('travel.v.01').hypernyms())
#    print(S('travel.v.02').hypernyms())
#    print(S('travel.v.03').hypernyms())
#
#    print(L('zap.v.03.nuke').derivationally_related_forms())
#    print(L('zap.v.03.atomize').derivationally_related_forms())
#    print(L('zap.v.03.atomise').derivationally_related_forms())
#    print(L('zap.v.03.zap').derivationally_related_forms())
#
#    print(S('dog.n.01').member_holonyms())
#    print(S('dog.n.01').part_meronyms())
#
#    print(S('breakfast.n.1').hypernyms())
#    print(S('meal.n.1').hyponyms())
#    print(S('Austen.n.1').instance_hypernyms())
#    print(S('composer.n.1').instance_hyponyms())
#
#    print(S('faculty.n.2').member_meronyms())
#    print(S('copilot.n.1').member_holonyms())
#
#    print(S('table.n.2').part_meronyms())
#    print(S('course.n.7').part_holonyms())
#
#    print(S('water.n.1').substance_meronyms())
#    print(S('gin.n.1').substance_holonyms())
#
#    print(L('leader.n.1.leader').antonyms())
#    print(L('increase.v.1.increase').antonyms())
#
#    print(S('snore.v.1').entailments())
#    print(S('heavy.a.1').similar_tos())
#    print(S('light.a.1').attributes())
#    print(S('heavy.a.1').attributes())
#
#    print(L('English.a.1.English').pertainyms())
#
#    print(S('person.n.01').root_hypernyms())
#    print(S('sail.v.01').root_hypernyms())
#    print(S('fall.v.12').root_hypernyms())
#
#    print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))
#
#    print(S('dog.n.01').path_similarity(S('cat.n.01')))
#    print(S('dog.n.01').lch_similarity(S('cat.n.01')))
#    print(S('dog.n.01').wup_similarity(S('cat.n.01')))
#
#    wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'),
#                                 '.*\.dat')
#    ic = wnic.ic('ic-brown.dat')
#    print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic))
#
#    ic = wnic.ic('ic-semcor.dat')
#    print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic))
#
#    print(S('code.n.03').topic_domains())
#    print(S('pukka.a.01').region_domains())
#    print(S('freaky.a.01').usage_domains())

    wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'))
#    word = wn.synset('street.n.01')
#    
#    print word.lemma_names
#    print word.definition
#    print word.examples
#    print wn.lemma('dog.n.01.dog').synset
#    print word.hypernyms()
#    print word.hyponyms()
#    print word.member_holonyms()
#    print word.member_meronyms()
#    print word.root_hypernyms()
#    print
#    
#    
#    paths = word.hypernym_paths()
#    
#    
#    for path in paths:
#        print simple_path(path)
#        
#    from itertools import islice
#    for synset in islice(wn.all_synsets('n'), 5):
#        print synset, synset.hypernyms()
    
    
    
#    for synset in list(wn.all_synsets('n'))[:10]:
#        print synset
#    
#    print len(list(wn.all_synsets('n')))

#    road = wn.synsets("road", pos = wn.NOUN)
#    road = wn.synset('road.n.01')
#    paths = road.hypernym_paths()
#    
#    for path in paths:
#        print simple_path(path)
#        
#    paths = wn.synset("street.n.01").hypernym_paths()
#    for path in paths:
#        print simple_path(path)

#    print wn.synsets('geographic_area')

    
#    print_hyponyms(find_all_hyponyms(wn, wn.synset('way.n.06')))
    
#    print_hyponyms(find_all_hyponyms(wn, wn.synset('geological_formation.n.01')))
    
    
#    print wn.synsets('am', pos = wn.VERB)
    
#    print_hyponyms(find_all_hyponyms(wn, wn.synset('structure.n.01')))
    
#    syset = wn.synset('geographical_area.n.01').hyponyms()
#    syset = wn.synset('country.n.04').hyponyms()
#    for hyponym in syset:
#        print hyponym
#        print hyponym.definition
#        print
#    print len(syset)
    
#    print wn.synsets("institution", pos = wn.NOUN)

    for synset in wn.synsets('go', pos=wn.VERB):
        paths = synset.hypernym_paths()
        print(synset)
        print(len(paths))
        print(synset.definition())
        for path in paths:
            print(simple_path(path))
        print()
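simple_path is not defined in this snippet; a plausible minimal stand-in (an assumption, not the original helper) would render a hypernym path as a readable chain:

def simple_path(path):
    # Hypothetical helper: join the synset names along a hypernym path.
    return ' -> '.join(s.name() for s in path)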
Example #20
    "january", "fabruary", "march", "april", "may", "june", "july", "august",
    "september", "october", "november", "december"
]
#with open('data/words.txt') as f:
#    dictionary = f.readlines()

dictionary = set(line.strip() for line in open('data/words.txt'))
#dictionary = set(line.strip() for line in open('data/words2.txt'))

common_names = set(line.strip() for line in open('data/common_names.txt'))

common_surnames = set(line.strip()
                      for line in open('data/common_surnames_conv3.txt'))

print('loading wordnet')
wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)
print('done loading')
S = wn.synset
L = wn.lemma

tweetSentences = list([])


class TokenSentenceData:
    def __init__(self, token, tokenId):
        self.token = token  # instance variable unique to each instance
        self.tokenId = tokenId

    def __str__(self):
        return self.token
Example #21
from nltk.corpus.reader.wordnet import WordNetCorpusReader
from matplotlib import pyplot as plt
from matplotlib_venn import venn3_unweighted

wn = WordNetCorpusReader("./resources/WordNet-3.0/dict",None)

adjectives = {a for a in wn.all_synsets('a')}
attributes = {n for n in wn.all_synsets('n') if n.lexname() == 'noun.attribute'}

direct_attributes = {attribute for adjective in adjectives
                               for attribute in adjective.attributes()}
morphologically_related = {related_lemma.synset() for adjective in adjectives
                                                  for lemma in adjective.lemmas()
                                                  for related_lemma in lemma.derivationally_related_forms()
                                                  if related_lemma.synset().pos() == 'n'}

diagram = venn3_unweighted([attributes, direct_attributes, morphologically_related],
                ['labeled as\nnoun.attribute', 'direct\nattributes', 'morphologically\nrelated nouns'])

for patch in diagram.patches:
    patch.set_edgecolor('k')
    patch.set_facecolor('w') # remove this line for color diagram.

plt.savefig('./images/venn.pdf')
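Note that matplotlib_venn is a third-party package (pip install matplotlib-venn) and that the ./images directory must already exist, since plt.savefig does not create it.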