Example #1
def derivational_forms(first, second):
    #Checks if there are any derivationally related forms of the word lemmas that match
    f_syns = wn.synsets(first)
    s_syns = wn.synsets(second)
    #print s_syns
    try:
        for i in f_syns:
            sub_i = str(i)[8:-2]
            curr_lemma = wn.lemma(sub_i + "." + sub_i[0:sub_i.index(".")])
            derived_forms = curr_lemma.derivationally_related_forms()

            for derived in derived_forms:

                rep1 = str(derived)[7:str(derived).index(".")]
                index = str(derived).index(rep1) + len(rep1) + 6
                rep2 = str(derived)[index:-2]

                if rep1 == second:
                    return 1
                elif rep1 != rep2:
                    if rep2 == second:
                        return 1

                for s in s_syns:
                    s_str = str(s)[8:str(s).index(".")]
                    sub_s = str(s)[8:-2]

                    if rep1 == s_str:
                        return 1.5
                    elif rep1 != rep2:
                        if rep2 == s_str:
                            return 1.5

                    s_curr_lemma = wn.lemma(sub_s + "." + s_str)
                    s_derived_forms = s_curr_lemma.derivationally_related_forms(
                    )

                    for match in s_derived_forms:

                        match_rep1 = str(match)[7:str(match).index(".")]
                        match_index = str(match).index(match_rep1) + len(match_rep1) + 6
                        match_rep2 = str(match)[match_index:-2]

                        if match_rep1 == first:
                            return 2
                        elif match_rep1 != match_rep2:
                            if match_rep2 == first:
                                return 2

                        if match in derived_forms or match_rep1 == rep1 or match_rep1 == rep2:
                            return 1
    except:
        pass

    return 0
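# A rough equivalent of the check above written against the Lemma/Synset accessors
# instead of slicing repr() strings; a sketch only, it does not reproduce the
# 1 / 1.5 / 2 scoring of the original function.
from nltk.corpus import wordnet as wn

def shares_derivational_form(first, second):
    targets = {second} | {l.name() for s in wn.synsets(second) for l in s.lemmas()}
    for synset in wn.synsets(first):
        for lem in synset.lemmas():
            for derived in lem.derivationally_related_forms():
                if derived.name() in targets:
                    return True
    return False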
Example #2
 def lemma(self, name, lang='eng'):
     lemma = wn.lemma(name, lang=lang)
     # lemma._vector = self._vector(lemma)
     # lemma._freqs = {}
     # for t in self._TOPICS:
     #     lemma._freqs[t] = self._lemma_freq(lemma, t)
     return lemma
Example #3
def _mk_synset(w):
    #
    # (synset form) cat.n.01 into the Synset object form
    # (lemma form) syndicate.n.01.crime_syndicate
    #

    word = w.strip().replace(' ', '_')

    pat_regular_form = re.compile(r".*[.]\d{2}$")
    pat_regular_lemma_form = re.compile(r".*[.]\d{2}[.].+$")

    if pat_regular_form.match(word):
        try:
            return wordnet.synset(word)
        except Exception as ex:
            try:
                # try the first for the stem word
                return wordnet.synsets(word.split('.')[0])[0]
            except Exception as ex:
                return None

    elif pat_regular_lemma_form.match(word):
        try:
            return wordnet.lemma(word).synset()
        except Exception as ex:
            return None

    else:
        print(' * Error, invalid synset name: [{}] skipping'.format(w))
        return None
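# Expected behaviour of the helper above (a usage sketch; outputs assume the
# standard NLTK WordNet data):
if __name__ == '__main__':
    print(_mk_synset('cat.n.01'))                        # Synset('cat.n.01')
    print(_mk_synset('syndicate.n.01.crime_syndicate'))  # Synset('syndicate.n.01')
    print(_mk_synset('not a synset'))                    # prints the error message, returns None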
Example #4
def _mk_synset(w):
    #
    # (synset form) cat.n.01 into the Synset object form
    # (lemma form) syndicate.n.01.crime_syndicate
    #

    word = w.strip().replace(" ", "_")

    if word.count(".") == 2:
        try:
            return wordnet.synset(word)
        except Exception as ex:
            try:
                # try the first for the stem word
                return wordnet.synsets(word.split(".")[0])[0]
            except Exception as ex:
                return None

    elif word.count(".") == 3:
        try:
            return wordnet.lemma(word).synset()
        except Exception as ex:
            return None

    else:
        print(" * Error, invalid synset name", w, "skipping...")
        return None
Example #5
 def _antonyms(self):
     try:
         return wn.lemma("%s.%s.1.%s"%(self.wnbase,
             self.postag,
             self.lemma)).antonyms()
     except:
         return []
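# For context, the name assembled above has the form '<base>.<pos>.1.<lemma>'; a
# standalone sketch of the same lookup with assumed values wnbase='good',
# postag='a', lemma='good':
from nltk.corpus import wordnet as wn
print(wn.lemma("%s.%s.1.%s" % ("good", "a", "good")).antonyms())
# expected to include Lemma('bad.a.01.bad')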
Example #6
def lemma(name_synsets):
    """
        This function returns a lemma object given its name.

        .. note::
            Supports only the English language (*eng*).

        :param str name_synsets: name of the lemma, in the form ``synset.lemma`` (e.g. ``practice.v.01.exercise``)

        :return: lemma object with the given name
        :rtype: :class:`Lemma`

        :Example:

            >>> from pythainlp.corpus.wordnet import lemma
            >>>
            >>> lemma('practice.v.01.exercise')
            Lemma('practice.v.01.exercise')
            >>>
            >>> lemma('drill.v.03.exercise')
            Lemma('drill.v.03.exercise')
            >>>
            >>> lemma('exercise.n.01.exercise')
            Lemma('exercise.n.01.exercise')
    """
    return wordnet.lemma(name_synsets)
Example #7
def get_antonym(word):

    print "Antonym for: " + word

    if len(word.split()) > 1:
        word = word.replace(" ", "_")

    # the slow part
    wnsynset = wn.synsets(word)

    print "WYNSET" + str(wnsynset)
    antonym = None
    # only getting one antonym
    for i in wnsynset:
        for el in i.lemmas():
            x = el.antonyms()
            if len(x) > 0:
                print "Antonym"
                antonym = x[0].name()
                break
    syn_set = []
    if antonym is not None:
        print "synonyms for antonym " + str(antonym)

        if len(antonym.split()) > 1:
            word = antonym.replace(" ", "_")

        # the slow part
        wnsynset = wn.synsets(antonym)

        print "WYNSET" + str(wnsynset)

        for i in range(0, len(wnsynset)):
            for lemma in wnsynset[i].lemma_names():
                print "LEMMA"
                print lemma

                syn_set.append(lemma)

                deriv = wn.lemma(wnsynset[i].name() + "." + lemma)
                print "DERIVATIONS"
                for x in deriv.derivationally_related_forms():
                    print x.name
                    syn_set.append(x.name())

            print "Hyponym function: "
            for hypo in wnsynset[i].hyponyms():
                syn_set.append(re.findall(r"[a-zA-Z]*", hypo.name())[0])
                print re.findall(r"[a-zA-Z]*", hypo.name())[0]
            '''
            print "Hypernym function: " 
            for hyper in wnsynset[i].hypernyms():
                syn_set.append(re.findall(r"[a-zA-Z]*",hyper.name())[0])
                print re.findall(r"[a-zA-Z]*",hyper.name())[0]
            '''

    return syn_set
Example #8
def patched_lemma_from_key(key, wordnet=wordnet):
    try:
        lemma = wordnet.lemma_from_key(key)
    except WordNetError as e:
        if key in patching_data:
            lemma = wordnet.lemma(patching_data[key])
        elif '%3' in key:
            lemma = wordnet.lemma_from_key(key.replace('%3', '%5'))
        else:
            raise e
    return lemma
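# A quick round-trip sketch for the helper above: derive a sense key from a known
# lemma, then resolve it back (assumes patching_data is defined as in the project):
key = wordnet.lemma('dog.n.01.dog').key()          # e.g. 'dog%1:05:00::'
assert patched_lemma_from_key(key).name() == 'dog'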
Example #9
	def _generate_fingerprint(self, lemma):
		wordnet_lemma = wn.lemma(lemma)
		wordnet_lemmas = gen_fingerprints.get_related_lemmas(wordnet_lemma) 

		lemmas = [l.synset().name() + "." + l.name() for l in wordnet_lemmas]

		fp = set()
		for lemma in lemmas:
			bits = self._get_bits_for_lemma(lemma)
			fp.update(bits)
		return fp
Example #10
    def _generate_fingerprint(self, lemma):
        wordnet_lemma = wn.lemma(lemma)
        wordnet_lemmas = gen_fingerprints.get_related_lemmas(wordnet_lemma)

        lemmas = [l.synset().name() + "." + l.name() for l in wordnet_lemmas]

        fp = set()
        for lemma in lemmas:
            bits = self._get_bits_for_lemma(lemma)
            fp.update(bits)
        return fp
Example #11
    def extractWordsAndSynsets(self, filenameWords, filenameSynsets,  filenameLexemes):
        #file
        fWords = codecs.open(filenameWords, 'w', 'utf-8')
        fSynsets = codecs.open(filenameSynsets, 'w',  'utf-8')
        fLexemes = codecs.open(filenameLexemes, 'w',  'utf-8')

        wordCounter = 0
        wordCounterAll = 0
        synsetCounter = 0
        synsetCounterAll = 0
        lexemCounter = 0
        lexemCounterAll = 0

        ovv = []

        for pos in self.pos_list:
            for word in wn.all_lemma_names(pos=pos, lang=self.lang):
                wordCounterAll += 1
                self.WordIndex[word] = wordCounterAll
                fWords.write(word+" ")
                synsetInWord = 0
                for synset in wn.synsets(word, lang=self.lang):
                    lexemCounterAll += 1
                    synsetId = synset.name()
                    if self.Shared.in_vocab(synsetId):
                        synsetInWord += 1
                        if synsetId not in self.SynsetIndex:
                            fSynsets.write(synsetId + " " + self.Shared.getVectorAsString(self.Shared.model[synsetId]) + "\n")
                            synsetCounter += 1
                            self.SynsetIndex[synsetId] = synsetCounter

                        lexemCounter += 1
                        #lemma name
                        sensekey = wn.lemma(synset.name()+'.'+word).key()

                        fWords.write(sensekey + ",")
                        fLexemes.write(str(self.SynsetIndex[synsetId]) + " " + str(wordCounterAll) + "\n")
                    else:
                        ovv.append(synsetId)


                fWords.write("\n")
                if synsetInWord != 0:
                    wordCounter += 1
                else:
                    self.WordIndex[word] = -1
        fWords.close()
        fSynsets.close()
        fLexemes.close()
        print("   Words: %d / %d\n" % (wordCounter, wordCounterAll))
        print("  Synset: %d / %d\n" % (synsetCounter, synsetCounter + len(ovv)))
        print("  Lexems: %d / %d\n" % (lexemCounter, lexemCounterAll))
Example #12
def find_antonym(word, pos_tag):
  global print_statements

  tag = pos_tag[0:2]
  if tag != 'JJ':
    return word

  s = str(wn.lemma(word+".a.01."+word).antonyms())
  
  if print_statements:
    print "Found antonym:", s
  
  start = s.find("'")
  end = s.find(".")
  result = s[start+1:end]
  return result
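# The same lookup without parsing the repr() string; a sketch that keeps the
# original assumption that the word is itself a lemma of its first adjective
# sense (wn is the nltk.corpus.wordnet module, as above):
def find_antonym_direct(word, pos_tag):
    if pos_tag[0:2] != 'JJ':
        return word
    try:
        ants = wn.lemma(word + ".a.01." + word).antonyms()
        return ants[0].name() if ants else word
    except Exception:
        return word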
Example #13
def find_antonym(word, pos_tag):
  global print_statements

  tag = pos_tag[0:2]
  if tag != 'JJ':
    return word

  s = str(wn.lemma(word+".a.01."+word).antonyms())
  
  if print_statements:
    print "Found antonym:", s
  
  start = s.find("'")
  end = s.find(".")
  result = s[start+1:end]
  return result
Example #14
def tokens(sent, palabra):
    keyword = 0
    sent
    for word in sent:
        keyword = keyword + 1
        if palabra == word:
            synsent_palabra = wn.synset(palabra + ".n.01")
            s_palabra = str(synsent_palabra)
            su_hiponimo = synsent_palabra.hyponyms()
            if su_hiponimo:
                print("Estos son sus hiponimos de ", palabra + ":",
                      sorted(su_hiponimo[0:]))
            else:
                print("No se encontraron hiponimos")

            su_hiperonimos = synsent_palabra.hypernyms()
            if su_hiperonimos:
                print("Estos son sus hyperonimos", palabra + ":",
                      sorted(su_hiperonimos[0:]))
            else:
                print("No se encontraro hipernomios")

            su_holonimo = synsent_palabra.member_holonyms()
            if su_holonimo:
                print("Estos son sus holonimos de ", palabra + ":",
                      su_holonimo)
            else:
                print("Para esa palabra no se encontraron holonimos")

            su_consecutivo_logico = synsent_palabra.entailments()
            if su_consecutivo_logico:
                print("Estos son sus consecutivos logico de ", palabra + ":",
                      su_consecutivo_logico)
            else:
                print("Para esa plabra no se encontraron consecutivos logicos")

            antonimo = [
                str(lemma.name()) for lemma in synsent_palabra.lemmas()
            ]
            el_antonimo = str(palabra + ".n.01." + antonimo[0])
            su_antonimo = wn.lemma(el_antonimo).antonyms()
            print(antonimo)
            if su_antonimo:
                print("Estos son sus antonimos de ", palabra + ":",
                      su_antonimo)
            else:
                print("Para esa palabra no se encontraron antonimos")
Example #15
def relations():

    wn.synset('tree.n.01').part_meronyms()
    wn.synset('tree.n.01').substance_meronyms()
    wn.synset('tree.n.01').member_holonyms()

    for synset in wn.synsets('mint', wn.NOUN):
        print synset.name + ':', synset.definition

    wn.synset('mint.n.04').part_holonyms()
    wn.synset('mint.n.04').substance_holonyms()

    wn.synset('walk.v.01').entailments()
    wn.synset('eat.v.01').entailments()
    wn.synset('tease.v.03').entailments()

    wn.lemma('supply.n.02.supply').antonyms()
    wn.lemma('rush.v.01.rush').antonyms()
    wn.lemma('horizontal.a.01.horizontal').antonyms()
    wn.lemma('staccato.r.01.staccato').antonyms()
Example #16
def relations():

    wn.synset('tree.n.01').part_meronyms()
    wn.synset('tree.n.01').substance_meronyms()
    wn.synset('tree.n.01').member_holonyms()

    for synset in wn.synsets('mint', wn.NOUN):
        print synset.name + ':', synset.definition

    wn.synset('mint.n.04').part_holonyms()
    wn.synset('mint.n.04').substance_holonyms()

    wn.synset('walk.v.01').entailments()
    wn.synset('eat.v.01').entailments()
    wn.synset('tease.v.03').entailments()

    wn.lemma('supply.n.02.supply').antonyms()
    wn.lemma('rush.v.01.rush').antonyms()
    wn.lemma('horizontal.a.01.horizontal').antonyms()
    wn.lemma('staccato.r.01.staccato').antonyms()
Example #17
def antonym_dict(word_list):
  CSAT_ant_dict = {}
  for syn in word_list:
    name_list = []
    syns = wn.synsets(syn)
    names = [s.name() for s in syns]
    for name in names:
      if syn in name:
        name_list.append(name)
      try:
        for n in name_list:
          stem_name = n.split('.')[0]
          atn = wn.lemma('{}.{}'.format(n, stem_name)).antonyms()
          if atn:
            print('the antonym of {} is {}'.format(stem_name, atn))
            CSAT_ant_dict[stem_name] = atn
      except:
        pass
  return CSAT_ant_dict
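# Example call for the function above (a sketch; the exact lemmas returned depend
# on the WordNet senses found for each word):
print(antonym_dict(['increase']))
# e.g. {'increase': [Lemma('decrease.n.01.decrease')]}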
Example #18
def wordnet():

    wn.synsets('motorcar')
    wn.synset('car.n.01').lemma_names
    wn.synset('car.n.01').definition
    wn.synset('car.n.01').examples

    wn.synset('car.n.01').lemmas
    wn.lemma('car.n.01.automobile')
    wn.lemma('car.n.01.automobile').synset
    wn.lemma('car.n.01.automobile').name

    wn.synsets('car')
    for synset in wn.synsets('car'):
        print synset.lemma_names

    wn.lemmas('car')
Example #19
def wordnet():

    wn.synsets('motorcar')
    wn.synset('car.n.01').lemma_names
    wn.synset('car.n.01').definition
    wn.synset('car.n.01').examples

    wn.synset('car.n.01').lemmas
    wn.lemma('car.n.01.automobile') 
    wn.lemma('car.n.01.automobile').synset
    wn.lemma('car.n.01.automobile').name

    wn.synsets('car')
    for synset in wn.synsets('car'):
        print synset.lemma_names

    wn.lemmas('car')
Example #20
def playWithWordNet(word):
    syn = wn.synsets(word)
    print(syn)

    syns = wn.synset('strange.a.01').lemma_names()
    syns2 = wn.synset('strange.s.02').lemma_names()
    defn = wn.synset('strange.s.02').definition()
    ex = wn.synset('strange.s.02').examples()
    lems = wn.synset('strange.s.02').lemmas()
    # name = wn.lemma('strange.s.02').name()
    print(syns, defn, ex)

    for synset in syn:
        print(synset.lemma_names())

    stranges = wn.lemmas('strange')
    print(stranges)

    synset1 = wn.synset('strange.s.02')
    types_of_strange = synset1.hyponyms()
    supersets_of_strange = synset1.hypernyms()
    root_hypernyms = synset1.root_hypernyms()
    paths = synset1.hypernym_paths()
    path1 = [synset.name() for synset in paths[0]]
    print(types_of_strange, supersets_of_strange, root_hypernyms, paths, path1)

    tree = wn.synset('human.n.01')
    parts = tree.part_meronyms()
    subst_parts = tree.substance_meronyms()
    wholes = tree.member_holonyms()
    print(tree, parts, subst_parts, wholes)

    entails = wn.synset('walk.v.01').entailments()

    antys = wn.lemma('rush.v.01.rush').antonyms()

    specificity = wn.synset('baleen_whale.n.01').min_depth()
Example #21
def semanticScore(word):

    pluralizer = inflect.engine()

    syn_set = []

    wnsynset = wn.synsets(word)

    syn_set_final = []

    for i in range(0, len(wnsynset)):
        for lemma in wnsynset[i].lemma_names():

            syn_set.append(lemma)
            deriv = wn.lemma(wnsynset[i].name() + "." + lemma)
            for x in deriv.derivationally_related_forms():
                syn_set.append(x.name())
        #print "Hypernym function: "
        for hyper in wnsynset[i].hypernyms():
            syn_set.append(re.findall(r"[a-zA-Z]*", hyper.name())[0])
        #print "Hyponym function: "
        for hypo in wnsynset[i].hyponyms():
            syn_set.append(re.findall(r"[a-zA-Z]*", hypo.name())[0])

        # adds plurals and removes dups

        syn_setnodup = []
        for item in syn_set:
            if item not in syn_setnodup:
                syn_setnodup.append(item)

        syn_set_final = []
        for item in syn_setnodup:
            syn_set_final.append(item)
            syn_set_final.append(pluralizer.plural(item))

    return syn_set_final
Example #22
from nltk.corpus import wordnet as wn

def get_antonyms(lemma):
    antonyms = [ant.name for ant in lemma.antonyms()]
    antonyms.extend([ant.name for similar in lemma.synset.similar_tos() for lemmas in similar.lemmas for ant in lemmas.antonyms()])
    return antonyms

if __name__ == "__main__":
    print(get_antonyms(wn.lemma('alacritous.s.01.alacritous')))
    print(get_antonyms(wn.lemma('sluggish.s.01.sluggish')))
    print(get_antonyms(wn.lemma('adust.s.01.parched')))
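# The snippet above targets the old NLTK 2.x attribute-style API; a sketch of the
# same lookup against the current method-call API:
from nltk.corpus import wordnet as wn

def get_antonyms_v3(lemma):
    antonyms = [ant.name() for ant in lemma.antonyms()]
    antonyms.extend(ant.name()
                    for similar in lemma.synset().similar_tos()
                    for lem in similar.lemmas()
                    for ant in lem.antonyms())
    return antonyms

print(get_antonyms_v3(wn.lemma('alacritous.s.01.alacritous')))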
Example #23
#!/usr/bin/python
#===================================================================
# This codelet reads the vocabulary lemmas and verifies that each
# is found in NLTK WordNet.  Some lemmas in WordNet cannot be looked
# up because of parsing errors due to dots (.) in the lemma name.
# Copyright 2014, IEEE ENCS Humanoid Robot Project
#===================================================================

from nltk.corpus import wordnet as wn

with open('vocab_lemmas.txt', 'r') as f:
    for line in f:
        try:
            wn.lemma(line.strip())  # will blow up if line isn't a lemma
        except:
            print line.strip()
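# A self-contained variant of the same check, driven from WordNet's own lemma list
# instead of vocab_lemmas.txt; it reports lemma names containing '.' that fail the
# "<synset>.<lemma>" round trip (the parsing problem mentioned in the header comment):
dotted = [n for n in wn.all_lemma_names() if '.' in n][:5]
for name in dotted:
    syn = wn.synsets(name)[0]
    try:
        wn.lemma(syn.name() + '.' + name)
        print(name + ' ok')
    except Exception as e:
        print(name + ' -> ' + type(e).__name__)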
Example #24
print()
syns = wn.synsets("dog")
print(syns)
print(wn.synsets('dog', pos=wn.VERB))  # chase: kovalamak
print(wn.synset('dog.n.01').definition(), "\n")

print(len(wn.synset('dog.n.01').examples()))  # 1
print(wn.synset('dog.n.01').examples()[0], "\n")  # The dog barked all night

print(
    "lemmas: ",
    wn.synset('dog.n.01').lemmas()
)  #[Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')]
[str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()
 ]  #['dog', 'domestic_dog', 'Canis_familiaris']
print(wn.lemma('dog.n.01.dog').synset(), "\n")  #Synset('dog.n.01')
"""For example, pigeon, crow, eagle and seagull are all hyponyms of bird (their hypernym); which, in turn, is a hyponym of animal.[3]"""
dog = wn.synset('dog.n.01')
print(
    "hypernyms : ",
    dog.hypernyms())  #[Synset('canine.n.02'), Synset('domestic_animal.n.01')]
print(
    "hyponyms: ", dog.hyponyms()
)  # doctest: +ELLIPSIS [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...]
print("member_holonyms: ",
      dog.member_holonyms())  # [Synset('canis.n.01'), Synset('pack.n.06')]
print("root_hypernyms: ", dog.root_hypernyms())  #[Synset('entity.n.01')]
print("lowest_common_hypernyms cat/dog: ",
      wn.synset('dog.n.01').lowest_common_hypernyms(
          wn.synset('cat.n.01')))  #[Synset('carnivore.n.01')]
Example #25
def extractFeatureValues(sent, j, usePredictedLabels=True, orders={0,1}, indexer=None,
                         candidatesThisSentence=None):
    '''
    Extracts a map of feature names to values for a particular token in a sentence.
    These can be aggregated to get the feature vector or score for a whole sentence.
    These replicate the features used in Ciaramita and Altun, 2006 
    
    @param sent: the labeled sentence object to extract features from
    @param j: index of the word in the sentence to extract features for
    @param usePredictedLabels: whether to use predicted labels or gold labels (if available) 
    for the previous tag. This only applies to first-order features.
    @param orders: list of orders; e.g. if {1}, only first-order (tag bigram) features will be extracted
    @return: feature name -> value
    '''
    
    (lexiconCandidates, listCandidates), supersenseCandidates = candidatesThisSentence or (({}, {}), [])
    
    ff = IndexedFeatureMap(indexer) if indexer is not None else {}
    
    # note: in the interest of efficiency, we use tuples rather than string concatenation for feature names
    
    # previous label feature (first-order Markov dependency)
    if 1 in orders and hasFirstOrderFeatures() and j>0:
            ff["prevLabel=",(sent[j-1].prediction if usePredictedLabels else sent[j-1].gold)] = 1
    
    if 0 in orders:
        # bias
        ff[()] = 1
        
        
         
        # original token, token position-in-sentence features
        if sent[j].token[0].isupper():
            #ff['capitalized_BOS' if j==0 else 'capitalized_!BOS'] = 1 # old version of feature (in mweFeatures)
            nCap = sum(1 for tkn in sent if tkn.token[0].isupper())
            if j==0:
                ff['capitalized_BOS'] = 1
                if nCap>=(len(sent)-nCap):
                    ff['capitalized_BOS_majcap'] = 1
            else:
                ff['capitalized_!BOS'] = 1
                if nCap>=(len(sent)-nCap):
                    ff['capitalized_!BOS_majcap'] = 1
                if sent[0].token[0].islower():
                    ff['capitalized_!BOS_BOSlower'] = 1
        ff['shape', sent[j].shape] = 1
        if j<2:
            ff['offset_in_sent=',str(j)] = 1
        if len(sent)-j<2:
            ff['offset_in_sent=',str(j-len(sent))] = 1
        
        # lowercased token features
        w = sent[j].token.lower()
        
        # - prefix (up to 4)
        # - suffix (up to 4)
        for k in range(4):
            ff['w[:{}]'.format(k+1), w[:k+1]] = 1
            ff['w[{}:]'.format(-k-1), w[-k-1:]] = 1
        
        # - special characters
        for c in w:
            if c.isdigit():
                ff['has-digit'] = 1
            elif not c.isalpha():
                ff['has-char', c] = 1
        
        # - context word up to 2 away
        # - context POS up to 2 words away
        # - context word bigram
        # - context POS bigram
        # - current lemma and context lemma up to 2 words away, if one of them is a verb 
        #   and the other is a noun, verb, adjective, adverb, preposition, or particle
        for k in range(j-2,j+3):
            if k<0: continue
            elif k>len(sent)-1: break
            ff['w_{:+}'.format(k-j), sent[k].token.lower()] = 1
            ff['pos_{:+}'.format(k-j), sent[k].pos] = 1
            if k!=j and ( \
                    (sent[k].pos[0]=='V' and sent[j].pos[0] in {'V','N','J','I','R','T'}) \
                 or (sent[j].pos[0]=='V' and sent[k].pos[0] in {'V','N','J','I','R','T'})):
                    ff['lemma_+0,{:+}'.format(k-j), sent[j].stem, sent[k].stem] = 1
            if k<j+2 and k<len(sent)-1:
                if useTokenBigrams: ff['w_{:+},{:+}'.format(k-j,k-j+1), sent[k].token.lower(), sent[k+1].token.lower()] = 1
                ff['pos_{:+},{:+}'.format(k-j,k-j+1), sent[k].pos, sent[k+1].pos] = 1
            if clusterMap and (k==j or abs(k-j)==1): # current and neighbor clusters
                clustid, keywords = wordClusterID(sent[k].token.lower())
                ff['c_{:+1}'.format(k-j), clustid, keywords or ''] = 1
                if k!=j:
                    ff['lemma_+0,c_{:+}'.format(k-j), sent[j].stem, clustid, keywords or ''] = 1
        
        # - word + context POS
        # - POS + context word
        if j>0:
            ff['w_+0_pos_-1', sent[j].token.lower(), sent[j-1].pos] = 1
            ff['w_-1_pos_+0', sent[j-1].token.lower(), sent[j].pos] = 1
        if j<len(sent)-1:
            ff['w_+0_pos_+1', sent[j].token.lower(), sent[j+1].pos] = 1
            ff['w_+1_pos_+0', sent[j+1].token.lower(), sent[j].pos] = 1
        
        
        # - auxiliary verb/main verb (new relative to mweFeatures)
        if coarsen(sent[j].pos)=='V':
            cposes = [coarsen(tok.pos) for tok in sent[j:]]
            if len(cposes)>1 and cposes[1]=='V':
                # followed by another verb: probably an aux (though there are exceptions: 
                # "try giving", "all people want is", etc.)
                ff['auxverb'] = 1
            elif len(cposes)>2 and cposes[1]=='R' and cposes[2]=='V':
                # followed by an adverb followed by a verb: probably an aux
                ff['auxverb'] = 1
            else:
                ff['mainverb'] = 1
        
        
        # lexicon features
        
        if not wn.lemmas(sent[j].stem):
            if useWNOOV: ff['OOV',sent[j].pos] = 1
            wn_pos_setS = '{}'
        else:
            wn_pos_set = frozenset({lem.synset().pos().replace('s','a') for lem in wn.lemmas(sent[j].stem)})
            wn_pos_setS = '{'+repr(tuple(wn_pos_set))[1:-1]+'}'
        
        # - WordNet supersense (new relative to mweFeatures)
        extractWNSupersenseFeat(ff, j, supersenseCandidates)
        
        if useWNCompound:
            # - compound
            if sent[j].pos.isalnum():
                prevtok = None
                for tok in sent[j-1::-1]:
                    if tok.pos=='HYPH':
                        continue
                    elif tok.pos.isalnum():
                        prevtok = tok
                    break
                nexttok = None
                for tok in sent[j+1:]:
                    if tok.pos=='HYPH':
                        continue
                    elif tok.pos.isalnum():
                        nexttok = tok
                    break
                
                if sent[j].pos=='HYPH':
                    if isCompound(prevtok,nexttok):
                        ff['compound_left_right'] = 1
                else:
                    if isCompound(prevtok,sent[j]):
                        ff['compound_left'] = 1
                    if isCompound(sent[j],nexttok):
                        ff['compound_right'] = 1
        
        
        nMatches = Counter()
        for lexiconname,segmentation in lexiconCandidates.items():
            toffset,tag,expr_tokens,is_gappy_expr,entry = segmentation[j]
            assert toffset==j
            if lexiconname=='wordnet_mwes':
                if entry:
                    try:
                        mw_pos_set = frozenset(wn.lemma(wnlemma).synset().pos().replace('s','a') for wnlemma in entry["wnlemmas"])
                    except:
                        print(entry, file=sys.stderr)
                        raise
                    mw_pos_setS = '{'+repr(tuple(mw_pos_set))[1:-1]+'}'
                    ff['wn',wn_pos_setS,tag,mw_pos_setS] = 1
                else:
                    ff['wn',wn_pos_setS,tag] = 1
            
            if tag.upper()!='O':
                lbl = entry["label"]
                if not lbl.startswith('NE:') and SENSENUM.search(lbl):
                    lbl = '<sense-tagged>'
                ff['lex',lexiconname,tag.upper(),str(is_gappy_expr),lbl] = 1
                if True or entry["datasource"].lower()!='wikimwe':   # TODO: OK to remove constraint for wikimwe?
                    p1 = sent[expr_tokens[0]].pos
                    p2 = sent[expr_tokens[-1]].pos
                    ff['lex',lexiconname,tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                    nMatches[p1,p2] += 1
                nMatches[None,None] += 1
            else:
                ff['lex',lexiconname,'O'] = 1
            
        if nMatches[None,None]==0:
            ff['#lex-matches=','0'] = 1
        else:
            for n in range(1,nMatches[None,None]+1):
                ff['#lex-matches>=',str(n)] = 1
            for (p1,p2),N in nMatches.items():
                if (p1,p2)!=(None,None):
                    for n in range(1,N+1):
                        ff['#lex-matches',p1,'...',p2,'>=',str(n)] = 1
        
        #sentpos = ''.join(coarsen(w.pos) for w in sent)
        #cposj = coarsen(sent[j].pos)
        
        
        # - collocation extraction lists
        # lists for 6 collocation classes: adj-noun noun-noun preposition-noun verb-noun verb-preposition verb-particle 
        # each list ranks lemma pairs using the t-test.
        # considering each list separately, we segment the sentence preferring higher-ranked items 
        # (requiring lemmas and coarse POSes to match). 
        # fire features indicating (a) B vs. I match, and (b) whether the rank is in the top 
        # {25,50,75,100,150,200,300,...,900,1000,2000,...,9000,10k,20k,...90k,100k,200k,...}, 
        # (c) gappiness?
        
        
        for listname,segmentation in listCandidates.items():
            toffset,tag,expr_tokens,is_gappy_expr,entry = segmentation[j]
            assert toffset==j
            
            if tag.upper()!='O':
                lbl = entry["label"]
                is_phrasinator = (entry["datasource"].lower().startswith('phrasinator'))
                ff['list',listname,tag.upper(),str(is_gappy_expr),lbl] = 1
                
                
                p1 = sent[expr_tokens[0]].pos
                p2 = sent[expr_tokens[-1]].pos
                if is_phrasinator:
                    ff['list',listname,tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                r = entry["rank"]
                for t in THRESHOLDS:
                    if r>t: break
                    ff['list',listname,'rank<={}'.format(t), tag.upper(),str(is_gappy_expr),lbl] = 1
                    if is_phrasinator:
                        ff['list',listname,'rank<={}'.format(t), tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                
            else:
                ff['list',listname,'O'] = 1
                
    return ff
Example #26
def latihan_wordnet():
    wn.synsets('motorcar')
    wn.synset('car.n.01').lemma_names
    wn.synset('car.n.01').lemmas
    wn.synset('car.n.01').definition
    wn.synset('car.n.01').examples
    wn.synset('car.n.01').definition
    #'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
    wn.synset('car.n.01').examples
    #['he needs a car to get to work']
    #or
    wn.lemmas('car')
    #[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'),
    #Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]
    wn.synsets('car')
    #[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'),
    #Synset('cable_car.n.01')]
    for synset in wn.synsets('car'):
       print synset.lemma_names
    #['car', 'auto', 'automobile', 'machine', 'motorcar']
    #['car', 'railcar', 'railway_car', 'railroad_car']
    #['car', 'gondola']
    #['car', 'elevator_car']
    #['cable_car', 'car']
    motorcar = wn.synset('car.n.01')
    types_of_motorcar = motorcar.hyponyms()
    len(types_of_motorcar)
    #31
    types_of_motorcar[26]
    #Synset('ambulance.n.01')
    sorted([lemma.name for synset in types_of_motorcar for lemma in synset.lemmas])
    #['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', 'ambulance', 'beach_waggon',
    #...]
    motorcar.hypernyms()
    #[Synset('motor_vehicle.n.01')]
    paths = motorcar.hypernym_paths()
    len(paths)
    #2
    [synset.name for synset in paths[0]]
    #['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01',
    #'instrumentality.n.03', 'container.n.01', 'wheeled_vehicle.n.01',
    #'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
    [synset.name for synset in paths[1]]
    #['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01',
    #'instrumentality.n.03', 'conveyance.n.03', 'vehicle.n.01', 'wheeled_vehicle.n.01',
    #'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
    motorcar.root_hypernyms()
    #[Synset('entity.n.01')]
    wn.synset('tree.n.01').part_meronyms()
    #[Synset('burl.n.02'), Synset('crown.n.07'), Synset('stump.n.01'),
    #Synset('trunk.n.01'), Synset('limb.n.02')]
    wn.synset('tree.n.01').substance_meronyms()
    #[Synset('heartwood.n.01'), Synset('sapwood.n.01')]
    wn.synset('tree.n.01').member_holonyms()
    #[Synset('forest.n.01')]
    for synset in wn.synsets('mint', wn.NOUN):
        print synset.name + ':', synset.definition
    #batch.n.02: (often followed by `of') a large number or amount or extent
    #mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and
    #small mauve flowers
    #mint.n.03: any member of the mint family of plants
    #mint.n.04: the leaves of a mint plant used fresh or candied
    #mint.n.05: a candy that is flavored with a mint oil
    #mint.n.06: a plant where money is coined by authority of the government
    wn.synset('mint.n.04').part_holonyms()
    #[Synset('mint.n.02')]
    wn.synset('mint.n.04').substance_holonyms()
    #[Synset('mint.n.05')]
    wn.synset('walk.v.01').entailments()
    #[Synset('step.v.01')]
    wn.synset('eat.v.01').entailments()
    #[Synset('swallow.v.01'), Synset('chew.v.01')]
    wn.synset('tease.v.03').entailments()
    #[Synset('arouse.v.07'), Synset('disappoint.v.01')]
    #Antonym
    wn.lemma('supply.n.02.supply').antonyms()
    #[Lemma('demand.n.02.demand')]
    wn.lemma('rush.v.01.rush').antonyms()
    #[Lemma('linger.v.04.linger')]
    wn.lemma('horizontal.a.01.horizontal').antonyms()
    #[Lemma('vertical.a.01.vertical'), Lemma('inclined.a.02.inclined')]
    wn.lemma('staccato.r.01.staccato').antonyms()
    #[Lemma('legato.r.01.legato')]

    #Semantic Similarity
    #The closer the path between two lemmas, the more similar their semantic meaning
    right = wn.synset('right_whale.n.01')
    orca = wn.synset('orca.n.01')
    minke = wn.synset('minke_whale.n.01')
    tortoise = wn.synset('tortoise.n.01')
    novel = wn.synset('novel.n.01')
    print right.lowest_common_hypernyms(minke)
    #[Synset('baleen_whale.n.01')]
    print right.lowest_common_hypernyms(orca)
    #[Synset('whale.n.02')]
    print right.lowest_common_hypernyms(tortoise)
    #[Synset('vertebrate.n.01')]
    print right.lowest_common_hypernyms(novel)
    #[Synset('entity.n.01')]
    print wn.synset('baleen_whale.n.01').min_depth()
    #14
    print wn.synset('whale.n.02').min_depth()
    #13
    print wn.synset('vertebrate.n.01').min_depth()
    #8
    print wn.synset('entity.n.01').min_depth()
    #0
    print right.path_similarity(minke)
    #0.25
    print right.path_similarity(orca)
    #0.16666666666666666
    print right.path_similarity(tortoise)
    #0.076923076923076927
    print right.path_similarity(novel)
    #0.043478260869565216

    ##nltk web
    #from __future__ import division
    import nltk, re, pprint

    from urllib import urlopen  # needed for urlopen() below (Python 2)
    url = "http://www.gutenberg.org/files/2554/2554.txt"
    raw = urlopen(url).read()
    len(raw)
    raw[:75]

    #from __future__ import division
    #import nltk, re, pprint
    #from urllib import urlopen
    url = "http://www.gutenberg.org/files/2554/2554.txt"
    print "Accessing gutenberg #2554..."
    raw = urlopen(url).read()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    text.concordance("Gutenberg")
    text.collocations()
    text.similar("Gutenberg")

    #Accessing data with HTML tags
    url = 'http://news.bbc.co.uk/2/hi/health/2284783.stm'
    htmlsite = urlopen(url)
    htmldata = htmlsite.read()
    htmlraw = nltk.clean_html(htmldata)
    htmltokens = nltk.word_tokenize(htmlraw)
    htmltexts = nltk.Text(htmltokens)
    htmltexts.concordance('gene')

    #Accessing a local file
    f = open('document.txt', 'r')
    data = f.read()
    f.close()
    tokens = nltk.word_tokenize(data)
    texts = nltk.Text(tokens)
    texts.concordance('gene')

    #Writing a local file
    f = open('document.txt', 'w')
    for word in sorted(htmltexts):
        f.write(word + '\n')

    #Accessing an RSS feed
    import feedparser
    url = 'http://news.bbc.co.uk/2/hi/health/2284783.stm'
    htmlsite = urlopen(url)
    htmldata = htmlsite.read()
    htmlraw = nltk.clean_html(htmldata)
    htmltokens = nltk.word_tokenize(htmlraw)
    htmltexts = nltk.Text(htmltokens)
    htmltexts.concordance('gene')

    #Python and PyScripter
    import os
    os.chdir('path\to\tugas')
    import  tugas
    reload(tugas)
    #NLTK and Text
    import nltk
    data = 'An example sentence to be analysed using NLTK'
    tokens = nltk.word_tokenize(data)
    text = nltk.Text(tokens)
Example #27
import nltk
from nltk.corpus import wordnet as wn

def supergloss(s):
    res = s.definition()
    for hyper in s.hypernyms():
        res += ";" + hyper.definition()
    for hypo in s.hyponyms():
        res += ";" + hypo.definition()
    return res

print supergloss(wn.lemma('car.n.01.automobile').synset())

Example #28
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 17 22:14:04 2015

@author: mongolia19
"""

from nltk.corpus import wordnet as wn
wordAList = wn.synset('thin.a.02')
print wordAList
#keyA = wordAList[0]
wordBList = wn.synsets('fat')
print wordBList
keyB = wordBList[0]
wordCList = wn.synsets('people')
keyC = wordCList[0]

#print keyA
#print keyB
#print keyC

score = wordAList.path_similarity(keyB)
print wn.lemma('fat.a.01.fat').antonyms()
#scoreA = keyC.path_similarity(keyB)
print score
Example #29
def extractFeatureValues(sent, j, usePredictedLabels=True, orders={0,1}, indexer=None,
                         candidatesThisSentence=None):
    '''
    Extracts a map of feature names to values for a particular token in a sentence.
    These can be aggregated to get the feature vector or score for a whole sentence.
    These replicate the features used in Ciaramita and Altun, 2006 
    
    @param sent: the labeled sentence object to extract features from
    @param j: index of the word in the sentence to extract features for
    @param usePredictedLabels: whether to use predicted labels or gold labels (if available) 
    for the previous tag. This only applies to first-order features.
    @param orders: list of orders; e.g. if {1}, only first-order (tag bigram) features will be extracted
    @return: feature name -> value
    '''
    
    (lexiconCandidates, listCandidates), supersenseCandidates = candidatesThisSentence or (({}, {}), [])
    
    ff = IndexedFeatureMap(indexer) if indexer is not None else {}
    
    # note: in the interest of efficiency, we use tuples rather than string concatenation for feature names
    
    # previous label feature (first-order Markov dependency)
    if 1 in orders and hasFirstOrderFeatures() and j>0:
            ff["prevLabel=",(sent[j-1].prediction if usePredictedLabels else sent[j-1].gold)] = 1
    
    if 0 in orders:
        # bias
        ff[()] = 1
        
        
         
        # original token, token position-in-sentence features
        if sent[j].token[0].isupper():
            #ff['capitalized_BOS' if j==0 else 'capitalized_!BOS'] = 1 # old version of feature (in mweFeatures)
            nCap = sum(1 for tkn in sent if tkn.token[0].isupper())
            if j==0:
                ff['capitalized_BOS'] = 1
                if nCap>=(len(sent)-nCap):
                    ff['capitalized_BOS_majcap'] = 1
            else:
                ff['capitalized_!BOS'] = 1
                if nCap>=(len(sent)-nCap):
                    ff['capitalized_!BOS_majcap'] = 1
                if sent[0].token[0].islower():
                    ff['capitalized_!BOS_BOSlower'] = 1
        ff['shape', sent[j].shape] = 1
        if j<2:
            ff['offset_in_sent=',str(j)] = 1
        if len(sent)-j<2:
            ff['offset_in_sent=',str(j-len(sent))] = 1
        
        # lowercased token features
        w = sent[j].token.lower()
        
        # - prefix (up to 4)
        # - suffix (up to 4)
        for k in range(4):
            ff['w[:{}]'.format(k+1), w[:k+1]] = 1
            ff['w[{}:]'.format(-k-1), w[-k-1:]] = 1
        
        # - special characters
        for c in w:
            if c.isdigit():
                ff['has-digit'] = 1
            elif not c.isalpha():
                ff['has-char', c] = 1
        
        # - context word up to 2 away
        # - context POS up to 2 words away
        # - context word bigram
        # - context POS bigram
        # - current lemma and context lemma up to 2 words away, if one of them is a verb 
        #   and the other is a noun, verb, adjective, adverb, preposition, or particle
        for k in range(j-2,j+3):
            if k<0: continue
            elif k>len(sent)-1: break
            ff['w_{:+}'.format(k-j), sent[k].token.lower()] = 1
            ff['pos_{:+}'.format(k-j), sent[k].pos] = 1
            if k!=j and ( \
                    (sent[k].pos[0]=='V' and sent[j].pos[0] in {'V','N','J','I','R','T'}) \
                 or (sent[j].pos[0]=='V' and sent[k].pos[0] in {'V','N','J','I','R','T'})):
                    ff['lemma_+0,{:+}'.format(k-j), sent[j].stem, sent[k].stem] = 1
            if k<j+2 and k<len(sent)-1:
                if useTokenBigrams: ff['w_{:+},{:+}'.format(k-j,k-j+1), sent[k].token.lower(), sent[k+1].token.lower()] = 1
                ff['pos_{:+},{:+}'.format(k-j,k-j+1), sent[k].pos, sent[k+1].pos] = 1
            if clusterMap and (k==j or abs(k-j)==1): # current and neighbor clusters
                clustid, keywords = wordClusterID(sent[k].token.lower())
                ff['c_{:+1}'.format(k-j), clustid, keywords or ''] = 1
                if k!=j:
                    ff['lemma_+0,c_{:+}'.format(k-j), sent[j].stem, clustid, keywords or ''] = 1
        
        # - word + context POS
        # - POS + context word
        if j>0:
            ff['w_+0_pos_-1', sent[j].token.lower(), sent[j-1].pos] = 1
            ff['w_-1_pos_+0', sent[j-1].token.lower(), sent[j].pos] = 1
        if j<len(sent)-1:
            ff['w_+0_pos_+1', sent[j].token.lower(), sent[j+1].pos] = 1
            ff['w_+1_pos_+0', sent[j+1].token.lower(), sent[j].pos] = 1
        
        
        # - auxiliary verb/main verb (new relative to mweFeatures)
        if coarsen(sent[j].pos)=='V':
            cposes = [coarsen(tok.pos) for tok in sent[j:]]
            if len(cposes)>1 and cposes[1]=='V':
                # followed by another verb: probably an aux (though there are exceptions: 
                # "try giving", "all people want is", etc.)
                ff['auxverb'] = 1
            elif len(cposes)>2 and cposes[1]=='R' and cposes[2]=='V':
                # followed by an adverb followed by a verb: probably an aux
                ff['auxverb'] = 1
            else:
                ff['mainverb'] = 1
        
        
        # lexicon features
        
        if not wn.lemmas(sent[j].stem):
            if useWNOOV: ff['OOV',sent[j].pos] = 1
            wn_pos_setS = '{}'
        else:
            wn_pos_set = frozenset({lem.synset().pos().replace('s','a') for lem in wn.lemmas(sent[j].stem)})
            wn_pos_setS = '{'+repr(tuple(wn_pos_set))[1:-1]+'}'
        
        # - WordNet supersense (new relative to mweFeatures)
        extractWNSupersenseFeat(ff, j, supersenseCandidates)
        
        if useWNCompound:
            # - compound
            if sent[j].pos.isalnum():
                prevtok = None
                for tok in sent[j-1::-1]:
                    if tok.pos=='HYPH':
                        continue
                    elif tok.pos.isalnum():
                        prevtok = tok
                    break
                nexttok = None
                for tok in sent[j+1:]:
                    if tok.pos=='HYPH':
                        continue
                    elif tok.pos.isalnum():
                        nexttok = tok
                    break
                
                if sent[j].pos=='HYPH':
                    if isCompound(prevtok,nexttok):
                        ff['compound_left_right'] = 1
                else:
                    if isCompound(prevtok,sent[j]):
                        ff['compound_left'] = 1
                    if isCompound(sent[j],nexttok):
                        ff['compound_right'] = 1
        
        
        nMatches = Counter()
        for lexiconname,segmentation in lexiconCandidates.items():
            toffset,tag,expr_tokens,is_gappy_expr,entry = segmentation[j]
            assert toffset==j
            if lexiconname=='wordnet_mwes':
                if entry:
                    try:
                        mw_pos_set = frozenset(wn.lemma(wnlemma).synset().pos().replace('s','a') for wnlemma in entry["wnlemmas"])
                    except:
                        print(entry, file=sys.stderr)
                        raise
                    mw_pos_setS = '{'+repr(tuple(mw_pos_set))[1:-1]+'}'
                    ff['wn',wn_pos_setS,tag,mw_pos_setS] = 1
                else:
                    ff['wn',wn_pos_setS,tag] = 1
            
            if tag.upper()!='O':
                lbl = entry["label"]
                if not lbl.startswith('NE:') and SENSENUM.search(lbl):
                    lbl = '<sense-tagged>'
                ff['lex',lexiconname,tag.upper(),str(is_gappy_expr),lbl] = 1
                if True or entry["datasource"].lower()!='wikimwe':   # TODO: OK to remove constraint for wikimwe?
                    p1 = sent[expr_tokens[0]].pos
                    p2 = sent[expr_tokens[-1]].pos
                    ff['lex',lexiconname,tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                    nMatches[p1,p2] += 1
                nMatches[None,None] += 1
            else:
                ff['lex',lexiconname,'O'] = 1
            
        if nMatches[None,None]==0:
            ff['#lex-matches=','0'] = 1
        else:
            for n in range(1,nMatches[None,None]+1):
                ff['#lex-matches>=',str(n)] = 1
            for (p1,p2),N in nMatches.items():
                if (p1,p2)!=(None,None):
                    for n in range(1,N+1):
                        ff['#lex-matches',p1,'...',p2,'>=',str(n)] = 1
        
        #sentpos = ''.join(coarsen(w.pos) for w in sent)
        #cposj = coarsen(sent[j].pos)
        
        
        # - collocation extraction lists
        # lists for 6 collocation classes: adj-noun noun-noun preposition-noun verb-noun verb-preposition verb-particle 
        # each list ranks lemma pairs using the t-test.
        # considering each list separately, we segment the sentence preferring higher-ranked items 
        # (requiring lemmas and coarse POSes to match). 
        # fire features indicating (a) B vs. I match, and (b) whether the rank is in the top 
        # {25,50,75,100,150,200,300,...,900,1000,2000,...,9000,10k,20k,...90k,100k,200k,...}, 
        # (c) gappiness?
        
        
        for listname,segmentation in listCandidates.items():
            toffset,tag,expr_tokens,is_gappy_expr,entry = segmentation[j]
            assert toffset==j
            
            if tag.upper()!='O':
                lbl = entry["label"]
                is_phrasinator = (entry["datasource"].lower().startswith('phrasinator'))
                ff['list',listname,tag.upper(),str(is_gappy_expr),lbl] = 1
                
                
                p1 = sent[expr_tokens[0]].pos
                p2 = sent[expr_tokens[-1]].pos
                if is_phrasinator:
                    ff['list',listname,tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                r = entry["rank"]
                for t in THRESHOLDS:
                    if r>t: break
                    ff['list',listname,'rank<={}'.format(t), tag.upper(),str(is_gappy_expr),lbl] = 1
                    if is_phrasinator:
                        ff['list',listname,'rank<={}'.format(t), tag.upper(),str(is_gappy_expr),lbl,p1,'...',p2] = 1
                
            else:
                ff['list',listname,'O'] = 1
                
    return ff
Example #30
#!/usr/bin/python
#coding:utf-8

# 2013/02/27

from nltk.corpus import reuters
from nltk.corpus import wordnet as wn

import nltk

wn.synsets('motorcar') # list of Synsets (synonym sets) for 'motorcar'
wn.synset('car.n.01').lemma_names # list of synonyms (synonymous lemma names); a list of strings
wn.synset('car.n.01').definition # the definition sentence
wn.synset('car.n.01').examples # example sentences
wn.synset('car.n.01').lemmas # all lemmas of the synset; a list of Lemma objects
wn.lemma('car.n.01.automobile') # look up a specific lemma; a Lemma object
wn.lemma('car.n.01.automobile').synset # the synset the lemma belongs to; a Synset
wn.lemma('car.n.01.automobile').name # the lemma's name; a string
wn.synsets('car') # list of the Synsets for 'car'
for synset in wn.synsets('car'): # take each synset of 'car' in turn
    print synset.lemma_names # print the synset's list of lemma names
wn.lemmas('car') # access all lemmas of the word 'car'; a list of Lemma objects

motorcar = wn.synset('car.n.01') # a specific Synset
types_of_motorcar = motorcar.hyponyms() # list of the synset's hyponym Synsets
types_of_motorcar[26] # one hyponym Synset
sorted(lemma.name for synset in types_of_motorcar for lemma in synset.lemmas) # take each hyponym synset, collect its lemma names, and sort them
motorcar.hypernyms() # list of hypernym Synsets
paths = motorcar.hypernym_paths() # hypernym paths
len(paths)
[synset.name for synset in paths[0]]
Example #31
from nltk.corpus import wordnet as wn

print wn.synsets("motorcar")

print wn.synset("car.n.01").lemma_names

print wn.synset("car.n.01").definition

print wn.synset("car.n.01").examples

print wn.synset("car.n.01").lemmas

print wn.lemma("supply.n.02.supply").antonyms()
Example #32

from nltk.corpus import wordnet as wn
print(wn.synsets('motorcar'))
print("-" * 40)

print(wn.synset('car.n.01').lemma_names())
print("-" * 40)

print(wn.synset('car.n.01').definition())
print(wn.synset('car.n.01').examples())
print("-" * 40)

print(wn.synset('car.n.01').lemmas())
print(wn.lemma('car.n.01.automobile'))
print(wn.lemma('car.n.01.automobile').synset())
print(wn.lemma('car.n.01.automobile').name())
print("-" * 40)

print(wn.synsets('car'))
for synset in wn.synsets('car'):
    print(synset.lemma_names())
print("-" * 40)

print(wn.lemmas('car'))
print("-" * 40)

print(wn.synsets('dish'))
print(wn.synset('dish.n.01').lemma_names())
print(wn.synset('dish.n.01').definition())
Example #33
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
wn.synsets('cat')
wn.synsets('cat', pos=wn.VERB)
wn.synset('cat.n.01')
print(wn.synset('cat.n.01').definition())
print(len(wn.synset('cat.n.01').examples()))
print(wn.synset('cat.n.01').lemmas())
print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()])
print(wn.lemma('cat.n.01.cat').synset())

Example #34
def get_antonym(word):

    print "Antonym for: " + word

    if len(word.split()) > 1:
        word = word.replace(" ","_")

    # the slow part
    wnsynset = wn.synsets(word)

    print "WYNSET" + str(wnsynset)
    antonym = None
    # only getting one antonym
    for i in wnsynset:
        for el in i.lemmas():
            x = el.antonyms()
            if len(x) > 0:
                print "Antonym"
                antonym = x[0].name()
                break
    syn_set = []
    if antonym is not None:
        print "synonyms for antonym " + str(antonym)


        if len(antonym.split()) > 1:
            word = antonym.replace(" ","_")

       

        # the slow part
        wnsynset = wn.synsets(antonym)

        print "WYNSET" + str(wnsynset)

        for i in range(0, len(wnsynset)):
            for lemma in wnsynset[i].lemma_names():
                print "LEMMA"
                print lemma
                
                syn_set.append(lemma)


                deriv = wn.lemma(wnsynset[i].name() +"."+ lemma)
                print "DERIVATIONS"
                for x in deriv.derivationally_related_forms():
                    print x.name
                    syn_set.append(x.name())

            print "Hyponym function: " 
            for hypo in wnsynset[i].hyponyms():
                syn_set.append(re.findall(r"[a-zA-Z]*",hypo.name())[0])
                print re.findall(r"[a-zA-Z]*",hypo.name())[0]

            '''
            print "Hypernym function: " 
            for hyper in wnsynset[i].hypernyms():
                syn_set.append(re.findall(r"[a-zA-Z]*",hyper.name())[0])
                print re.findall(r"[a-zA-Z]*",hyper.name())[0]
            '''

    return syn_set
Example #35
import nltk
from nltk.corpus import wordnet as wn

# for each sense of a word, there is a synset with an id consisting of one of the words,
#    whether it is noun, verb, adj or adverb and a number among the synsets of that word
# given word "dog", returns the ids of the synsets
wn.synsets('dog')

# given a synset id, find words/lemma names (the synonyms) of the first noun sense of "dog"
wn.synset('dog.n.01').lemma_names()

# given a synset id, find lemmas of the synset (a lemma pairs a word with a synset)
wn.synset('dog.n.01').lemmas()

# find synset of a lemma
wn.lemma('dog.n.01.domestic_dog').synset()

# find lemma names for all senses of a word
for synset in wn.synsets('dog'):
    print(synset, ":  ", synset.lemma_names())

# find definition of the first noun sense of dog, or namely, the dog.n.01 synset
wn.synset('dog.n.01').definition()

# display an example of the synset
wn.synset('dog.n.01').examples()

# or show the definitions for all the synsets of a word
for synset in wn.synsets('dog'):
    print(synset, ":  ", synset.definition())
Example #36
from nltk.corpus import wordnet as wn

for synset in list(wn.all_synsets('n'))[:10]:
    print(synset)


print(wn.synsets('dog', pos=wn.VERB))
print("*"*111)
print(wn.synset('dog.n.01'))
print(wn.synset('dog.n.01').definition())
print(len(wn.synset('dog.n.01').examples()))
print(wn.synset('dog.n.01').examples()[0])
print(wn.synset('dog.n.01').lemmas())
a = [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]
print(a)
print(wn.lemma('dog.n.01.dog').synset())
print("*"*111)
print(sorted(wn.langs()))
print(wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn'))
print(wn.synset('spy.n.01').lemma_names('jpn'))
print(wn.synset('dog.n.01').lemma_names('ita'))
print("*"*111)

dog = wn.synset('dog.n.01')
print(dog.hypernyms())
print(dog.hyponyms())
print(dog.member_holonyms())
print(dog.root_hypernyms())
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
print("*"*111)
Example #37
def get_semantic_score(word):

    print "STARTING semanticScore for" + word

    if len(word.split()) > 1:
        word = word.replace(" ","_")

    pluralizer = inflect.engine()

    syn_set = []

    # the slow part
    wnsynset = wn.synsets(word)

    print "WYNSET" + str(wnsynset)

    syn_set_final = []
    # not suitable for synonyms but good for relations
    abstractions = []


    for i in range(0, len(wnsynset)):

        
        for lemma in wnsynset[i].lemma_names():
            print "LEMMA"
            print lemma
            
            syn_set.append(lemma)

            
            deriv = wn.lemma(wnsynset[i].name() +"."+ lemma)
        
            print "DERIVATIONS"
            for x in deriv.derivationally_related_forms():
                print x.name()
                syn_set.append(x.name())

    syn_set_b = noDup(syn_set)

    if len(syn_set_b) < 11:
        print "FULL SYNONYMS INCLUDING ABSTRACTIONS"
        print syn_set_b
        
    for i in range(0, len(wnsynset)):
        print "Hypernym function: " 
        for hyper in wnsynset[i].hypernyms():

            # 15 in random - did it for fund to finance
            hyper = re.findall(r"[a-zA-Z]*",hyper.name())[0]
            if len(syn_set_b) > 10:

                abstractions.append(hyper)
            else:
                

                syn_set.append(hyper)
            print hyper
        
        print "Hyponym function: " 
        for hypo in wnsynset[i].hyponyms():
            hypo = re.findall(r"[a-zA-Z]*",hypo.name())[0]
            if len(syn_set_b) > 10:
                abstractions.append(hypo)
            else:
               
                syn_set.append(hypo)
            print hypo
        

        # adds plurals and removes dups
    
    syn_setnodup = noDup(syn_set)
    syn_set_final = []
    for item in syn_setnodup:
        syn_set_final.append(item.lower())
        syn_set_final.append(pluralizer.plural(item).lower())
    

    abstractions = noDup(abstractions)
    abstractions_final = []
    for item in abstractions:
        abstractions_final.append(item.lower())
        abstractions_final.append(pluralizer.plural(item).lower())
    
    uselesswords = ["issue", "issues", "organization", "organizations"]
   
    abstractions_final = [w for w in abstractions_final if w.lower() not in uselesswords]
    syn_set_final = [w for w in syn_set_final if w.lower() not in uselesswords]


    print "END semanticScore"

    return [syn_set_final, abstractions_final]
Example #38
0
from nltk.corpus import wordnet as wn

res = wn.synset('locomotive.n.01').lemma_names()
print(res)

resdef = wn.synset('ocean.n.01').definition()
print(resdef)

res_exm = wn.synset('good.n.01').examples()
print(res_exm)

res_a = wn.lemma('horizontal.a.01.horizontal').antonyms()
print(res_a)
Example #39
0
def lemma(name_synsets):
    return wordnet.lemma(name_synsets)
Example #40
0
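    # Fragment from a larger routine that builds WordNet graph edge lists
    # (hypernym, hyponym, member-holonym, and lemma-level relations).
    # The surrounding function is assumed to define to_str() and to initialise
    # synset_set, lemma_set and the various edge lists before this loop.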
    for synset in tqdm(list(wn.all_synsets())):
        synset_set.add(synset)
        for item in synset.hypernyms():
            hypernyms.append((to_str(synset), to_str(item)))
        for item in synset.hyponyms():
            hyponyms.append((to_str(synset), to_str(item)))
        for item in synset.member_holonyms():
            member_holonyms.append((to_str(synset), to_str(item)))

        # lemma_set
        for item in synset.lemmas():
            lemma_set.add(to_str(item))

    # lemma edge
    for lemma in tqdm(lemma_set):
        lemma = wn.lemma(lemma)
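        # to_str() is assumed to return the 'word.pos.nn.lemma_name' key format
        # (e.g. 'dog.n.01.dog') that wn.lemma() accepts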
        for item in lemma.derivationally_related_forms():
            derivationally_related_forms.append((to_str(lemma), to_str(item)))
        for item in lemma.pertainyms():
            pertainyms.append((to_str(lemma), to_str(item)))
        for item in lemma.antonyms():
            antonyms.append((to_str(lemma), to_str(item)))

    # node
    lemmas = []
    for item in lemma_set:
        lemmas.append(item)
    # lemmas = [to_str(it) for it in lemma_set]
    synsets = [to_str(it) for it in synset_set]
    words = list(wn.all_lemma_names())
Example #42
0
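# Assumes the script has already run `from nltk.corpus import wordnet as wn`
# and `dog = wn.synsets('dog')`, so dog[0] is Synset('dog.n.01').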
dog[0].definition()

dog[0].examples()
type(dog[0].examples())
len(dog[0].examples())

## LEMMA: represents a specific sense of a specific word
## =====
## ---------------------------------------------------

dog[0].lemmas()
type(dog[0].lemmas())
len(dog[0].lemmas())
for w in dog[0].lemmas():
    print(w)

wn.lemma('dog.n.01.dog')
wn.lemma('dog.n.01.dog').synset()

#######################################################

print(wn.synset('dog.n.02').definition())  # second noun sense of 'dog'; resolves to Synset('frump.n.01')

for i in wn.synsets('dog'):
    print(i)

print(wn.synset('frump.n.01').definition())
for i in wn.synsets('dog'):
    print(i.definition())
Example #43
0
def readFile():
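    # Reads NHS Choices sentences, runs semantic role labelling on each one via
    # the external `annotator`, looks up PropBank rolesets and WordNet
    # derivationally related forms for every predicate, resolves A0/A1 arguments
    # with WordNet.spotlightSearch, and stores the result via CreateGraphNeo4J.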
    input_file = open(
        "C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoices.txt",
        "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiagnosis.txt", "r")
    #input_file = open("C:\\Users\\Sergio\\Dropbox\\QMUL\\Data\\choicesNHS\\nhsChoicesDiabetesWhole.txt", "r")
    lines = input_file.readlines()
    input_file.close()

    annotationsX = []
    annotationsSLR = []
    annotationsNER = []

    for x in lines:

        annotationX = x
        annotationSLR = annotator.getAnnotations(x, dep_parse=True)['srl']
        #annotationNER = annotator.getAnnotations(x,dep_parse=True)['ner']
        annotationsX.append(annotationX)
        annotationsSLR.append(annotationSLR)
        #annotationsNER.append(annotationNER)

    size = len(annotationsSLR)
    print size

    A0 = 0
    A1 = 0
    pbroles = []
    annotationsA0 = []
    annotationsA1 = []

    for an in range(5):
        print annotationsX[an]
        print annotationsSLR[an]
        sizeIn = len(annotationsSLR[an])
        #print sizeIn
        for an2 in range(sizeIn):

            print "--------------------------------------------------------------------------------------------------------"

            print annotationsSLR[an][an2]["V"]
            w = Word(annotationsSLR[an][an2]["V"]).lemmatize("v")
            #print w
            #print wn.synset(w+'.v.01')

            try:
                for role in propbank.roleset(w + '.01').findall("roles/role"):
                    print(role.attrib['f'], role.attrib['n'],
                          role.attrib['descr'])
                    pbroles.append(role.attrib['descr'])
                #for role in propbank.roleset(w+'.01').findall("aliases/alias"):
                #print(role.attrib['framenet'], role.attrib['pos'], role.attrib['verbnet'])
            except:
                pass

            try:
                print(
                    wn.lemma(w + '.v.01.' + w).derivationally_related_forms())
            except:
                pass

            if "A0" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A0"]
                A0 = annotationsSLR[an][an2]["A0"]
                #try:
                #A0 = TextBlob(A0, np_extractor=extractor)
                #A0 = A0.noun_phrases[0]
                #print A0
                #except:
                #pass
                try:
                    annotationsA0 = WordNet.spotlightSearch(A0)
                    annotationsA0 = annotationsA0[0].get('URI')
                except:
                    annotationsA0 = "unknown"
                    pass

            if "A1" in annotationsSLR[an][an2]:
                print annotationsSLR[an][an2]["A1"]
                A1 = annotationsSLR[an][an2]["A1"]
                #try:
                #A1 = TextBlob(A1, np_extractor=extractor)
                #A1 = A1.noun_phrases[0]
                #print A1
                #except:
                #pass
                try:
                    annotationsA1 = WordNet.spotlightSearch(A1)
                    annotationsA1 = annotationsA1[0].get('URI')
                except:
                    annotationsA1 = "unknown"
                    pass

            print pbroles

            print "--------------------------------------------------------------------------------------------------------"

            CreateGraphNeo4J.createGraph(w, A0, A1, pbroles, annotationsA0,
                                         annotationsA1)
            del pbroles[:]
            annotationsA0 = []
            annotationsA1 = []
            A0 = 0
            A1 = 0
Example #44
0
import nltk

nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

motorcar = wn.synsets('motorcar')
print('synsets that motorcar belongs to: ' + repr(motorcar))
cars = wn.synset('car.n.01')
print('synset of car sense 1: ' + str(cars))

print('car sense 1 lemma names: ' + repr(cars.lemma_names()))
print('car sense 1 definition: ' + cars.definition())
print('car sense 1 example sentences: ' + repr(cars.examples()))
car_lemmas = cars.lemmas()
print('car sense 1 lemmas: ' + repr(car_lemmas))

automobile = wn.lemma('car.n.01.automobile')
print('synset of automobile (car sense 1): ' + str(automobile.synset()))
print('name of the automobile lemma: ' + automobile.name())

all_noun_synsets = wn.all_synsets('n')
print('number of noun synsets: ' + str(len(list(all_noun_synsets))))

car_synsets = wn.synsets('car')
print('synsets that car belongs to: ' + repr(car_synsets))
for synset in car_synsets:
	print(str(synset) + ' ' + repr(synset.lemma_names()))

print('synsets in which car is a lemma: ' +  repr(wn.lemmas('car')))

motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
Example #45
0
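# Fragment: the tail of get_related_lemmas() (collecting lemmas from meronym
# synsets) plus fingerprint generation. read_lemmas(), set_bit() and the
# wordnet import are assumed to be defined earlier in the full source file.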
		for l in s.lemmas():
			ls.add(l)
	for s in lemma.synset().substance_meronyms():
		for l in s.lemmas():
			ls.add(l)

	return ls


def generate_fp(lemma, lemmas):
	fp = set()
	ls = get_related_lemmas(lemma, lemmas)

	for l in ls:
		set_bit(fp, lemmas, l)

	return fp


if __name__ == "__main__":
	lemmas, lemmas_list = read_lemmas('vocab_lemmas.txt')

	N = len(lemmas)

	with open('fingerprints.txt', 'w') as f:
		for lookup in lemmas_list:
			lemma = wn.lemma(lookup)
			print lemmas[lookup], lemma
			fp = generate_fp(lemma, lemmas)
			f.write(lookup + ":" + str(fp) + "\n")
Example #46
0
from nltk.corpus import swadesh

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])

from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')

from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()

wn.synset('car.n.01').definition()
wn.synset('car.n.01').examples()

wn.synset('car.n.01').lemmas()
wn.lemma('car.n.01.automobile')
wn.lemma('car.n.01.automobile').synset()
wn.lemma('car.n.01.automobile').name()

wn.synsets('car')
for synset in wn.synsets('car'):
    print(synset.lemma_names())
wn.lemmas('car')

motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[0]
sorted(lemma.name() for synset in types_of_motorcar
       for lemma in synset.lemmas())

motorcar.hypernyms()

wn.synset('car.n.01').definition()

wn.synset('car.n.01').examples()

wn.synset('car.n.01').lemmas()

print(wn.lemma('car.n.01.automobile'))
print(wn.lemma('car.n.01.automobile').synset())
print(wn.lemma('car.n.01.automobile').name())

# the word car is ambiguous, having five synsets:
wn.synsets('car')

for synset in wn.synsets('car'):
    print(synset.lemma_names())

Example #48
0
#!/usr/bin/python
#===================================================================
# This codelet reads the vocabulary lemmas and verifies that each
# is found in NLTK WordNet.  Some lemmas in WordNet cannot be looked
# up because of parsing errors due to dots (.) in the lemma name.
# Copyright 2014, IEEE ENCS Humanoid Robot Project
#===================================================================

from nltk.corpus import wordnet as wn

with open('vocab_lemmas.txt', 'r') as f:
	for line in f:
		try:
			wn.lemma(line.strip()) # will blow up if line isn't a lemma
		except:
			print line.strip()


Example #49
0
from nltk.corpus import wordnet as wn

# three relations: part, substance, member

print wn.synset('tree.n.01').part_meronyms()
#burl, crown, stump, trunk, limb is part of tree
print wn.synset('tree.n.01').substance_meronyms()
#heartwood and sapwood is substance of tree
print wn.synset('forest.n.01').member_meronyms()
#the member of forest is tree

print wn.synset('trunk.n.01').part_holonyms()
#tree
print wn.synset('heartwood.n.01').substance_holonyms()
#tree
print wn.synset('tree.n.01').member_holonyms()
#forest

for synset in wn.synsets('mint', wn.NOUN):
    print synset.name() + ':', synset.definition()

#batch.n.02: (often followed by `of') a large number or amount or extent
#mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
#mint.n.03: any member of the mint family of plants
#mint.n.04: the leaves of a mint plant used fresh or candied
#mint.n.05: a candy that is flavored with a mint oil
#mint.n.06: a plant where money is coined by authority of the government

print wn.synset('eat.v.01').entailments()
#eat entails swallow and chew
print wn.lemma('supply.n.02.supply').antonyms()
#supply vs demand