import copy
import os
import re

from wordfreq import zipf_frequency

from pywsd.lesk import adapted_lesk
from pywsd.utils import lemmatize
from pywsd.allwords_wsd import disambiguate

# Expects three module-level globals to be set up by the caller:
#   st        - a Stanford NER tagger instance (st.tag() is used below)
#   stops     - a collection of stopwords to filter out
#   zipf_freq - an iterable of Zipf-frequency thresholds to sweep over

def wsd(summpath, wsdpath):
    summpathlist = os.listdir(summpath)
    for idp, dirpath in enumerate(summpathlist[0:1]):
        print(idp)
        # debug: idp = 0; dirpath = summpathlist[idp]
        comppath = summpath + dirpath + '/'
        comppathlist = os.listdir(comppath)
        for idc, compathdir in enumerate(comppathlist):
            # debug: idc = 0; compathdir = comppathlist[idc]
            fpath = comppath + compathdir
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.readlines()
            # Strip surrounding whitespace and drop near-empty lines.
            text = [re.sub(r'^\s+|\s+$', '', x) for x in text if len(x) > 3]
            nertext = []
            sentencewsd = copy.deepcopy(text)
            for ids, sentence in enumerate(text):
                # debug: ids = 0; sentence = text[ids]
                word = sentence.split()
                word = [re.sub(r'[\,]', '', x) for x in word]
                word = [re.sub(r'(?<=\w)[\.]', '', x) for x in word]
                # Keep only tokens the NER tagger marks as non-entities ('O'),
                # then drop stopwords.
                ner = st.tag(word)
                ner = [x[0] for x in ner if x[1] == 'O']
                ner = [x for x in ner if x not in stops]
                sentence = ' '.join(ner)
                nertext.append(sentence)
            for zipf in zipf_freq:
                # debug: zipf = zipf_freq[0]
                textwsd = []
                for ids, sentence in enumerate(nertext):
                    # debug: ids = 0; sentence = nertext[ids]
                    # Assumes a disambiguate() variant that accepts a `zipf`
                    # frequency threshold (not part of stock pywsd).
                    ambiguity = disambiguate(sentence, adapted_lesk,
                                             keepLemmas=True, zipf=zipf)
                    for idy, syn in enumerate(ambiguity):
                        # debug: idy = 3; syn = ambiguity[idy]
                        if syn[2] is not None:
                            # Rank the synset's lemmas by word frequency; fall
                            # back to the shortest lemma if none has a
                            # recorded frequency.
                            syn_lemma = syn[2].lemma_names()
                            syn_lemma = [[zipf_frequency(x, 'en'), x] for x in syn_lemma]
                            syn_lemma = sorted(syn_lemma, reverse=True)
                            if syn_lemma[0][0] == 0:
                                syn_lemma = [[len(x[1]), x[1]] for x in syn_lemma]
                                syn_lemma = sorted(syn_lemma, reverse=False)
                            # Replace the original word with the chosen lemma
                            # (escaped, so regex metacharacters cannot break the substitution).
                            if lemmatize(syn[0].lower()) != syn_lemma[0][1]:
                                sentencewsd[ids] = re.sub(re.escape(syn[0]),
                                                          syn_lemma[0][1],
                                                          sentencewsd[ids])
                    textwsd.append(sentencewsd[ids])
                outDirectory = wsdpath + dirpath + '/'
                if not os.path.exists(outDirectory):
                    os.makedirs(outDirectory)
                fout = open(outDirectory + str(zipf) + '-' + compathdir, 'w', encoding='utf-8')
                # Newlines were stripped earlier, so add them back when writing.
                fout.writelines(x + '\n' for x in textwsd)
                fout.flush()
                fout.close()
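# A minimal driver sketch for wsd() above, not part of the original script.
# The Stanford NER model/jar paths, the input/output directories, and the
# zipf_freq thresholds are illustrative assumptions; wsd() only requires that
# `st`, `stops`, and `zipf_freq` exist as module-level globals.
if __name__ == '__main__':
    from nltk.corpus import stopwords
    from nltk.tag import StanfordNERTagger

    # Hypothetical model/jar locations; adjust to the local Stanford NER install.
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    stops = set(stopwords.words('english'))
    zipf_freq = [3.0, 4.0, 5.0]  # example Zipf-frequency thresholds to sweep

    wsd('summaries/', 'wsd-output/')  # hypothetical input/output directories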
# Copyright (C) 2014-2015 alvations
# URL:
# For license information, see LICENSE.md

from __future__ import print_function

import time

from nltk.corpus import brown

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.allwords_wsd import disambiguate

print("======== TESTING all-words lesk (`from_cache=True`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
    disambiguate(sentence, original_lesk)
    disambiguate(sentence, adapted_lesk, keepLemmas=True)
print('Disambiguating 10 brown sentences took {} secs'.format(time.time() - start))

print("======== TESTING all-words lesk (`from_cache=False`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True, from_cache=False)
    disambiguate(sentence, original_lesk, from_cache=False)
    disambiguate(sentence, adapted_lesk, keepLemmas=True, from_cache=False)
print('Disambiguating 10 brown sentences took {} secs'.format(time.time() - start))
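# A small helper sketch that factors out the repeated timing boilerplate above.
# The helper name and the choice of `n_sents` are illustrative, not part of the
# original test; it assumes disambiguate() also accepts from_cache=True explicitly.
def time_disambiguation(n_sents=10, from_cache=True):
    start = time.time()
    for sentence in brown.sents()[:n_sents]:
        sentence = " ".join(sentence)
        disambiguate(sentence, simple_lesk, prefersNone=True,
                     keepLemmas=True, from_cache=from_cache)
        disambiguate(sentence, original_lesk, from_cache=from_cache)
        disambiguate(sentence, adapted_lesk, keepLemmas=True,
                     from_cache=from_cache)
    return time.time() - start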
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize
from pywsd.allwords_wsd import disambiguate

print "======== TESTING all-words lesk ===========\n"
for sentence in brown.sents()[:10]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
    print disambiguate(sentence, original_lesk)
    print disambiguate(sentence, adapted_lesk, keepLemmas=True)
    print disambiguate(sentence, cosine_lesk, prefersNone=True)
    print
print

print "======== TESTING all-words path maxsim ===========\n"
print "This is going to take some time, have some coffee...\n"
for sentence in brown.sents()[0:1]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print disambiguate(sentence, max_similarity, similarity_option='path')
    print disambiguate(sentence, max_similarity, similarity_option='wup')
    print
from nltk import word_tokenize
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize
from pywsd.allwords_wsd import disambiguate

print("======== TESTING all-words lesk ===========\n")
for sentence in brown.sents()[:10]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print(disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True))
    print(disambiguate(sentence, original_lesk))
    print(disambiguate(sentence, adapted_lesk, keepLemmas=True))
    print(disambiguate(sentence, cosine_lesk, prefersNone=True))
    print()
print()

print("======== TESTING all-words path maxsim ===========\n")
print("This is going to take some time, have some coffee...\n")
for sentence in brown.sents()[0:1]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print(disambiguate(sentence, max_similarity, similarity_option='path'))
    print(disambiguate(sentence, max_similarity, similarity_option='wup'))
    print()
from string import punctuation

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize, penn2morphy
from pywsd.allwords_wsd import disambiguate

"""
This module tests for consistency between using disambiguate() and
calling the individual WSD functions directly.
"""

for sentence in brown.sents()[:100]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Uses POS info when WSD-ing.
    _, poss = zip(*pos_tag(word_tokenize(sentence)))
    tagged_sent = disambiguate(sentence, prefersNone=True, keepLemmas=True)
    for word_lemma_semtag, pos in zip(tagged_sent, poss):
        word, lemma, semtag = word_lemma_semtag
        if semtag is not None:
            # Changes POS to morphy POS.
            pos = penn2morphy(pos, returnNone=True)
            # WSD on lemma.
            assert simple_lesk(sentence, lemma, pos=pos) == semtag
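# A small standalone sketch in the same spirit as the loop above: with
# keepLemmas=True, disambiguate() yields (word, lemma, synset) triples, so the
# chosen sense for each lemma can be inspected directly. The example sentence
# is illustrative only.
example = "The bank can guarantee deposits will eventually cover future tuition costs."
for word, lemma, synset in disambiguate(example, prefersNone=True, keepLemmas=True):
    if synset is not None:
        print(word, lemma, synset, synset.definition())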