def semcor_sentences(self, labeled=True, original_sense=False):
    sentences = []
    for s in semcor.tagged_sents(tag="both"):
        triplets = []
        for chunk in s:
            sense = chunk.label()
            if original_sense:
                triplets += [(sense, w, self.clean_pos(w, p))
                             for (w, p) in chunk.pos()]
            else:
                triplets += [(self.check_sense(sense=sense, word=w,
                                               tag=self.clean_pos(w, p)),
                              w, self.clean_pos(w, p))
                             for (w, p) in chunk.pos()]
        senses, words, tags = zip(*triplets)
        if labeled:
            # senses are attached for testing
            sentence = Sentence(words=words, pos_tags=tags,
                                senses=self.clean_labels(senses))
        else:
            sentence = Sentence(words=words, pos_tags=tags)
        sentences.append(sentence)
    return sentences
def generate_semcor_data(filename):
    # Exclude the files containing verb-only annotations
    print("Loading semcor data from nltk, excluding verb-only annotation.")
    file_ids = list(filter(lambda k: 'brownv' not in k, semcor.fileids()))
    tagged_sents = semcor.tagged_sents(fileids=file_ids, tag='both')
    sent_words = []   # Sentences
    sent_labels = []  # Super-Sense Labels
    print("Writing semcor raw data to {}...".format(filename))
    i = 0
    f = open(filename, 'w')
    for sent in tagged_sents:
        i += 1
        words = []
        labels = []
        for tag in sent:
            _words = tag.leaves()
            _supersenses = get_supersenses(_words, tag.label())
            words = np.append(words, _words)
            labels = np.append(labels, _supersenses)
        sys.stdout.write('\rWrote {} examples.\r'.format(i))
        sys.stdout.flush()
        sentence = " ".join(words.flatten())
        tags = " ".join(labels.flatten())
        f.write("{} <||> {}\n".format(sentence, tags))
    f.close()
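# A hedged sketch, not part of the snippet above: generate_semcor_data() relies on an
# external get_supersenses() helper that is not shown. Assuming it maps a chunk's
# WordNet Lemma label to its lexicographer name ("supersense") and pads untagged
# chunks, a minimal version could look like this.
from nltk.corpus.reader.wordnet import Lemma

def get_supersenses(words, label):
    """Return one supersense tag per word in the chunk (illustrative only)."""
    if isinstance(label, Lemma):
        supersense = label.synset().lexname()  # e.g. 'noun.group'
    else:
        supersense = 'O'  # chunk carries no sense annotation
    return [supersense] * len(words)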
def parse(self):
    tagged_sents = semcor.tagged_sents(tag='sem')  # 'sem' gives the sense annotations
    sents = semcor.sents()
    # tagged_sents returns the senses of each word / group of words
    for sent, tag in zip(sents, tagged_sents):
        word_idx = 0
        for entry in tag:
            # check for no sense tag or multiword entries
            # TODO is it ok to exclude multiword entries?
            # chunks without a sense annotation come back as plain lists of words
            entry_len = len(entry.leaves()) if hasattr(entry, 'leaves') else len(entry)
            if hasattr(entry, 'label') and entry.label() and entry_len == 1 \
                    and type(entry.label()) != str:
                entry = entry.label().synset().name().split('.')
                if len(entry) == 3:  # check for a (word.pos.nn) entry
                    word, pos, sense = entry
                    num_senses = self.count_senses(word)
                    context = self.get_context(sent, word_idx)
                    new_ex = Example(context, word, self.parse_sense(sense),
                                     pos, num_senses)
                    # add to the data set
                    self.data.append(new_ex)
                    # TODO for now just take the first sense found in the sentence
                    break
            word_idx += entry_len  # one entry might span multiple words
def CollectSemcorSupersenses():
    oracle_matrix = collections.defaultdict(WordSupersenses)
    for sent in semcor.tagged_sents(tag='both'):
        for chk in sent:
            # only keep chunks whose tag looks like "lemma.NN" (a numbered sense)
            if chk.node and len(chk.node) > 3 and chk.node[-3] == '.' and chk.node[-2:].isdigit():
                if chk[0].node.startswith('N'):
                    pos = "n"
                elif chk[0].node.startswith('V'):
                    pos = "v"
                else:
                    continue
                lemmas = chk.node[:-3]
                wnsn = int(chk.node[-2:])
                ssets = wn.synsets(lemmas, pos)
                sorted_ssets = sorted(ssets, key=lambda x: x.name)
                filtered_ssets = None
                for lemma in lemmas.split("_"):
                    if not filtered_ssets or len(filtered_ssets) == 0:
                        filtered_ssets = filter(lambda x: lemma in x.name, sorted_ssets)
                    if filtered_ssets and len(filtered_ssets) > 0:
                        sorted_ssets = filtered_ssets
                try:
                    supersense = sorted_ssets[wnsn - 1].lexname  # e.g. 'noun.group'
                except:
                    # annotated sense number is out of range for this lemma
                    continue
                for lemma in lemmas.split("_"):
                    ssets = wn.synsets(lemma, pos)
                    if len(ssets) > 0:
                        if lemma.isdigit():
                            lemma = "0"
                        oracle_matrix[lemma].Add(supersense, "semcor")
    return oracle_matrix
def pickleDataSet(filestr):
    try:
        file = open(filestr, 'wb')
        filetext = open(filestr + '.txt', 'wb')
        filemap = open(filestr + '.map', 'wb')
        sents = semcor.tagged_sents(tag='both')
        ps = PorterStemmer()
        datalist = []
        wordaddressmap = {}

        def addWordAddress(word, linenum):
            if word not in wordaddressmap:
                wordaddressmap[word] = set([])
            st = wordaddressmap[word]
            st.add(linenum)

        def getWordAdresses(word):
            if word in wordaddressmap:
                return wordaddressmap[word]
            return None

        for i, s in enumerate(sents):
            sentence, sentencedata = getFeaturesInSentence(s, ps, debugSentIndex=i)
            datalist.append(sentencedata)
            filetext.write(sentence + '\n')
            for word in nltk.word_tokenize(sentence):
                addWordAddress(word, i)
        pickle.dump(datalist, file)
        pickle.dump(wordaddressmap, filemap)
    except pickle.PicklingError as pe:
        print(pe)
def __init__(self, **kwargs):
    self._sents = []
    self._tagged_sents = []
    self._semcor_file_ids = self._load_semcor_file_ids()
    self._processor = kwargs.get(
        'processor',
        lambda lexeme, definition, examples: (lexeme, definition, examples))
    for file_id in self._semcor_file_ids:
        self._sents.append(semcor.sents(file_id))
        self._tagged_sents.append(semcor.tagged_sents(file_id, 'both'))
def extract_semcor_sentences(count=50, start=0):
    semcor_sentences = []
    # Extract each SemCor sentence as a list of chunks, each one carrying
    # its own PoS tag
    for i, sentence in enumerate(semcor.tagged_sents(tag='both')[start:]):
        # Look for the nouns in the sentence
        nn_words = get_nn_words(sentence)
        if len(nn_words) > 0:
            semcor_sentences.append((i + start, sentence, nn_words))
        if len(semcor_sentences) == count:
            break
    return semcor_sentences
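# A hedged sketch, not part of the snippet above: get_nn_words() is external and not
# shown. Assuming it collects the sense-annotated noun chunks of a tagged sentence
# (Tree chunks whose label is a WordNet Lemma and whose inner tree is tagged 'NN'),
# a minimal version could be:
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma

def get_nn_words(tagged_sentence):
    nouns = []
    for chunk in tagged_sentence:
        if isinstance(chunk, Tree) and isinstance(chunk.label(), Lemma):
            if isinstance(chunk[0], Tree) and chunk[0].label() == 'NN':
                nouns.append(chunk)
    return nouns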
def get_semcor_sentences(data_size):
    sentences, senses = [], []
    for index in range(0, data_size):
        node_noun = None
        for node in semcor.tagged_sents(tag='both')[index]:
            # keep the first sense-annotated noun of the sentence
            if isinstance(node.label(), Lemma) and node[0].label() == 'NN':
                node_noun = node
                break
        if node_noun:
            senses.append(node_noun)
            sentences.append(" ".join(semcor.sents()[index]))
    return sentences, senses
def prepareDataSet():
    sents = [[re.split(r'\(', str(c)) for c in s]
             for s in semcor.tagged_sents(tag='both')[:10]]
    ps = PorterStemmer()
    siz = int(len(sents) * 0.9)
    trainList = []
    testList = []
    for s in sents[:siz]:
        trainList.extend(getFeaturesInSentence(s, ps))
    for s in sents[siz:]:
        testList.extend(getFeaturesInSentence(s, ps))
    train_set = nltk.classify.util.apply_features(extract_features, trainList, labeled=False)
    test_set = nltk.classify.util.apply_features(extract_features, testList, labeled=False)
    return train_set, test_set
def semcor_extraction() -> tuple:
    sentences = []
    extracted = []
    for i in range(0, 10):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))
        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))
    return sentences, extracted
def establich_signature(self):
    signatures = {}
    for s in semcor.tagged_sents(tag='both'):
        words = []
        for tree in s:
            if tree.label().__class__.__name__ == 'Lemma':
                words.append(tree.label().name())
        words = set(words)
        print(words)
        for tree in s:
            if tree.label().__class__.__name__ == 'Lemma':
                if tree.label().synset().name() not in signatures.keys():
                    signatures[tree.label().synset().name()] = set()
                signatures[tree.label().synset().name()] |= words
    with open('signatures.pickle', 'wb') as f:
        pickle.dump(signatures, f)
def create_semcor_data_files(length):
    print("loading semcor...")
    sentences = semcor.chunk_sents()
    senses = [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:length]]
    with open('data/sense.pkl', 'wb') as outfile:
        pickle.dump(senses, outfile, pickle.HIGHEST_PROTOCOL)
    print("semcor loaded")
    if length != -1:
        return sentences[:length]
    else:
        return sentences
def semcor_extraction(sentence_number=50):
    sentences = []
    extracted = []
    for i in range(0, sentence_number):
        # Extract the nouns from sentence i
        nouns = list(filter(
            lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
            and sentence_tree[0].label() == "NN",
            semcor.tagged_sents(tag='both')[i]))
        # Pick a random noun from the nouns list and remove it from sentence i
        if nouns:
            lemma = select_lemma(nouns).label()
            extracted.append(lemma)
            sentence = " ".join(semcor.sents()[i])
            sentences.append(remove_word(sentence, lemma.name()))
    return sentences, extracted
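# A hedged usage sketch: select_lemma() and remove_word() above are the snippet's own
# helpers, so only the way the returned pair might be consumed is shown here.
if __name__ == '__main__':
    sentences, extracted = semcor_extraction(sentence_number=5)
    for sentence, lemma in zip(sentences, extracted):
        print("{} -> removed noun '{}'".format(sentence, lemma.name()))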
def loadSemcorSections(self):
    """
    Loads semcor sections into two lists, one of plain sentences and one of
    tagged sentences.

    Returns:
        A dictionary with keys 'chunks' and 'sentences' whose values are a
        list of tagged semcor sentences and a list of untagged semcor
        sentences.
    """
    sentencesGroupedBySense = defaultdict(list)
    listOfFileIds = semcor.fileids()
    listOfChunks = []
    listOfSentences = []
    for fileId in listOfFileIds:
        listOfChunks.append(semcor.tagged_sents(fileId, 'both'))
        listOfSentences.append(semcor.sents(fileId))
    listOfChunks = self.removeLevelsOfListWithinList(listOfChunks)
    listOfSentences = self.removeLevelsOfListWithinList(listOfSentences)
    semcorData = {'chunks': listOfChunks, 'sentences': listOfSentences}
    return semcorData
def load(word):
    train_instances = []
    for sentence in semcor.tagged_sents(tag='sem')[:]:
        context = get_context(sentence)
        for el in sentence:
            if type(el) is Tree:
                lemm = ' '.join(el.leaves())
                if word is not None and lemm != word:
                    continue
                try:
                    golden_key = el.label().key()
                except AttributeError:
                    continue
                one_instance = custom_WSDInstance(lemm, context, golden_key)
                train_instances.append(one_instance)
    return train_instances
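# A hedged sketch, not part of the snippet above: get_context() is external and not
# shown. Assuming the context is simply the bag of lower-cased word forms of the
# sentence, a minimal version could be:
from nltk.tree import Tree

def get_context(tagged_sentence):
    words = []
    for el in tagged_sentence:
        if isinstance(el, Tree):
            words.extend(leaf.lower() for leaf in el.leaves())
        elif isinstance(el, list):
            # chunks without a sense annotation come back as plain lists of strings
            words.extend(w.lower() for w in el)
        else:
            words.append(str(el).lower())
    return set(words)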
def semcor_extraction(sentence_number: int = 50) -> tuple:
    """
    Extracts `sentence_number` sentences from the SemCor corpus and, from each
    of them, also extracts a random noun.

    :return: a tuple (extracted sentences list, extracted nouns list)
    """
    sentences = []
    extracted = []
    for i in range(0, sentence_number):
        elem = list(
            filter(
                lambda sentence_tree: isinstance(sentence_tree.label(), Lemma)
                and sentence_tree[0].label() == "NN",
                semcor.tagged_sents(tag='both')[i]))
        if elem:
            extracted.append(random.choice(elem))
            sentences.append(" ".join(semcor.sents()[i]))
    return sentences, extracted
def process_semcor():
    print 'semcor'
    from nltk.corpus import semcor
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_semcor_labelled_tmp.txt'
    for f in semcor.fileids():
        sents = semcor.sents(f)
        tsents = semcor.tagged_sents(f, 'sem')
        for i in range(len(sents)):
            sent = sents[i]
            if word in sent:
                if sen1 in str(tsents[i]):
                    appendToFile(file_name, sentToStr(sent, '+'))
                elif sen2 in str(tsents[i]):
                    appendToFile(file_name, sentToStr(sent, '-'))
                else:
                    appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
    print count
# creating a table containing the results
results = PrettyTable()
results.add_column("Original Sentences", original_sentences)
results.add_column("Ambiguous Word", words_to_analyze)
results.add_column("Chosen Synset", choosen_synsets)
results.add_column("New Sentence", new_sentences)

# write the table to a file
table_txt = results.get_string()
with open('./output/Output Word Disambiguation.txt', 'w') as file:
    file.write(table_txt)

# ----------------------------------------------------- SEMCOR TESTING -------------------------------------------#
# getting already semantically tagged sentences from the semcor corpus
sem_tagged_sentences = sc.tagged_sents(tag='sem')[1:50]
# getting the same sentences untagged
semcor_sentences = sc.sents()[1:50]
# getting already pos tagged sentences from the semcor corpus
pos_tagged_sentences = sc.tagged_sents(tag='pos')[1:50]

semtest_results = ts.semcorDisambiguation(sem_tagged_sentences, semcor_sentences, pos_tagged_sentences)
semtest_sample = semtest_results[0][1:10]
table_txt_semcor = semtest_results[0].get_string()
with open('./output/Output Semcor Testing.txt', 'w') as file:
    file.write(table_txt_semcor + "\r\n")
    file.write("Accuracy: " + str(semtest_results[1]) + "%")
sample_text = semtest_sample[0].get_string()
# Over 50 runs the average accuracy was 41.32%
## Extract sentences for different senses of the lemmas specified below from SemCor
targets = (("capital", "n"), ("interest", "n"), ("motion", "n"), ("plant", "n"),
           ("space", "n"), ("suit", "n"), ("tank", "n"), ("vessel", "n"))  # naturally ambiguous nouns from Schuetze (1998)
## "ruling" (2nd sense = verb gerund) and "train" (noun vs. verb) have been excluded
targets += (("bank", "n"), ("hand", "n"), ("room", "n"))  # some other interesting ambiguous nouns
targets += (("find", "v"), ("grasp", "v"), ("open", "v"), ("run", "v"))  # try some verbs

import sys
import nltk
from nltk.corpus import semcor, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

wnl = WordNetLemmatizer()

files = [id for id in semcor.fileids() if not id.startswith("brownv")]
# files = "brown1/tagfiles/br-j05.xml"  # for testing
sents_sem = semcor.tagged_sents(fileids=files, tag="sem")
sents_pos = semcor.tagged_sents(fileids=files, tag="pos")
# sents_sem = sents_sem[10:13]  # for testing
# sents_pos = sents_pos[10:13]

pos2wn = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

print("id\tsid\ttarget\tpos\tsense\tgloss\tsentence\thw\tlemma")
s_num = 0
item_num = {t: 0 for t in targets}
for s_sem, s_pos in zip(sents_sem, sents_pos):
    s_num += 1
    lemmas = [x.label() for x in s_sem if isinstance(x, nltk.tree.Tree)]  # annotated WordNet lemmas (= synset.hw)
    lemmas = [x for x in lemmas if isinstance(x, nltk.corpus.reader.wordnet.Lemma)]  # skip entries where the sense isn't a Lemma object
try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
except ImportError:
    comm = None

if feature:
    if feature == 'ngram':
        featurevocab = ngram_vocab(n)
        with open(corpusfile, 'r') as f:
            matrix, featurecounts, wordcounts = cooc_matrix(
                (line.split() for line in f), featurevocab, vocab,
                n=n, unk=UNK, verbose=True, comm=comm)
    elif feature == 'synset':
        featurevocab = synset_vocab()
        matrix, featurecounts, wordcounts = cooc_matrix(
            semcor.sents(), featurevocab, vocab,
            doc2wnd=synset_context(iter(semcor.tagged_sents(tag='sem'))),
            unk=UNK, interval=100, verbose=True, wndo2=None)
        featurevocab = [synset.name() for synset in featurevocab]
    else:
        raise NotImplementedError
    if comm is None or not comm.rank:
        sp.save_npz(outputroot + '.npz', matrix)
        with open(outputroot + '.pkl', 'wb') as f:
            pickle.dump({'words': vocab,
                         feature + 's': featurevocab,
                         'wordcounts': wordcounts,
                         feature + 'counts': featurecounts}, f)
    else:
        sys.exit()
else:
    with open(corpusfile, 'r') as f:
        matrix, counts = symmetric_cooc_matrix(
            (line.split() for line in f), vocab, unk=UNK, verbose=True, comm=comm)
        l_hypos = []
        for hypo_synset in synset.hyponyms():
            word = hypo_synset.name().split('.')[0]
            if word not in l_hypos:
                l_hypos.append(word)
        if l_hypos:
            random_index = random.randint(0, len(l_hypos) - 1)
            new_sentence.append(l_hypos[random_index])
        else:
            for w in word:
                new_sentence.append(w)
    print(' '.join(new_sentence))


if __name__ == "__main__":
    args = parse_command_line()
    l_sentence = semcor.sents()[args.index]
    sentence = ' '.join(l_sentence)
    print(sentence)
    s = semcor.tagged_sents(tag='sem')[args.index]
    # random.seed(a=0)
    if args.nym == 'synonym':
        print_synonym_sentence(s)
    elif args.nym == 'hypernym':
        print_hypernym_sentence(s)
    elif args.nym == 'hyponym':
        print_hyponym_sentence(s)
def process_semcor(ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations and
    information about the detected heteronyms.
    '''
    sents = semcor.sents()
    tagged_sents = semcor.tagged_sents(tag='sem')
    sense_list = list(ref_dict['sense'])
    semcor_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'])]['word'])
    for sent_idx, sent in enumerate(tagged_sents):
        het_in_sent = []
        for token_idx, token in enumerate(sent):
            if type(token) == nltk.Tree:
                lemma = token.label()
                chunk = token.leaves()
                ## Check whether the token is a heteronym
                if (type(lemma) == nltk.corpus.reader.wordnet.Lemma) and (
                        lemma.synset() in sense_list) and (len(chunk) == 1):
                    synset = lemma.synset()
                    word = chunk[0]
                    ## Take care of sense-duplicated heteronyms (rare),
                    ## e.g. "project" and "projects" can have the same sense but different pronunciations.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]['pronunciation'])
                        if pron:
                            het_in_sent.append((word.lower(), synset, pron[0]))
                    ## If the sense is not duplicated, the mapping to a pronunciation is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]['pronunciation'])[0]
                        word_in_ref = list(ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_sent.append((word_in_ref, synset, pron))
        if het_in_sent:
            new_row = {
                'sentence': "".join([
                    " " + i if not i.startswith("'") and i not in string.punctuation else i
                    for i in sents[sent_idx]
                ]).strip(),
                'citation': 'SemCor',
                'heteronym': het_in_sent
            }
            semcor_sents = semcor_sents.append(new_row, ignore_index=True)
    return semcor_sents
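# A hedged usage sketch: the exact schema of ref_dict is an assumption. process_semcor()
# above indexes the columns 'word', 'sense' (WordNet Synset objects) and 'pronunciation',
# so an illustrative reference table and call might look like this (note it scans the
# whole SemCor corpus, so it is slow).
import pandas as pd
from nltk.corpus import wordnet as wn

bass_senses = wn.synsets('bass', pos='n')        # all noun senses of "bass"
ref_dict = pd.DataFrame({
    'word': ['bass', 'bass'],
    'sense': [bass_senses[0], bass_senses[-1]],  # two different senses, purely illustrative
    'pronunciation': ['B EY1 S', 'B AE1 S'],     # ARPAbet-style strings, illustrative
})
heteronym_df = process_semcor(ref_dict)
print(heteronym_df.head())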
def main2(filename):
    # csv fieldnames
    fieldnames = [
        'word', 'lemma', 'pos', 'is_homonym', 'wn_synset', 'ws_meaning',
        'confidence', 'nsenses'
    ]
    print('Reading {}...'.format(filename))
    outfile = open(os.path.join(OUTDIR, filename + '.tsv'), 'w')
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    for i, sent in enumerate(semcor.tagged_sents(tag='both')):
        if i % 100 == 0:
            print('{} sentences read'.format(i))
        for t in sent:
            if type(t) == str:
                # no lemma
                continue
            word = t.flatten()[0].lower()
            pos = t.pos()[0][1]
            if pos != 'NN' and not pos.startswith('V'):
                # t is not a noun or a verb
                continue
            lemma = t.label().name()
            wn_synset = t.label().synset()
            nsenses = len(wn.synsets(word))
            def_bag = set([stem(w.lower()) for w in wn_synset.definition().split()])
            urls = homonym_urls(lemma)
            row = {}
            if urls == []:
                row['is_homonym'] = False
                #row['ws_meaning'] = 1
                #row['confidence'] = 1
            else:
                if pos == 'NN':
                    pos_ = 'noun'
                elif pos.startswith('V'):
                    pos_ = 'verb'
                meaning_bags = [meaning_bag(pos_, url) for url in urls]
                confidence = []  # list of overlap sizes between bags
                for mb in meaning_bags:
                    confidence.append(len(mb & def_bag))
                row['is_homonym'] = True
                row['ws_meaning'] = confidence.index(max(confidence))
                row['confidence'] = '|'.join(str(x) for x in confidence)
            row['word'] = word
            row['lemma'] = lemma
            row['pos'] = pos
            row['wn_synset'] = wn_synset
            row['nsenses'] = nsenses
            writer.writerow(row)
    outfile.close()
def convertPoS(posToConvert):
    if 'NN' in posToConvert:
        return 'Noun'
    elif 'VB' in posToConvert:
        return 'Verb'
    elif posToConvert == 'RB':
        return 'Adverb'
    elif posToConvert == 'JJ':
        return 'Adjective'


semcorFileIds = semcor.fileids()
count = 0
wordSynsetCount = defaultdict(int)
wordCount = defaultdict(int)
for fileID in semcorFileIds:
    for tagged_sent in semcor.tagged_sents(fileID, 'both'):
        for tree in tagged_sent:
            if type(tree.label()) is Lemma:
                synset = tree.label().synset()
                for wordTuple in tree.pos():
                    wordPoS = convertPoS(wordTuple[1])
                    wordLowercase = wordTuple[0].lower()
                    wordCount[wordLowercase + " " + wordPoS] += 1
                    wordSynsets = wn.synsets(wordTuple[0])
                    if synset in wordSynsets:
                        wordSynsetCount[wordLowercase + " " + str(synset)] += 1

with open('semcorWordFreqCount', 'wb') as f:
    pickle.dump(wordCount, f)
with open('semcorWordSenseCount', 'wb') as f:
    pickle.dump(wordSynsetCount, f)
import collections

from nltk.corpus import wordnet as wn
from nltk.corpus import semcor

wn_lemmas = set()
for lemma in wn.all_lemma_names(pos=wn.ADJ):
    wn_lemmas.add(lemma)

wn_adj_synsets = collections.defaultdict(set)
for word in wn_lemmas:
    for synset in wn.synsets(word, wn.ADJ):
        wn_adj_synsets[synset.name.lower()] = [lemma.lower() for lemma in synset.lemma_names]

semcor_adjectives = set()
i = 0
for sent in semcor.tagged_sents(tag='both'):
    for c, chk in enumerate(sent):
        if chk.node and len(chk.node) > 3 and chk.node[-3] == '.' and chk.node[-2:].isdigit() and chk[0].node.startswith('JJ'):
            if len(chk.leaves()) == 1:
                semcor_adjectives.add(chk.leaves()[0].lower())

semcor_synsets = set()
for s, words in wn_adj_synsets.items():
    for w in words:
        if w in semcor_adjectives:
            semcor_synsets.add(s.lower())

vectors = set()
vector_adj_file = open("data/VSM/eacl14-faruqui-en-svd-de-64.adj", "w")
for line in open("data/VSM/eacl14-faruqui-en-svd-de-64.adj.txt"):
        elif ematrix[i][j] == 'SUB':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
            seq.append('SUB')
            i -= 1
            j -= 1
        elif ematrix[i][j] == 'INS':
            seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
            seq.append('INS')
            j -= 1
        elif ematrix[i][j] == 'DEL':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
            seq.append('DEL')
            i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    # Parse arguments and load the semcor sentences
    args = parse_command_line()
    l_sentence1 = semcor.sents()[args.untagged]
    l_sentence2 = semcor.sents()[args.tagged]
    # Print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))
    s2 = semcor.tagged_sents(tag='sem')[args.tagged]
    wordnet_edit_distance(l_sentence1, s2, args.sim)
    line = line.strip()
    if line != '':
        sent.append(line.split('\t'))
    else:
        if len(sent) > 0:
            supsense_sentences.append(sent)
            sent = []
            c += 1

print(supsense_sentences)
print(c)
# print(semcor.words())
# print(semcor.chunks())

i = 0
semcor_sents = semcor.tagged_sents(tag='both')
# print(semcor_sents)
with open('semcor_all.conll', 'w') as f:
    for sent in semcor_sents:
        if i == len(supsense_sentences):
            break
        ref_sent = supsense_sentences[i]
        for j, ch in enumerate(sent):
            # print(ch)
            # f.write(str(ch) + '\n')
            rt = ch.label()
            sense = 'O'
            if not isinstance(ch[0], str):
                sense = rt
                if isinstance(ch[0][0], str):
                    pos = ch[0].label()
from nltk.corpus import conll2000, conll2002

print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# SEMCOR
from nltk.corpus import semcor

print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]

# IEER
from nltk.corpus import ieer

ieer.fileids()  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS
import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor
from nltk.corpus import wordnet

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()
tags = semcor.tagged_sents(tag='sem')

n = 0
correct = 0
base = 0
total = 0
for sent in corp:
    sentence = ' '.join(sent)
    print sentence
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    word = str(sentence_chunk[0])
    descriptor = str(sentence_chunk.label())
    if word in lemmas:
        lemmas[word][descriptor] = lemmas[word][descriptor] + 1 if descriptor in lemmas[word] else 1
    else:
        # this else statement prevents KeyError lookups on lemmas[word][synset]
        lemmas[word] = {descriptor: 1}
    return word


print("Importing Lemma and Synsets")
lemmas = dict()
# lemmas is a dict of dicts:
# lemmas[word] = dictionary of { synset: frequency of the synset when associated with 'word' }
# lemmas[word][synset] is a count of how many times a synset appears for each word
# *** len(lemmas[word]) = the number of different senses 'word' has in the corpus

taggedsentences = semcor.tagged_sents(tag='both')  # all sentences, fully tagged, from SemCor
plaintextsentences = semcor.sents()  # all sentences from SemCor
targetsentences = {}  # sentences containing 'point'
pos = dict()  # list of part-of-speech tags from the corpus
max_sentence_len = 0
lemmacount = {}

# find all sentences including exactly one occurrence of 'back';
# not all of these sentences are related to the synsets we are looking for,
# e.g. "goes back" relates to the verb "go" rather than to "back"
for i, s in enumerate(plaintextsentences):
        elif ematrix[i][j] == 'SUB':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j - 1]))
            seq.append('SUB')
            i -= 1
            j -= 1
        elif ematrix[i][j] == 'INS':
            seq.append(str(cmatrix[i][j] - cmatrix[i][j - 1]))
            seq.append('INS')
            j -= 1
        elif ematrix[i][j] == 'DEL':
            seq.append(str(cmatrix[i][j] - cmatrix[i - 1][j]))
            seq.append('DEL')
            i -= 1
    seq = ' '.join(reversed(seq))
    print(seq)


if __name__ == '__main__':
    # Parse arguments and load the semcor sentences
    args = parse_command_line()
    l_sentence1 = semcor.sents()[args.index1]
    l_sentence2 = semcor.sents()[args.index2]
    # Print sentence1 and sentence2
    print(' '.join(l_sentence1))
    print(' '.join(l_sentence2))
    s1 = semcor.tagged_sents(tag='sem')[args.index1]
    s2 = semcor.tagged_sents(tag='sem')[args.index2]
    wordnet_edit_distance(s1, s2, args.sim)