def normalize_word(word, words=None, stemmer=None, lemmatizer=None):
    '''Normalize a word to its stem form; if the result is not a dictionary word, return ''.'''
    # initialize once
    global WORDS, STEMMER, LEMMATIZER
    if words is None:
        if WORDS is None:
            # materialise the lazy wordnet.words() generator so membership tests
            # stay correct (and fast) across repeated calls
            WORDS = set(wordnet.words())
        words = WORDS
    if stemmer is None:
        if STEMMER is None:
            STEMMER = LancasterStemmer()
        stemmer = STEMMER
    if lemmatizer is None:
        if LEMMATIZER is None:
            LEMMATIZER = WordNetLemmatizer()
        lemmatizer = LEMMATIZER
    if word in words:
        return word
    temp = lemmatizer.lemmatize(word)
    if temp in words:
        return temp
    temp = lemmatizer.lemmatize(word, pos='v')
    if temp in words:
        return temp
    temp = stemmer.stem(word)
    if temp in words:
        return temp
    return ''
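# Hedged usage sketch for normalize_word above: the module-level names it relies on
# (WORDS, STEMMER, LEMMATIZER) are assumed to start out as None so the first call can
# initialise them lazily.
from nltk.corpus import wordnet
from nltk.stem import LancasterStemmer, WordNetLemmatizer

WORDS, STEMMER, LEMMATIZER = None, None, None

print(normalize_word('dogs'))   # noun lemmatisation should give back 'dog'
print(normalize_word('qzwxv'))  # not a dictionary word, so '' is expected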
def setup():
    try:
        all_words = wn.words()
        print("have wordnet, all is well")
    except LookupError:
        print("don't have wordnet, downloading")
        nltk.download('wordnet')
        all_words = wn.words()
    global WORDS
    WORDS = [w for w in all_words if w[0].isalpha() and "_" not in w]
def lemmatize_if_needed(self, t):
    if t in wn.words() and wn.synsets(t)[0].pos() == 'n':
        logging.debug('found in wn : {}'.format(t))
        return t
    if t[-3:] == 'ies':
        t = t[:-3] + 'y'
    if t[-1:] == 's':
        t = t[:-1]
    if t in wn.words() and wn.synsets(t)[0].pos() == 'n':
        logging.debug('found in wn : {}'.format(t))
        return t
    return None
def exercise4():
    # Most of WordNet's senses fall into four POS categories: noun, verb, adjective, and adverb.
    # Determine the percentage of words from the WordNet corpus that have senses in more than
    # one of these categories. For example, "type" has both noun and verb senses (positive case),
    # whereas "typewriter" has only noun senses (negative case).
    # get all words in WordNet
    wn_words = [w for w in wn.words()]  # list of words
    wn_text = nltk.Text(word.lower() for word in wn_words)  # plain text
    # print(wn_text)
    pos_tags = nltk.pos_tag(wn_text)  # (word, tag) pairs
    # map Penn Treebank tags onto the four WordNet categories
    wn_pos_tags = []
    for pos_tag in pos_tags:
        if pos_tag[1].startswith('J'):
            wn_pos_tags.append((pos_tag[0], 'ADJ'))
        if pos_tag[1].startswith('V'):
            wn_pos_tags.append((pos_tag[0], 'VERB'))
        if pos_tag[1].startswith('N'):
            wn_pos_tags.append((pos_tag[0], 'NOUN'))
        if pos_tag[1].startswith('R'):
            wn_pos_tags.append((pos_tag[0], 'ADV'))
    count = 0
    data = nltk.ConditionalFreqDist((word, tag) for (word, tag) in wn_pos_tags)
    for word in data.conditions():
        if len(data[word]) > 1:  # senses in more than one category
            count = count + 1
            tags = data[word].keys()
            print(word, ' '.join(tags))
    # divide by the number of distinct words, not the number of (word, tag) pairs
    print("The proportion of words that have senses in more than one category:",
          count / len(data.conditions()))
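# A small sketch of the ConditionalFreqDist counting idea used in exercise4 above,
# run on a hand-made toy list of (word, category) pairs instead of the full WordNet vocabulary.
import nltk

toy_tags = [('type', 'NOUN'), ('type', 'VERB'), ('typewriter', 'NOUN')]
cfd = nltk.ConditionalFreqDist(toy_tags)
multi = [w for w in cfd.conditions() if len(cfd[w]) > 1]
print(multi)                               # expected: ['type']
print(len(multi) / len(cfd.conditions()))  # expected: 0.5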
def main(args):
    target = wn.synset(args.target)
    print('target:', args.target)
    words = wn.words()
    nouns = set([])
    for word in words:
        nouns.update(wn.synsets(word, pos='n'))
    print(len(nouns), 'nouns')
    hypernyms = []
    for noun in nouns:
        paths = noun.hypernym_paths()
        for path in paths:
            try:
                pos = path.index(target)
                for i in range(pos, len(path) - 1):
                    hypernyms.append((noun, path[i]))
            except Exception:
                continue
    hypernyms = list(set(hypernyms))
    print(len(hypernyms), 'hypernyms')
    if not args.shuffle:
        random.shuffle(hypernyms)
    with open(args.result_file, 'w') as fout:
        for n1, n2 in hypernyms:
            print(n1.name(), n2.name(), sep=args.sep, file=fout)
def checkgibberish_words(userInput):
    '''Check whether every word of the input sentence is a valid English word.'''
    # Remove punctuation
    # make translator object
    translator = str.maketrans('', '', string.punctuation)
    userInput = userInput.translate(translator)
    # Remove digits from the input
    userInput_NoDigits = re.sub(r'\d+', '', userInput)
    wnl = WordNetLemmatizer()
    gibberishWord = False
    for word in userInput_NoDigits.split():
        chkWord = word in words.words()
        if not chkWord:
            chkWordNet = word in wordnet.words()
            if not chkWordNet:
                lemma = wnl.lemmatize(word, 'n')
                # WordNet and the words corpus do not include plural forms,
                # hence checking for plurals through WordNetLemmatizer
                plural = word != lemma
                if not plural:
                    gibberishWord = True
    return gibberishWord
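# Usage sketch for checkgibberish_words above, with the imports its module is assumed to rely on.
import re
import string
from nltk.corpus import words, wordnet
from nltk.stem import WordNetLemmatizer

print(checkgibberish_words("three green apples"))  # should be False: known words or recognisable plurals
print(checkgibberish_words("three green qwzrtv"))  # should be True: 'qwzrtv' is not found anywhere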
def getWords():
    letters = textSearch.get()
    sLen = searchLen.get()
    if sLen:
        sLen = int(sLen)
    else:
        sLen = 0
    listWords = []
    compRegx = re.compile('^[{}]+$'.format(letters))
    for w in wordnet.words():
        match = compRegx.search(w)
        if match and compRepeat(letters, w):
            if sLen == 0:
                listWords.append(match.group())
            elif len(match.group()) == sLen:
                listWords.append(match.group())
    listBox1.delete(0, listBox1.size())
    i = 1
    for w in listWords:
        listBox1.insert(i, "-> {}".format(w))
        i += 1
    listBox1.pack()
def build_word_data():
    print('***** Build words.\n')
    words = list(wn.words())
    max_length_word = get_longest_item(words)
    dump_data_to_pickle(WORDS_PATHS['pkl'], words, max_length_word)
def gen_signature(word):
    """Generate a signature for each candidate expansion, using contextual
    information from the Brown corpus, as well as WordNet definitions and
    examples (if applicable)."""
    if word in gen_signature.dict:
        return gen_signature.dict[word]
    inds = find_matches(word)
    if len(inds) > 50:
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1
    sig = {w for w in signature
           if signature[w] > 1
           and w not in stopwords.words('english')
           and w != ','}
    if word in wn.words():
        synsets = wn.synsets(word)
        if synsets:
            # use the Synset object's methods directly
            define = synsets[0].definition()
            examples = synsets[0].examples()
            if examples:
                for ex in examples:
                    sig.update([w for w in wt(ex)
                                if w not in stopwords.words('english')])
            if define:
                sig.update([w for w in wt(define)
                            if w not in stopwords.words('english')])
    gen_signature.dict[word] = sig
    return sig
def get_words(self, n=500):
    '''Get the unique words from WordNet and return the first n of them as a list.'''
    # a set is used to make sure the words are unique (note: set order is arbitrary)
    return list(set(wn.words()))[:n]
def generate_network(word, network=defaultdict(set)):
    # note: the mutable default argument means `network` is shared across calls
    print("building network for word '%s' subtree..." % word)
    logger.info("building network for word '%s' subtree..." % word)
    words, target = wn.words(), wn.synset('%s.n.01' % word)
    targets = set(open('data/%s_dependencies.txt' % word).read().split('\n'))
    nouns = {
        noun
        for word in words
        for noun in wn.synsets(word, pos='n')
        if noun.name() in targets
    }
    for noun in nouns:
        for path in noun.hypernym_paths():
            if target not in path:
                continue
            for i in range(path.index(target), len(path) - 1):
                if not path[i].name() in targets:
                    continue
                network[noun.name()].add(path[i].name())
    nb_vertex = len(network)
    with open('data/%s_network.csv' % word, 'w') as out:
        for key, vals in network.items():
            for val in vals:
                out.write(key.split('.')[0] + ',' + val.split('.')[0] + '\n')
    nb_links = len(pd.read_csv('data/%s_network.csv' % word))
    print('Built network of %s vertices and %s links for word %s'
          % (nb_vertex, nb_links, word))
def analyse_definition():
    from nltk.corpus import wordnet as wn
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    all_words = set(wn.words())
    all_defs = set()
    with progressbar.ProgressBar(max_value=len(all_words)) as bar:
        for idx, word in enumerate(all_words):
            synsets = wn.synsets(word)
            definitions = ' '.join(s.definition() for s in synsets)
            definitions = definitions.replace(";", "")
            defwords = [
                defw for defw in definitions.split(' ')
                if defw in all_words
            ]
            defwords += [
                lemmatizer.lemmatize(defw)
                for defw in definitions.split(' ')
                if defw not in all_words and lemmatizer.lemmatize(defw) in all_words
            ]
            all_defs.update(defwords)
            bar.update(idx)
    print("Full voc:", len(all_words))
    print("Full def:", len(all_defs))
def __init__(self):
    words = list(set(wn.words()))
    counter = Counter()
    self.max_len = 0
    for word in words:
        counter.update([word])
        synsets = wn.synsets(word)
        for meaning in synsets:
            definition = re.sub(r'\([^)]*\)', '', meaning.definition())
            if len(definition) == 0:
                continue
            if definition[0] == ' ':
                definition = definition[1:]
            self.max_len = max(self.max_len, len(definition.split(' ')))
            counter.update(definition.split(' '))
    self.vocab = Vocab(counter, specials=('<unk>', '<pad>', '<sos>', '<eos>'))
    self.vocab_len = len(self.vocab)
    self.meanings = []
    out_counter = Counter()
    for word in words:
        if counter[word] > 3:
            out_counter.update([word])
            self.meanings.extend([(word, i.definition()) for i in wn.synsets(word)])
    self.out_vocab = Vocab(out_counter, specials=('<unk>', '<pad>', '<sos>', '<eos>'))
    self.out_vocab_len = len(self.out_vocab)
def __init__(self):
    self.words = list(set(wn.words()))
    counter = Counter()
    for word in self.words:
        counter.update([word])
        synsets = wn.synsets(word)
        for meaning in synsets:
            counter.update(meaning.definition().split(' '))
    self.vocab = Vocab(counter)
def get_n_length_words(n):
    words = list(wn.words())
    nLength = set()
    for word in words:
        i = count_underscore(word)
        if len(word) - i == n:
            word = word.replace('_', '')
            nLength.add(word.upper())
    return sorted(nLength)
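# Usage sketch for get_n_length_words above; count_underscore is not shown in the
# snippet, so it is assumed here to simply count '_' characters in a lemma.
from nltk.corpus import wordnet as wn

def count_underscore(word):
    return word.count('_')

five_letter = get_n_length_words(5)
print(len(five_letter), five_letter[:10])  # first few 5-letter entries, upper-cased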
def is_english_word(word):
    """Check whether a word appears in the WordNet vocabulary.
    Return: True if the word is found, False otherwise.
    """
    setofnetwords = set(wordnet.words())
    return word in setofnetwords
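# Usage sketch for is_english_word above. It rebuilds set(wordnet.words()) on every
# call, so when it is used in a loop it is cheaper to build the set once and reuse it,
# as sketched here with a hypothetical cached variant.
from nltk.corpus import wordnet

WORDNET_VOCAB = set(wordnet.words())

def is_english_word_cached(word):
    return word in WORDNET_VOCAB

print(is_english_word('house'))          # should be True
print(is_english_word_cached('xyzzyq'))  # should be False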
def eng(self):
    words = list(set(wn.words()))
    out_words = []
    for word in words:
        meanings = wn.synsets(word)
        word = word.replace('_', ' ')
        for meaning in meanings:
            out_words.append((word, meaning.definition()))
    indices = list(range(len(out_words)))
    return out_words, indices
def __init__(self):
    self.tag = random.sample(list(wordnet.words()), 1)[0]
    self.intensity = random.random()
    self.sentiment = random.random() * 2 - 1
    self.lifetimesecs = random.random() * (EvolvingRandomTag.MAX_LIFETIME - 1) + 1
    self.ttl = self.lifetimesecs
    self.t_start = time.time()
    self.t_last = self.t_start
    self.intensitystep = EvolvingRandomTag.MAX_STEP_INTENSITY  # max step per second
    self.sentimentstep = EvolvingRandomTag.MAX_STEP_SENTIMENT
def construct_wn_thesauri(self, thesauri_dir):
    """
    1. T <- extract all WordNet words
    2. for a word in T, find its WordNet synonyms and antonyms, and save them in two sets
    3. for all words and their corresponding sets, save them in two dicts
    4. save the two dicts as npy files
    If a word doesn't appear in a dict, it will return an empty set.
    :param thesauri_dir: path to save syn_dict and ant_dict
    :return: None
    """
    # key: word, value: syn_words (set)
    syn_dict = defaultdict(set)
    # key: word, value: ant_words (set)
    ant_dict = defaultdict(set)
    # get synonyms from wordnet
    # TODO: configure this in the future
    # since a word has multiple meanings, we combine all the synonyms of all its senses
    # as its synonyms (this can be configured in the future), and likewise all the
    # antonyms as its antonyms
    for word in tqdm(wn.words()):
        synonyms, antonyms = [], []
        for syn in wn.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
                antonyms.append([ant_lemma.name() for ant_lemma in lemma.antonyms()])
        # add word's synonyms
        syn_dict[word].update(synonyms)
        # remove the word itself from its syn set
        if word in syn_dict[word]:
            syn_dict[word].remove(word)
        # add word's antonyms
        ant_dict[word].update(chain.from_iterable(antonyms))
        # remove the word itself from its ant set
        if word in ant_dict[word]:
            ant_dict[word].remove(word)
        # since a word t may have multiple meanings, the same word w could be both its
        # synonym and antonym; in that case, remove w from t's synonyms
        syn_dict[word] = syn_dict[word] - ant_dict[word]
        # CAUTION: the original WordNet relations are asymmetric on synonyms and antonyms
        # make syn_dict symmetric
        for synonym in syn_dict[word]:
            syn_dict[synonym].add(word)
        # make ant_dict symmetric
        for antonym in ant_dict[word]:
            ant_dict[antonym].add(word)
    # save synonym/antonym dicts
    np.save(join(thesauri_dir, 'syn_dict.npy'), syn_dict)
    np.save(join(thesauri_dir, 'ant_dict.npy'), ant_dict)
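# Sketch of reading back the thesauri written by construct_wn_thesauri above. np.save
# pickles the two defaultdicts, so allow_pickle=True plus .item() recovers them; the
# directory name here is only an example.
import numpy as np
from os.path import join

thesauri_dir = 'thesauri'  # assumed output directory
syn_dict = np.load(join(thesauri_dir, 'syn_dict.npy'), allow_pickle=True).item()
ant_dict = np.load(join(thesauri_dir, 'ant_dict.npy'), allow_pickle=True).item()
print(sorted(syn_dict.get('happy', set()))[:5])  # a few WordNet synonyms of 'happy'
print(sorted(ant_dict.get('happy', set()))[:5])  # its antonyms, e.g. 'unhappy'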
def gen_data(network=defaultdict(set)):
    words, target = wn.words(), wn.synset('mammal.n.01')
    targets = set(open('data/targets.txt').read().split('\n'))
    nouns = {noun for word in words
             for noun in wn.synsets(word, pos='n')
             if noun.name() in targets}
    for noun in nouns:
        for path in noun.hypernym_paths():
            if target not in path:
                continue
            for i in range(path.index(target), len(path) - 1):
                if not path[i].name() in targets:
                    continue
                network[noun.name()].add(path[i].name())
    with open('data/mammal_subtree.tsv', 'w') as out:
        for key, vals in network.items():  # dict.iteritems() is Python 2 only
            for val in vals:
                out.write(key + '\t' + val + '\n')
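# Sketch of inspecting the edge list written by gen_data above, assuming pandas is available.
import pandas as pd

edges = pd.read_csv('data/mammal_subtree.tsv', sep='\t', header=None, names=['child', 'parent'])
print(len(edges), 'edges')
print(edges.head())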
def crawl_wordnet(self, corenlp_url):
    corenlp = StanfordCoreNLP(corenlp_url)
    for i, word in enumerate(wordnet.words()):
        if word in self._data:
            logger.info('skip a known word {}'.format(word))
            continue
        self._data[word] = []
        for synset in wordnet.synsets(word):
            def_ = corenlp.tokenize(synset.definition())[0]
            self._data[word].append(def_)
        if i % 10000 == 0:
            self.save()
    self.save()
def structure_to_sdr(self, synset_structure_with_idx):
    con = sqlite3.connect("data/words_sdr.db")
    con.execute("CREATE TABLE sdrtable (word, sdr)")
    sql = ''' INSERT INTO sdrtable(word, sdr) VALUES(?,?) '''
    for word in wn.words():
        cursor = con.cursor()
        sdr = ' '.join(str(idx) for idx in self.word_to_sdr(word, synset_structure_with_idx))
        cursor.execute(sql, (word, sdr))
    con.commit()
    con.close()
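# Sketch of querying the SQLite table written by structure_to_sdr above; the database
# path matches the one hard-coded in that method, and 'dog' is just an example word.
import sqlite3

con = sqlite3.connect("data/words_sdr.db")
row = con.execute("SELECT word, sdr FROM sdrtable WHERE word = ?", ('dog',)).fetchone()
if row is not None:
    word, sdr = row
    print(word, [int(i) for i in sdr.split()])  # the SDR is stored as space-separated indices
con.close()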
def get_prepared_words() -> List[str]:
    # init
    nltk.download('words')
    nltk.download('stopwords')
    nltk.download('wordnet')
    # prepare
    prepared_words = []
    r = requests.get(
        'https://raw.githubusercontent.com/dwyl/english-words/master/words_dictionary.json'
    )
    dwyl_words = list(r.json().keys())
    ahmadly_words = list(
        pickle.load(
            urlopen(
                "https://github.com/jojoee/WordCookiesCheat/blob/master/all_words.pickle?raw=true"
            )))
    wordnet_words = list(wordnet.words())
    # proceed
    prepared_words = stopwords.words() + words.words() + wordnet_words + ahmadly_words
    prepared_words = [word.lower() for word in prepared_words if word.isalpha()]
    prepared_words = list(set(prepared_words))
    # sort
    prepared_words.sort()
    # debug
    print("stopwords size: %d" % len(stopwords.words()))
    print("words size: %d" % len(words.words()))
    print("wordnet size: %d" % len(list(wordnet.words())))
    print("dwyl_words size: %d" % len(dwyl_words))
    print("prepared_words size: %d" % len(prepared_words))
    return prepared_words
def _scores_over_sample(self):
    words = set(wn.words())
    # random.sample requires a sequence (not a set) in recent Python versions
    sample = set(random.sample(list(words), self.sample_size))
    for word1 in sample:
        for word2 in (sample - set([word1])):
            self.compute_overlap(word1, word2)
            self.path_similarity(word1, word2)
            # self.lch_similarity(word1, word2)
            self.wup_similarity(word1, word2)
            # self.res_similarity(word1, word2)
            # self.jcn_similarity(word1, word2)
            # self.lin_similarity(word1, word2)
            print(f"{word1} and {word2} done")
        sample = sample - set([word1])
def __init__(self, in_terms=[], threshold=0.025, allow_multiple=False, rseed=1, num_rands=10):
    self.threshold = threshold
    self.synset_counts = {}
    self.synset_set = set()
    self.preceding_words = set([''])
    self.succeeding_words = set([''])
    self.matched_terms = set()
    self.skip_set = set()
    self.max_length = 0
    self.allow_multiple = allow_multiple
    if rseed > 0:
        random.seed(rseed)
    words = []
    for i in range(num_rands):
        word = random.sample(list(wn.words()), 1)[0]
        while wn.synsets(word)[0].pos() != 'n':
            word = random.sample(list(wn.words()), 1)[0]
        print('word {}'.format(word))
        words += [word]
    self.get_synsets_recursively(wn.synsets(words[0]))
    self.overfrequent_synsets = self.skip_set.copy()
    print('overfrequent synsets : {}'.format(self.overfrequent_synsets))
    for word in words[1:]:
        self.skip_set = set()
        self.get_synsets_recursively(wn.synsets(word))
        print('skip_set synsets : {}'.format(self.skip_set))
        self.overfrequent_synsets &= self.skip_set
        print('overfrequent synsets : {}'.format(self.overfrequent_synsets))
    for t in in_terms:
        if not self.add_training_term(t):
            logging.debug('no wn vocab found :: {}'.format(t))
    self.make_set()
def findWordNetRhymes(word):
    w = d(word.lower())[0]
    res = []
    for ss in wn.words():
        if d(ss)[0] == w:
            res.append(ss)
    if len(res) < 1:
        word = word[1:]
        if len(word) > 1:
            print("Going one level further")
            return findWordNetRhymes(word)
    return res
def main(result_file, shuffle, sep):
    words = wn.words()
    nouns = set([])
    for word in words:
        nouns.update(wn.synsets(word, pos='n'))
    print(len(nouns), 'nouns')
    hypernyms = list(transitive_closure(nouns))
    print(len(hypernyms), 'hypernyms')
    if not shuffle:
        random.shuffle(hypernyms)
    with open(result_file, 'w') as fout:
        for n1, n2 in hypernyms:
            print(n1.name(), n2.name(), sep=sep, file=fout)
def __init__(self):
    self.nlp = English()
    self.model = Word2Vec.load_word2vec_format(self.model_file, binary=True)
    self.new_words.extend(list(self.extra_words.values()))
    self.sw = set(stopwords.words('english'))
    self.sw.update(webcolors.CSS3_NAMES_TO_HEX)  # remove colors
    self.sw.update(self.lorem)
    self.sw.update(self.extra_stopwords)
    for w in self.non_stopwords:
        self.sw.discard(w)
    self.english_vocab = set(w.lower() for w in words.words('en'))
    self.english_vocab.update(wordnet.words('eng'))
    self.english_vocab.update(self.new_words)
    self.word_set = set(self.model.index2word)
    self.num_features = 300
def create_Wordnet_set():
    '''Create a set of all words from the WordNet dictionary.
    Input = None
    Output = Set object of all words.
    '''
    # Import words from wordnet
    from nltk.corpus import wordnet as wn
    Words = wn.words()
    # Build the set directly from the word generator
    Set_dict_words = set(Words)
    # Return Set
    return Set_dict_words
def get_wordnet_subset(self):
    '''
    Get the subset of WordNet for a specific language.
    Using the fastText model, this function associates each synonym with a similarity
    score relative to the processed word. A dictionary is returned with the language
    word as key and the corresponding synonym dict as value; each synonym dict has the
    synonym itself as key and the similarity as value.
    :return:
    '''
    # Load the model
    self.emb_model = self.load_fastText_model()
    words = [word.replace('_', ' ') for word in wn.words(lang=self.LANG)]
    syns = {}
    for w in words:
        syns[w] = [
            synset.lemma_names(self.LANG)
            for synset in wn.synsets(w, lang=self.LANG)
        ]
    synonyms = {}
    for word in syns.keys():
        syn_list = {}
        for synset in syns[word]:
            for word_syn in synset:
                if word.lower() == word_syn.lower():
                    continue
                if self.word_in_syn_list(syn_list, word_syn):
                    continue
                if math.isnan(float(self.get_word_similarity(word, word_syn))):
                    continue
                syn_list[word_syn.replace('_', ' ')] = float(
                    self.get_word_similarity(word, word_syn))
        synonyms[word] = syn_list
    return synonyms
def train(tests):
    #files = os.listdir(c_dir)
    #print(files)
    #for f in files:
    c_dir = "/home/such/Documents/NLP/Programming_Assignment_resources/"
    fpath = c_dir + 'big.txt'
    #stemmer = WordNetStemmer()
    #stem = lambda x: stemmer.stem(x)
    stops = stopwords.words('english')
    doc = open(fpath, 'r')
    words = re.findall('[a-z]+', doc.read().lower())
    #words2 = [w for w in words if w not in stops]
    # define n-gram model from the file stored using pickle
    #corp = pickle.load(open('corpfile'))
    corp = [x.lower() for x in brown.words()]  # a list, since corp is scanned repeatedly below
    wn_vocab = set(wn.words())  # WordNet vocabulary for membership tests
    ispresent = lambda x: x in wn_vocab
    #corp = filter(ispresent, corp)
    #corp_words = re.findall('[a-z]+', corp)
    #corp = [w for w in corp if w not in stops]
    corp_dict = offset_dict(corp)
    for s in tests:
        print(s)
        test_words = re.findall('[a-z]+', s.lower())
        # Finding the possible misspelled words
        misspelled = []
        sentences = s.split('.')
        #mispos = {}
        #sentences = [s for s in sentences if s not in stops]
        for t_word in test_words:
            if t_word.lower() not in words and t_word.lower() != '':
                misspelled.append(t_word.lower())
                #mispos[w.lower()] = s.index(w)
        #print(mispos)
        # finding the candidate words for words in the misspelled array
        candidates = {}
        for wrong in misspelled:
            #pos = s.index(wrong)
            candidates[wrong] = list(edit_distances.correct(wrong))
        # find the context words for the test sentences and the corpus
        corrections = {}
        for miss in misspelled:
            print(test_words)
            # find the context words for the misspelled words
            error_dict = offset_dict(test_words)
            error_context = list(set(concord(error_dict, test_words, miss)))
            error_context = [e for e in error_context if e not in stops]
            errcont = []
            for errc in error_context:
                errcont += list(set(concord(corp_dict, corp, errc)))
            errcont = list(filter(ispresent, errcont))
            errcont = [e for e in errcont if e not in stops]
            errcont += error_context
            #print("error context")
            #print(error_context)
            #print(errcont)
            # for each context word find how often it co-occurs with each of the corrections
            counts = {}
            can_list = candidates[miss]
            #print(can_list)
            for c in can_list:
                cand_cooccur = list(set(concord(corp_dict, corp, c)))  # change the corpus here
                #cand_cooccur = filter(lambda x: edit_dist(c, miss) < 2, cand_cooccur)
                cand_cooccur = list(filter(ispresent, cand_cooccur))
                cand_cooccur = [ca for ca in cand_cooccur if ca not in stops]
                #print("candidate contexts for " + c)
                #print(cand_cooccur)
                count = sum([cand_cooccur.count(i) for i in errcont])
                counts[c] = count, sim(errcont, c)
            print(counts)
            corrections[miss] = max(counts, key=lambda a: counts.get(a))
            p = test_words.index(miss)
            test_words[p] = max(counts, key=lambda a: counts.get(a))
            # Suggest the corrections
            print("misspelled: " + miss + "\n")
            try:
                print("correction: " + corrections[miss] + "\n\n")
            except ValueError:
                pass
def train(words):
    tagging_model = dict()
    training_model = dict()
    for word, tag in words:
        try:
            tagging_model[word].add(tag)
        except KeyError:
            tagging_model[word] = set([tag])
        try:
            training_model[word] += 1
        except KeyError:
            training_model[word] = 1
    return training_model, tagging_model


known_good_words = list(wordnet.words())
known_words, known_tagged_words = train(brown.tagged_words())


def edit_distance_one(word):
    set_of_words = set()
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    inserts = set([a + c + b for a, b in splits for c in alphabet])
    set_of_words = set_of_words.union(inserts)
    deletes = set([a + b[1:] for a, b in splits if b])  # b is a string, so test truthiness
    set_of_words = set_of_words.union(deletes)
    trans = set([a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1])
    set_of_words = set_of_words.union(trans)
    return set_of_words  # assumption: the original ends without a return; the accumulated set is returned here
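# Usage sketch for edit_distance_one above; `alphabet` is not defined in the snippet,
# so it is assumed here to be the lowercase ASCII letters.
import string

alphabet = string.ascii_lowercase

candidates = edit_distance_one('teh')
print('the' in candidates)  # should be True: 'teh' -> 'the' is a single transposition
print(len(candidates))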
# (fragment, presumably from the body of get_synsets(): antonym pairs are recorded symmetrically with weight -1)
        matrix.append([net_words.index(str(word)), net_words.index(ant_word), -1])
        matrix.append([net_words.index(ant_word), net_words.index(str(word)), -1])


if __name__ == "__main__":
    result_word = open('/Users/Jane/Documents/Python/result_word.txt', 'w')
    result_matrix = open('/Users/Jane/Documents/Python/result_matrix.txt', 'w')
    net_words = []
    synsets = []
    antonyms = []
    counter = 0
    matrix = []
    word_list = list(wn.words())
    random.shuffle(word_list)
    for word in word_list:
        if counter < 25000:
            get_synsets(word, synsets, net_words)
            counter += 1
    for item in net_words:
        result_word.write(item)
        result_word.write('\n')
    result_word.close()
    for item in matrix:
        result_matrix.writelines(str(item))
        result_matrix.write('\n')
    result_matrix.close()
    print(len(net_words))