def make_grammatical_poem(verses, N=10, grammar=kjvdict):
    '''
    Make a poem N lines long that attempts to be grammatically correct,
    given a list of verses and a "grammar", which is a frequency
    distribution of POS bigrams. The verses are assumed to be already
    tokenized.
    '''
    initial = random.choice(verses)
    poem = [initial]
    i = 0
    while i < N:
        initial_tags = nltk.pos_tag(initial)
        final_tag = initial_tags[-1][1]
        next_tag = weighted_choice(grammar.get(final_tag, 'NN'))
        random.shuffle(verses)
        for j in range(len(verses)):
            next_verse = verses[j]
            if nltk.pos_tag(next_verse)[0][1] == next_tag:
                poem.append(next_verse)
                initial = next_verse
                i += 1
                break
        else:
            # No verse starts with the desired tag; stop rather than loop forever.
            print "Could not find matching lines to finish poem."
            break
    return poem
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # words preceding the target position
    forward = context[:pos]
    f = forward[-(windowSize // 2):]
    # words following the target position
    backward = context[pos + 1:]
    b = backward[:windowSize // 2]
    # enumerate instead of list.index() so repeated words get the correct offsets
    for i, item in enumerate(f):
        dictionary["pre" + str(len(f) - i) + "-word"] = item
        text = nltk.word_tokenize(item)
        dictionary["pre" + str(len(f) - i) + "-pos"] = nltk.pos_tag(text)[0][1]
    for i, item in enumerate(b):
        dictionary["fol" + str(i + 1) + "-word"] = item
        text = nltk.word_tokenize(item)
        dictionary["fol" + str(i + 1) + "-pos"] = nltk.pos_tag(text)[0][1]
    return dictionary
def update(self, other):
    """Adds counts for elements in other"""
    if isinstance(other, self.__class__):
        self.n_sents += other.n_sents
        for x, n in other.items():
            self[x] += n
    else:
        for sent in other:
            self.n_sents += 1
            # Use the POS cache if one is available to avoid re-tagging sentences
            if self.poscache is not None:
                if sent in self.poscache:
                    tags = self.poscache[sent]
                else:
                    self.poscache[sent] = tags = nltk.pos_tag(
                        nltk.word_tokenize(sent))
            else:
                tags = nltk.pos_tag(nltk.word_tokenize(sent))
            for tok, tag in tags:
                self[tag] += 1
    if self.normalize:
        for x, n in self.items():
            self[x] /= float(self.n_sents)
def create_synonyms(orig_word):
    '''Function for creating synonyms for a given word.'''
    try:
        headers = {
            "X-Mashape-Key": "aIder4iWr4msh5Scn073WRoddmAEp1qA0I3jsnSR8lfJwtyzpg",
            "Accept": "application/json"}
        response = requests.get(
            "https://wordsapiv1.p.mashape.com/words/{}/synonyms".format(orig_word),
            headers=headers)
        if response.status_code == 200:
            json = response.json()
            synonyms = json['synonyms']
            # synonyms = nltk.word_tokenize(synonyms)
            synonyms = nltk.pos_tag(synonyms)
            word = nltk.word_tokenize(orig_word)
            word = nltk.pos_tag(word)[0]
            print(synonyms)
            good_syns = []
            for syn in synonyms:
                print(word[1], syn[1])
                if word[1] == syn[1]:
                    print('*')
                    good_syns.append(syn[0])
            # get_or_create returns an (object, created) tuple
            word, _ = Word.objects.get_or_create(word=orig_word)
            for syn in good_syns[:2]:
                try:
                    new_word = Word.objects.create(word=syn.lower(), is_synonym=True)
                except Exception:
                    # the word already exists; fetch it instead
                    new_word = Word.objects.get(word=syn.lower())
                syn = Synonym.objects.create(word=new_word)
                syn.synonym_to.add(word)
            return good_syns
    except Exception as e:
        print(e)
def writeOut(lsummary_out, allwordsphrases=[], outputpath='.', gridset=''): # Write data out for the last folder (gridset) encountered - MUST BE A BETTER WAY THAN THIS? uWordsPhrases = uniqueSet(allwordsphrases) # Set of unique words. uwords =[] uphrases = [] words = [] phrases =[] wordtypes =[] wordtypes =[] total_wordsphrases = total_uwordsphrases = total_words = total_phrases = 0 ldata_out = UnicodeWriter(open(outputpath + '/'+ gridset +'/language-data.csv', 'wb'), delimiter=',', quotechar='"') ldata_out.writerow(["WORD", "NUMBER OF WORDS", "COUNT", "TYPE"]) # Output metrics to file. for item in uWordsPhrases: num_words = len(item.split()) item_count = allwordsphrases.count(item) if num_words == 1: # Single word word_type = nltk.pos_tag(item)[-1][-1] #word_type_help = nltk.help.upenn_tagset(word_type) # MAYBE CONVERT TAGS INTO MORE USEFUL WORDS?! ldata_out.writerow([item, str(num_words), str(item_count), word_type]) uwords.append(item) wordtypes.append(word_type) elif num_words > 1: # Phrase nltk_words = nltk.word_tokenize(item) word_pos = nltk.pos_tag(nltk_words) ### HOW TO DEAL WITH PHRASES??? word_types = [x[1] for x in word_pos] ldata_out.writerow([item, str(num_words), str(item_count), " ,".join(word_types)]) # HOW TO OUTPUT EACH POS TO A COLUMN??? uphrases.append(item) for item in allwordsphrases: num_words = len(item.split()) if num_words == 1: words.append(item) elif num_words > 1: phrases.append(item) uword_types = countDuplicatesInList(wordtypes) total_wordsphrases = len(allwordsphrases) total_uwordsphrases = len(uWordsPhrases) total_uwords = len(uwords) total_uphrases = len(uphrases) total_words = len(words) total_phrases = len(phrases) #["File Name", "Total Words or Phrases", "Total Unique Words or Phrases", "Total Words", "Total Phrases", "Total Unique Words", "Total Unique Phrases", "Types of Word"]) lsummary_out.writerow([gridset, str(total_wordsphrases), str(total_uwordsphrases), str(total_words), str(total_phrases), str(total_uwords), str(total_uphrases), ', '.join(map(str, uword_types))]) raw_words_out = open(outputpath + '/'+ gridset +'/raw-unique-words.text', 'wb') raw_words_out.writelines('\n'.join(uWordsPhrases).encode('utf-8')) raw_phrases_out = open(outputpath + '/'+ gridset +'/raw-unique-phrases.txt', 'wb') raw_phrases_out.writelines('\n'.join(uphrases).encode('utf-8')) raw_words_out = open(outputpath + '/'+ gridset +'/raw-wordsphrases.text', 'wb') raw_words_out.writelines('\n'.join(allwordsphrases).encode('utf-8'))
def load_data(article_text):
    global tagged_words, tagged_sentences, people, sentences
    # a command-line flag decides whether to recompute everything or load it from file to save some time :)
    if "-f" in sys.argv:
        # tokenize & tag all words in the article
        print "Tokenizing & tagging words..."
        tokens = nltk.tokenize.wordpunct_tokenize(article_text)
        tagged_words = nltk.pos_tag(tokens)
        pickle.dump(tagged_words, file('tagged_words.pickle', 'w'))

        # extract & tokenize each sentence separately
        print "Tokenizing & tagging sentences..."
        sentences = nltk.tokenize.sent_tokenize(article_text)
        pickle.dump(sentences, file('sentences.pickle', 'w'))
        tokenized_sentences = [nltk.tokenize.wordpunct_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
        pickle.dump(tagged_sentences, file('tagged_sentences.pickle', 'w'))

        print "Searching for people..."
        instance = ner.NERFinder()
        people = instance.find(tagged_words, sentences, tagged_sentences)
        pickle.dump(people, file('people.pickle', 'w'))
    else:
        tagged_sentences = pickle.load(file('tagged_sentences.pickle', 'r'))
        tagged_words = pickle.load(file('tagged_words.pickle', 'r'))
        sentences = pickle.load(file('sentences.pickle', 'r'))
        people = pickle.load(file('people.pickle', 'r'))
def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1 = ""
    extent1 = ""
    trigger2 = ""
    extent2 = ""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1 = one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2 = one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1 = one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2 = one_extent[0].text

    # Defaults so the function still returns if no token matches a trigger.
    pos1 = pos2 = None
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1 = one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2 = one_pair[1]
            break
    return (pos1, pos2)
def _get_sentiments(self, d): sent_word_net = load_sent_word_net() poscache_filename = "poscache.json" try: poscache = json.load(open(poscache_filename, "r")) except IOError: poscache = {} # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html #import pdb;pdb.set_trace() sent = tuple(nltk.word_tokenize(d)) #pos_tag tags tokens with part of speech (noun, verb etc) if poscache is not None: if d in poscache: tagged = poscache[d] else: poscache[d] = tagged = nltk.pos_tag(sent) else: tagged = nltk.pos_tag(sent) pos_vals = [] neg_vals = [] nouns = 0. adjectives = 0. verbs = 0. adverbs = 0. for w,t in tagged: p, n = 0,0 sent_pos_type = None if t.startswith("NN"): sent_pos_type = "n" nouns += 1 elif t.startswith("JJ"): sent_pos_type = "a" adjectives += 1 elif t.startswith("VB"): sent_pos_type = "v" verbs += 1 elif t.startswith("RB"): sent_pos_type = "r" adverbs += 1 if sent_pos_type is not None: sent_word = "%s/%s"%(sent_pos_type, w) if sent_word in sent_word_net: p,n = sent_word_net[sent_word] pos_vals.append(p) neg_vals.append(n) l = len(sent) avg_pos_val = np.mean(pos_vals) avg_neg_val = np.mean(neg_vals) #import pdb;pdb.set_trace() return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val, nouns/l, adjectives/l, verbs/l, adverbs/l]
def load_data(path): sentences_pos = [] r1 = re.compile(r'\<([^ ]+)\>') r2 = re.compile(r'\$US(\d)') for l in open(path): if not l.strip(): continue l = l.decode('utf-8') l = l.replace(u'’', "'") l = l.replace(u'``', '"') l = l.replace(u"''", '"') l = l.replace(u"—", '--') l = l.replace(u"–", '--') l = l.replace(u"´", "'") l = l.replace(u"-", " ") l = l.replace(u"/", " ") l = r1.sub(r'\1', l) l = r2.sub(r'$\1', l) s = l.strip().split('\t') sa, sb = tuple(nltk.word_tokenize(s) for s in l.strip().split('\t') if s) # ignore double \t sa, sb = ([x.encode('utf-8') for x in sa], [x.encode('utf-8') for x in sb]) for s in (sa, sb): for i in xrange(len(s)): if s[i] == "n't": s[i] = "not" elif s[i] == "'m": s[i] = "am" sa, sb = fix_compounds(sa, sb), fix_compounds(sb, sa) sentences_pos.append((nltk.pos_tag(sa), nltk.pos_tag(sb))) return sentences_pos
def synsym(s1, s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adjectives
    jj0 = [x for x, y in ts0 if y in ('JJ', 'JJR', 'JJS')]
    jj1 = [x for x, y in ts1 if y in ('JJ', 'JJR', 'JJS')]
    if len(jj0) == 0 or len(jj1) == 0:
        jjps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        jjps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    # nouns (determiners included, as in the original feature set)
    jj0 = [x for x, y in ts0 if y in ('NN', 'NNS', 'NNP', 'NNPS', 'DT')]
    jj1 = [x for x, y in ts1 if y in ('NN', 'NNS', 'NNP', 'NNPS', 'DT')]
    if len(jj0) == 0 or len(jj1) == 0:
        nps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        nps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    # verbs
    jj0 = [x for x, y in ts0 if y in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    jj1 = [x for x, y in ts1 if y in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    if len(jj0) == 0 or len(jj1) == 0:
        vps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        vps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    return [jjps, nps, vps]
def nltk_filter(sent):
    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = pos_tag(tokens)
    filtered_sent = ' '
    for token in tokens:
        filtered_sent += '1' + token + ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         # filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '
    # note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = pos_tag(tokens)
    # filtered_sent = ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         # filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '
    for token in tokens:
        filtered_sent += '2' + token + ' '
    return filtered_sent
def glv_window_overlap(t1, t2, n=5):
    '''
    Looks for an alignment within the window between sentences
    (non-overlapping within the sentence) and words with compatible
    lemmas/POS. Emits features for the distance between common words and
    takes the GloVe vector difference between POS-aligned words, inversely
    weighted by sentence distance. In other words, it looks within a window
    of influence around word matches for context and compares the GloVe
    vectors within the (n - 1)-gram context, producing dim * (n - 1) dense
    features.
    '''
    features = Counter()
    v_tagged = pos_tag(leaves(t1))
    w_tagged = pos_tag(leaves(t2))
    for v in ntuples(v_tagged, n):
        for w in ntuples(w_tagged, n):
            # Find exact word alignments inside the window
            alignments = find_exact_alignments(v, w)
            for i, j in alignments:
                # Featurize the word alignment offset within the window
                features[v[i][0] + str(i - j)] += 1
            if not alignments:
                continue
            similar_align = find_tagged_alignments(v, w, alignments)
            for i, j in similar_align:
                word_diff = np.exp(glvvec(v[i][0]) - glvvec(w[j][0]))
                for dim in range(word_diff.shape[0]):
                    features[v[i][1] + ' aligned dim ' + str(dim)] += word_diff[dim]
    return features
def normalize_word(word, lowercase=True, lemmatize=True):
    "Normalize word by stripping plural nouns"
    global NORMWORD_CACHE
    global NORMWORD_POS
    if NORMWORD_WNL is None:
        init_normword_wnl()
    if lowercase:
        word = word.lower()
    if word in NORMWORD_CACHE:
        return NORMWORD_CACHE[word]
    if not lemmatize:
        return word
    treebank_tag = nltk.pos_tag([word])[0][1]
    newword = word
    if (len(newword) > 4) and (treebank_tag == 'NNS'):
        # Only lemmatize plural nouns, leave verbs alone
        wnpos = get_wordnet_pos(treebank_tag)
        if wnpos:
            newword = NORMWORD_WNL.lemmatize(newword, wnpos)
    if newword != word:
        LOGGER.debug('Changing %s to %s' % (word, newword))
        NORMWORD_POS[newword] = nltk.pos_tag([newword])[0][1]
    else:
        NORMWORD_POS[word] = treebank_tag
    NORMWORD_CACHE[word] = newword
    return newword
def tokenizeme(self, LanguageSample):
    self.tokenized_text = nltk.word_tokenize(LanguageSample)
    self.unique_words = list(set(self.tokenized_text))
    self.unique_words.sort()
    # Note: unique_words does not get rid of inflectional-morpheme duplicates
    self.unique_words = nltk.pos_tag(self.unique_words)
    # pos_tag gets the part of speech; the comprehension removes punctuation
    self.tagged_text = [i for i in nltk.pos_tag(self.tokenized_text) if i[1] != "."]
    self.count = len(self.tagged_text)
def replace_proper_nouns(self, o_sent, n_sent):
    proper_nouns = []
    p_pnouns = []
    o_tagged = pos_tag(word_tokenize(o_sent))
    n_tagged = pos_tag(word_tokenize(n_sent))
    # print("\nTransforming the output:")
    # print("Input sentence:", o_sent)
    # print("Found sentence:", n_sent)
    # print("Input sentence tagged:", o_tagged)
    # print("Found sentence tagged:", n_tagged)
    for o in o_tagged:
        if o[1] == 'NNP' and o not in proper_nouns:
            proper_nouns.append(o)
    for n in n_tagged:
        if (n[1] == 'PRP' or n[1] == 'PRP$' or n[1] == 'NNP') and n not in p_pnouns:
            p_pnouns.append(n)
    if (len(proper_nouns) == 1) and (len(p_pnouns) > 0):
        n_sent = sub(r"\b%s\b" % p_pnouns[0][0], proper_nouns[0][0], n_sent, 1)
        gender = self.gp.classify(proper_nouns[0][0])
        # print(proper_nouns[0][0], "is classified as", gender)
        for pnoun in p_pnouns:
            n_pnoun = self.change_gender(pnoun[0], gender)
            n_sent = sub(r"\b%s\b" % pnoun[0], n_pnoun, n_sent)
    elif len(proper_nouns) < 1:
        print("No proper nouns to replace")
    else:
        print("Not yet implemented, :P")
    return n_sent
def getLemma(text, contextFlag=False):
    lemmatizer = WordNetLemmatizer()
    # 'NN':wordnet.NOUN, 'JJ':wordnet.ADJ, 'VB':wordnet.VERB, 'RB':wordnet.ADV
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    result = None
    if len(text.split()) == 1:
        # one word
        tokenized = word_tokenize(text)
        tagged = pos_tag(tokenized)[0]
        lemma = ''
        try:
            lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
        except:
            lemma = lemmatizer.lemmatize(tagged[0])
        result = lemma
    elif len(text.split()) > 1 and contextFlag == True:
        # multiple words, i.e. text, without considering the context
        resultList = []
        for t in text.split():
            tokenized = word_tokenize(t)
            tagged = pos_tag(tokenized)[0]
            lemma = ''
            try:
                lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
            except:
                lemma = lemmatizer.lemmatize(tagged[0])
            resultList.append(lemma)
        result = ' '.join(resultList)
    else:
        # multiple words, i.e. text, considering the context
        resultList = []
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        for t in tagged:
            try:
                resultList.append(lemmatizer.lemmatize(t[0], wordnet_tag[t[1][:2]]))
            except:
                resultList.append(lemmatizer.lemmatize(t[0]))
        result = ' '.join(resultList)
    return result
def expand_with_wordnet(query):
    """
    This function expands every contentful word in the query with its
    WordNet definition. The word itself is not removed. Stop words are
    removed from the word definition as well.
    (Contentful means that it is not a stopword or punctuation sign.)

    INPUT: query -- user query that is a simple string
    OUTPUT: expanded_query -- user query + definitions of contentful words
    """
    stop = stopwords.words("english")
    stop += EXCLUDED
    contentful_tokens = [tok for tok in query.split() if tok not in stop]

    # take the first definition for the current word
    defs = []
    for token in contentful_tokens:
        syn1 = wn.synsets(token, pos=wn.ADJ)[:1]
        syn2 = wn.synsets(token, pos=wn.NOUN)[:1]
        # prefer the adjective definition, fall back to the noun definition
        if syn1:
            defs.append(token)
            def_tokenized = word_tokenize(syn1[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
        elif syn2:
            defs.append(token)
            def_tokenized = word_tokenize(syn2[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]

    # expansion can add some EXCLUDED words back into the query
    defs = set(defs) - set(EXCLUDED)  # removing again
    expanded = " ".join(defs)
    return expanded
def jaccard_similarity(statement, other_statement, threshold=0.5): """ The Jaccard index is composed of a numerator and denominator. In the numerator, we count the number of items that are shared between the sets. In the denominator, we count the total number of items across both sets. Let's say we define sentences to be equivalent if 50% or more of their tokens are equivalent. Here are two sample sentences: The young cat is hungry. The cat is very hungry. When we parse these sentences to remove stopwords, we end up with the following two sets: {young, cat, hungry} {cat, very, hungry} In our example above, our intersection is {cat, hungry}, which has count of two. The union of the sets is {young, cat, very, hungry}, which has a count of four. Therefore, our Jaccard similarity index is two divided by four, or 50%. Given our threshold above, we would consider this to be a match. """ from nltk.corpus import wordnet import nltk import string a = statement.text b = other_statement.text # Get default English stopwords and extend with punctuation stopwords = nltk.corpus.stopwords.words('english') stopwords.extend(string.punctuation) stopwords.append('') lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() def get_wordnet_pos(pos_tag): if pos_tag[1].startswith('J'): return (pos_tag[0], wordnet.ADJ) elif pos_tag[1].startswith('V'): return (pos_tag[0], wordnet.VERB) elif pos_tag[1].startswith('N'): return (pos_tag[0], wordnet.NOUN) elif pos_tag[1].startswith('R'): return (pos_tag[0], wordnet.ADV) else: return (pos_tag[0], wordnet.NOUN) ratio = 0 pos_a = map(get_wordnet_pos, nltk.pos_tag(nltk.tokenize.word_tokenize(a))) pos_b = map(get_wordnet_pos, nltk.pos_tag(nltk.tokenize.word_tokenize(b))) lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_a \ if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords] lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_b \ if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords] # Calculate Jaccard similarity try: ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b))) except Exception as e: print('Error', e) return (ratio >= threshold)
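# A minimal usage sketch for jaccard_similarity above (hedged): the function only needs
# objects that expose a `.text` attribute, so a simple stand-in works for experimentation.
# Requires the usual NLTK data (punkt, stopwords, wordnet, POS tagger) to be downloaded.
if __name__ == "__main__":
    from collections import namedtuple
    Statement = namedtuple("Statement", "text")  # hypothetical stand-in, not from the source
    a = Statement("The young cat is hungry.")
    b = Statement("The cat is very hungry.")
    print(jaccard_similarity(a, b))  # True once the noun-lemma overlap reaches the 0.5 threshold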
def demo():
    # split a paragraph into sentences using the Punkt sentence tokenizer
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)

    # split a sentence into tokens (words + punctuation)
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    #  'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

    # Note: PunktWordTokenizer is no longer shipped with recent NLTK releases.
    PunktWordTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    #  'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    #  (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    # split the paragraph into sentences
    nltk.sent_tokenize(s)
    # split a sentence into words and punctuation
    nltk.word_tokenize(s)
    # POS tagging
    nltk.pos_tag(nltk.word_tokenize(s))
def text_to_pos_list(lst):
    dpos_list = []
    tpos_list = []
    for line in lst:
        if "IsTruthFul" in line:
            continue
        else:
            if line[0] == "0":
                # If deceptive:
                dpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    dpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        dpos_list.append(t)
                    dpos_list.append("</s>")
                dpos_list.append("</r>")
            else:
                tpos_list.append("<r>")
                for sent in nltk.tokenize.sent_tokenize(parse_line(line)):
                    tpos_list.append("<s>")
                    text = nltk.word_tokenize(sent)
                    tagged = nltk.pos_tag(text)
                    for t in tagged:
                        tpos_list.append(t)
                    tpos_list.append("</s>")
                tpos_list.append("</r>")
    return (dpos_list, tpos_list)
def redundant(text1, text2):
    tag1 = nltk.pos_tag(text1)
    tag2 = nltk.pos_tag(text2)
    l1 = len(tag1)
    l2 = len(tag2)
    i = 0
    count = 0
    while i < l1:
        j = 0
        s1 = tag1[i]
        while j < l2:
            s2 = tag2[j]
            if str(s1[1]) == str(s2[1]) and str(s1[0]) == str(s2[0]):
                count = count + 1
            j = j + 1
        i = i + 1
    match = 2 * count / (l1 + l2)
    match = match * 100
    # A pair of texts counts as redundant when more than 70% of the (word, tag) pairs match.
    if match > 70:
        return 1
    else:
        return 0
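# A short usage sketch for redundant() above (hedged): the function expects pre-tokenized
# texts, so tokenize with nltk first. Assumes `import nltk` and the punkt/tagger data packages.
if __name__ == "__main__":
    s1 = nltk.word_tokenize("The cat sat on the mat.")
    s2 = nltk.word_tokenize("The cat sat on a mat.")
    print(redundant(s1, s2))  # 1 when more than 70% of the (word, tag) pairs match, else 0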
def extract_entities2(text):
    entities = []
    """t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t2.evaluate(test_sents)"""
    for sentence in sent_tokenize(text):
        print sentence
        tags = pos_tag(nltk.word_tokenize(sentence))
        tags = tagear(tags)
        chunks = ne_chunk(pos_tag(nltk.word_tokenize(sentence)))
        # chunks = ne_chunk(regexp_tagger.tag((nltk.word_tokenize(text))))
        chunks = ne_chunk(tags)
        # chunks.draw()
        for chunk in chunks:
            # Note: hasattr(chunk, 'node') only works with older NLTK versions;
            # newer releases expose the chunk type via chunk.label() instead.
            if hasattr(chunk, 'node'):
                print chunk
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities
def test_nltkNERParsing(self):
    testString = 'Natural Sciences and Engineering Research Council of Canada'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)

    getGPEs = []
    for treeBranch in chunked:
        if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
            getGPEs.append(str(treeBranch))
    self.assertEqual(1, len(getGPEs))

    testString = 'Milwaukee Foundation'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (GPE New/NNP) (ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
def parse_stock_name(self, stockname): p = engine() instruction_set = stockname.split(',') word_list = instruction_set[0].split(' ') index = 1 categories_ignored = ['RB', 'TO'] tokens = word_tokenize(instruction_set[0]) tags = pos_tag(tokens) i=0 while i < len(tags): if tags[i][1] in categories_ignored: index += 1 i+= 1 else: break quantity = word_list[index-1] disallowed = ['g', 'ml', 'x', 'kg', 'cups', 'cup', 'grams', 'can', 'tbsp', 'tsp', 'tbsps', 'tsps', 'small', 'bunch', 'piece', 'handful', 'pack', 'chopped', 'large', 'a', 'pinch', 'fresh', 'dried', 'heaped', 'thick', 'slices', 'slice', 'of', 'about'] while index < len(word_list): if word_list[index] not in disallowed: break else: index+=1 sentence = " ".join(word_list[index:]) tokens = word_tokenize(sentence) categories = pos_tag(tokens) words = [] for category in categories: if category[1] not in ['NNS', 'VBN', 'VBG']: words.append(category[0]) word = " ".join(words) return quantity, word, None
def test(ws, wf, s, pf, wm, alfa2):
    f1 = open('test_data.data', 'rb')
    f2 = open('test.csv', 'rb')
    val_text = f1.read()
    comt = f2.read().splitlines()
    val_lines = val_text.splitlines()
    acc = 0
    lc = 0
    for line in val_lines:
        token = line.split(' | ')
        token[2] = "<S> " + token[2] + " <E>"
        t_t = token[2].split(' %% ')
        if t_t[0] != "<S> ":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff = "<S>"
        if t_t[2] != " <E>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff = "<E>"
        val_label = nb(ws, wf, s, token[0], pf, aff, bff, alfa2)
        if val_label == comt[lc].split(",")[1]:
            acc += 1
        lc += 1
    print float(acc) / len(val_lines)
    f1.close()
    f2.close()
def score_glove_pos(src, dst, numpy_arrays, labels_array, g, normalize=True): b1 = [] b2 = [] lines = 0 with open(src) as p: for i, line in enumerate(p): s = line.split('\t') b1.append(s[0]) b2.append(s[1][:-1]) #remove \n lines = i + 1 b1_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b1] b2_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b2] res = [] for i in range(lines): tags1 = [tag[0] for tag in b1_pos[i] if tag[1] in NOUN] tags2 = [tag[0] for tag in b2_pos[i] if tag[1] in NOUN] r = [1 - spatial.distance.cosine(g[tag1], g[tag2]) for tag1 in tags1 for tag2 in tags2 if tag1 in labels_array and tag2 in labels_array] if len(r) == 0: res.append(0) else: res.append(round(5*max(r), 2)) if normalize: res = normarlize_score(res) with open(dst, 'w') as thefile: thefile.write("\n".join(str(i) for i in res)) print src + ' finished!'
def m_surrounding(self):
    D = {}
    sent = self.sentence["form"]
    l = len(sent)
    K = self.index
    '''
    for k in range(l):
        if sent[k] == self.word:
            K = k
            break
    '''
    tagp = tagn = ""
    if (K + 1) < l:
        tagn = nt.word_tokenize(sent[K + 1])
        tagn = nt.pos_tag(tagn)
    if (K - 1) >= 0:
        tagp = nt.word_tokenize(sent[K - 1])
        tagp = nt.pos_tag(tagp)
    if tagp != "":
        D["ptag"] = tagp[0][1]
    else:
        D["ptag"] = ""
    if tagn != "":
        D["ntag"] = tagn[0][1]
    else:
        D["ntag"] = ""
    print D
    return D
def printer(sentencescorelist, sentenceList, wordscorelist, wordList):
    outFile = open('./tldr/outFile.txt', 'w')
    for s in range(0, len(sentenceList)):
        if s in sentencescorelist:
            printsentence(sentenceList[s], outFile)
    outFile.write("Topics to research: ")
    topics = []
    numtopics = 3
    poswords = nltk.pos_tag(wordList)
    poskeep = ["NN", "NNS", "NNP", "NNPS"]
    while numtopics > 0:
        temp = max(wordscorelist.iteritems(), key=operator.itemgetter(1))[0]
        templist = [temp]
        templist = nltk.pos_tag(templist)
        if templist[0][1] in poskeep:
            numtopics -= 1
            topics.append(temp)
        del wordscorelist[temp]
    for i in range(0, len(topics)):
        if i != len(topics) - 1:
            outFile.write(topics[i] + ", ")
        else:
            outFile.write(topics[i])
    outFile.close()
def test(ws, wf, s, pf):
    f1 = open('validation_data.data', 'rb')
    val_text = f1.read()
    val_lines = val_text.splitlines()
    acc = 0
    for line in val_lines:
        token = line.split(' | ')
        t_t = token[2].split(' %% ')
        if t_t[0] != "<S>":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff = "<S>"
        if t_t[2] != "<\S>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff = "<\S>"
        val_label = nb(ws, wf, s, token[0], pf, aff, bff)
        if val_label == token[1]:
            acc += 1
    print float(acc) / len(val_lines)
    f1.close()
def make_pos(target_tag, edit_rev): tags, srcs, dsts = edit_rev sentence = '' if target_tag == del_tag: sentence = dsts elif target_tag == add_tag: sentence = srcs if target_tag in tags: tag_indexes = [i for i, x in enumerate(tags) if x == target_tag] trimed = sentence for tag_index in tag_indexes: trimed = trimed[:tag_index] + trimed[tag_index+1:] posed = pos_tag(trimed) pos = [w[1] for w in posed] for tag_index in tag_indexes: pos.insert(tag_index, u'') # debug none_indexes = [i for i, x in enumerate(pos) if x == u''] if tag_indexes != none_indexes: print(tag_indexes, file=sys.stderr) print(none_indexes, file=sys.stderr) print(tags, file=sys.stderr) print(pos, file=sys.stderr) else: posed = pos_tag(u' '.join(sentence).split()) pos = [w[1] for w in posed] return pos
I am a student of UMKC. Studying masters in CS."""
s = word_tokenize(sentence)
print(s)

print('Stemming')
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))

print('parts of speech')
pos = nltk.pos_tag(s)
print(pos)

print('lemmatization')
lemmatize = nltk.WordNetLemmatizer()
print(lemmatize.lemmatize('schooling', pos='v'))

from nltk.util import ngrams
from collections import Counter
print('trigrams')
trigrams = ngrams(s, 3)
print(Counter(trigrams))

from nltk import pos_tag, ne_chunk
import numpy
print(ne_chunk(pos_tag(wordpunct_tokenize(sentence))))
def getPOS(words): # POS tag based feature set # get the parts of speech tags parts_of_speech = nltk.pos_tag(words) ret = {} verbCount = 0 nounCount = 0 properNounCount = 0 adjCount = 0 prpCount = 0 word_list = [] punctuation = [".", ",", "!", "?", ";", ":", "\'", "\""] for (word, pos) in parts_of_speech: if word not in punctuation: word_list.append(word) if 'NNP' in pos: properNounCount += 1 elif 'PRP' in pos: prpCount += 1 # TODO possibly get social relationships # simplify the POS tag tag = map_tag('en-ptb', 'universal', pos) # increment pos counters if "NOUN" in tag: nounCount += 1 elif "ADJ" in tag: adjCount += 1 elif "VERB" in tag: verbCount += 1 wordCount = len(word_list) # record the percentages the pos np = 0 ap = 0 vp = 0 if (wordCount > 0): np = nounCount / wordCount ap = adjCount / wordCount vp = verbCount / wordCount # check the documentation for binning explanation # bin the nouns and add them to dictionary ret["nouns"] = np ret["adjectives"] = ap ret["verbs"] = vp if np < .145: ret["noun_percentage"] = 0 elif np < .255: ret["noun_percentage"] = 1 else: ret["noun_percentage"] = 2 # bin the adjectives and add them to dictionary if ap < .028: ret["adj_percentage"] = 0 elif ap < .096: ret["adj_percentage"] = 1 else: ret["adj_percentage"] = 2 # bin the verbs and add them to dictionary if vp < .13: ret["verb_percentage"] = 0 elif vp < .22: ret["verb_percentage"] = 1 else: ret["verb_percentage"] = 2 if (wordCount > 0): ret["Personal_Pronoun_Percentage"] = prpCount / wordCount else: ret["Personal_Pronoun_Percentage"] = 0 if (nounCount > 0): ret["Proper_Noun_Percentage"] = properNounCount / nounCount else: ret["Proper_Noun_Percentage"] = 0 ret["word_count"] = wordCount return (word_list, ret)
else: reNE = reNE + "|" + listNE[NE] m = re.search('\((' + reNE + ')(\s)', str(namedEnt[i])) if m: typeEntity = m.group(1) entityList.append((entity, typeEntity)) return [(elem[0], elem[1]) for elem in entityList] ### GET TOP N NAMED ENTITIES all_entities = [ get_entities( nltk.ne_chunk( nltk.pos_tag( nltk.word_tokenize( s.encode('utf-8').decode('unicode_escape'))))) for s in sents ] token_left = [entity[0] for entities in all_entities for entity in entities] print(token_left[0:100]) fdist = nltk.FreqDist(token_left) for k in (sorted(fdist, key=fdist.__getitem__, reverse=True)[0:30]): if (has_wikipedia_page(k)): print(k, fdist[k]) #=============================================================================== # IN = re.compile(r'.*\bin\b(?!\b.+ing)') # for doc in [nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(s.encode('utf-8').decode('unicode_escape')))) for s in sents]: # for rel in nltk.sem.extract_rels('PERSON', 'LOC', doc, pattern = IN): # print(nltk.sem.rtuple(rel))
def tags_for_sent(sentence):
    # wrap in list() so callers get a concrete list (map is lazy in Python 3)
    return list(map(change_to_wordnet_tag, nltk.pos_tag(nltk.word_tokenize(sentence))))
import nltk, random from nltk.corpus import nps_chat from nltk.corpus import brown from nltk import word_tokenize posts = nltk.corpus.nps_chat.xml_posts() featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts] t0 = nltk.DefaultTagger('NN') t1 = nltk.UnigramTagger(featuresets, backoff=t0) t2 = nltk.BigramTagger(featuresets, backoff=t1) ##text = word_tokenize("I am good"); ##print(t2.tag(text)); ##print(text); from nltk.corpus import movie_reviews as movies pos_docs = movies.fileids('pos') neg_docs = movies.fileids('neg') classifier_training = [] for doc in pos_docs: sents = movies.sents(doc) for sent in sents: tagged = t2.tag(sent) words = [w for w, k in tagged] tags = [k for w, k in tagged] feature = {} for i in range(len(words) - 1): feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
text = "I'm booking hotel" # 1 tokenizzazione tokens = nltk.word_tokenize(text) #print(nltk.pos_tag(tokens)) for i in tokens: pass #print(nltk.pos_tag([i])) # 2 noun selection (tagging), eliminazione stopwords e case lettere. # POSTAG funziona bene se gli passi l'insieme di token e non la singola parola # cap 5 parte 1 candidates_tokens = [ token[0].lower() for token in nltk.pos_tag(tokens) if token[1][0:2] == 'NN' and token[0] not in nltk.corpus.stopwords.words('english') ] #print(candidates_tokens) # 3.1 stemming Porter cap 3 parte 3.6 USO SOLO QUESTO # Uso il Porter se voglio indicizzare del testo e fare ricerca usando parole alternative. porter = nltk.PorterStemmer() tokens_stemmed1 = [porter.stem(token) for token in candidates_tokens] #print(tokens_stemmed1) # 3.2 stemming Lancaster porter = nltk.LancasterStemmer() tokens_stemmed2 = [porter.stem(token) for token in candidates_tokens] #print(tokens_stemmed2)
from nltk import pos_tag from nltk.stem import PorterStemmer from word_setting import * print(pos_tag(tool_keywords1))
"k10", "k11", "k12", "k13", "k14", "k15" ], header=0, sep=',', error_bad_lines=False, encoding='utf-8') #TODO later: Include emoji about weater in tokenizer () tokenized_text = [] #Include % amd minus for tempurature forcast figures tokenizer = RegexpTokenizer("\w+|%|-") #Start pre-processing for tweet in train_data.tweets: #Tokenize tokens = tokenizer.tokenize(tweet) #Pos tagging append_pos = [] tagged_tokens = nltk.pos_tag(tokens) for posTag in tagged_tokens: # Tagging is case sensitive, so lower needs to be after lower_word = posTag[0].lower() #Keep all verbs, adj, noun, adv if (posTag[1].startswith("V") or posTag[1].startswith("J") or posTag[1].startswith("N") or posTag[1].startswith("R")): append_pos.append(lower_word) #Append each tokenized tweet in the list tokenized_text.append(append_pos)
# Opening input file in read mode inputF = open(inputFilePath, "r") # Opening the intermediate and the final file in write mode outputF = open(intermFilePath, "w") outputF1 = open(outputFilePath, "w") # Holds all tag patterns taglist = [] # Reading through the vocab file # Tagging each line # Extract just the tag from the tagged line and insert it into the list for line in inputF: tokens = nltk.word_tokenize(line) tagggedT = nltk.pos_tag(tokens) temp = " " temp2 = " " for eachToken in tagggedT: temp2 = temp2 + eachToken[0] + "/" + eachToken[1] + " " temp = temp + eachToken[1] + " " taglist.append(temp) temp2 = temp2 + "\n" outputF.write(temp2) tagset = set(taglist) # print tagset count = 0 # Fetting the tag patterns which appear more than 100 times for val in tagset: mt = re.match(r'^(( [A-Z]+)? ([A-Z]+ )?NN[A-Z]* )$', val)
def posTagging(self, corpus):
    tags = []
    for sentence in corpus:
        tag = nltk.pos_tag(sentence)
        tags.append(tag)
    return tags
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
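# A short usage sketch for ie_preprocess above. Assumes `import nltk` and that the
# punkt and averaged_perceptron_tagger data packages have been downloaded.
if __name__ == "__main__":
    doc = "NLTK is a platform for building Python programs. It works with human language data."
    for tagged_sent in ie_preprocess(doc):
        print(tagged_sent)  # each item is a list of (token, POS tag) tuples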
def tagList(jsonList: list, whWord: str, collocate: str, context: str): for obj in jsonList: if not obj["sentence"] or obj["sentence"] == "": # Skip conditions continue sent = obj["sentence"] #iterate through list reversed, find the verb first and then the wh obj["clean_sentence"] = None tok_sents = sent_tokenize(sent) for s in tok_sents: if whWord.lower() in s.lower() and context.lower() in s.lower(): sent = s obj["clean_sentence"] = s break tagged = pos_tag(word_tokenize(sent)) clauseType = None modal = None verb = None modals = [ 'am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have', 'had', 'has', 'do', 'does', 'did', ' can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must' ] context_wh, wh_collocate = f.get_sets_backwards( tagged, context, whWord, collocate) obj['context_wh'] = str(context_wh) obj['wh_collocate'] = str(wh_collocate) obj['wh'] = whWord obj['phrase'] = context try: #tag relative clauses if f.x_in_set("N", context_wh, is_pos=True) or f.x_in_set( "DT", context_wh, is_pos=True) or f.x_in_set( "JJ", context_wh, is_pos=True): clauseType = "Relative Clause" #tag infinitive clausesroorunl elif f.x_in_set("to", wh_collocate, is_pos=False): clauseType = "Non-Finite" verb = f.get_pos_word_in_set(wh_collocate, 'V') #wh__modal__NNP__VB elif f.x_in_set(modals, wh_collocate, is_pos=False): clauseType = "Modal" modal = f.get_pos_word_in_set(wh_collocate, 'M') verb = f.get_pos_word_in_set(wh_collocate, 'V') else: clauseType = "Finite" verb = f.get_pos_word_in_set(wh_collocate, 'V') except: print("BROKE HERE: ") print(obj["resNumber"]) print(sent) print(tagged) print(context_wh) break obj['clauseType'] = clauseType obj['modal'] = modal obj['verb'] = verb return jsonList
short_pos = open("short_reviews/positive.txt", "r").read() short_neg = open("short_reviews/negative.txt", "r").read() # move this up here all_words = [] documents = [] # j is adject, r is adverb, and v is verb #allowed_word_types = ["J","R","V"] allowed_word_types = ["J"] for p in short_pos.split('\n'): documents.append((p, "pos")) words = word_tokenize(p) pos = nltk.pos_tag(words) for w in pos: if w[1][0] in allowed_word_types: all_words.append(w[0].lower()) for p in short_neg.split('\n'): documents.append((p, "neg")) words = word_tokenize(p) pos = nltk.pos_tag(words) for w in pos: if w[1][0] in allowed_word_types: all_words.append(w[0].lower()) save_documents = open("pickled_algos/documents.pickle", "wb") pickle.dump(documents, save_documents) save_documents.close()
x = i.split('.')[1] y = i.split('.')[0] word_tokens[num][n] = y word_tokens[num].append(x) elif (len(i) == 1): word_tokens[num][n] = '' ##stripping leading and lagging spaces from words and appending them to a new list word_tokens_ = [] for i in word_tokens: temp = [j.strip(' ') for j in i] temp_1 = [i for i in temp if len(i) > 2] word_tokens_.append(temp_1) ##creating pos_tags for the corpus pos_tags = [nltk.pos_tag(i) for i in word_tokens_] ##removing NNPs, Prepositions, modular verbs or fillers, foreign words NNP = [] CD = [] IN = [] MD = [] for i in pos_tags: for k, v in i: if v == 'NNP': NNP.append(k) elif v == 'CD': CD.append(k) if v == 'MD':
def answer_eight():
    parts_of_speech = nltk.pos_tag(text1)
    count = nltk.FreqDist(tag for (word, tag) in parts_of_speech)
    answer = count.most_common()[:6]
    output = [i for i in answer if i[0] != ',']
    return output
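# Hedged usage note: `text1` is assumed to be one of the preloaded NLTK book texts,
# e.g. `from nltk.book import text1` (Moby Dick). With that in scope, answer_eight()
# returns the most frequent POS tags (the comma tag excluded) as (tag, count) pairs.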
def fetch_words_from_news(url): # return [[],[]] 0: en, 1:tr article = Article(url) article.download() article.parse() for_nltk = [] news_text = article.text for_nltk.append(article.text) news_text = news_text.upper() news_text_wo_rn = news_text.replace('\n', ' ') news_text_wo_rn = news_text_wo_rn.replace('\r', ' ') news_text_list = news_text_wo_rn.split(' ') news_text_list = set(news_text_list) tokenized_sents = [word_tokenize(i) for i in for_nltk] # remove punctuations from list res = [] new_res = [] #s.translate(None, string.punctuation) #res = [s.translate(str.maketrans('', '', string.punctuation)) for s in tokenized_sents[0] for tixt in tokenized_sents[0]: new_tixt = ''.join( c.translate(str.maketrans('', '', string.punctuation + '“”')) for c in tixt if c not in string.punctuation + '“”') res.append(new_tixt) for d in res: if not d == '': new_res.append(d) capitalized_new_res = [KAP.upper() for KAP in new_res] capitalized_setted_new_res = set(capitalized_new_res) # delete one len item more_than_one_len_CSNR = [] for e in capitalized_setted_new_res: if not len(e) < 2: more_than_one_len_CSNR.append(e) # delete numbers digitless_more_than_OLC = [] for g in more_than_one_len_CSNR: if g.isalpha(): digitless_more_than_OLC.append(g) tags_of_diggless = [nltk.pos_tag(f) for f in digitless_more_than_OLC] tags_of_diggless_2 = nltk.pos_tag(digitless_more_than_OLC) prepless_digitless_MTO = [] for h in digitless_more_than_OLC: if not h.lower() in stop_words: prepless_digitless_MTO.append(h) if_word_in_cor_PDMTO = [] TR_if_word_in_cor_PDMTO = [] for g in prepless_digitless_MTO: if g.lower() in words.words(): if_word_in_cor_PDMTO.append(g) tr.set_text(g) TR_if_word_in_cor_PDMTO.append(tr.translate()) return [if_word_in_cor_PDMTO, TR_if_word_in_cor_PDMTO] # return [[],[]] 0: en, 1:tr
no_punct = [token for token in sent_t if token not in punctuation] temp.append([token for token in no_punct if token not in stops]) sent_tokens = temp word_tokens = [token for token in word_tokens if token not in punctuation] word_tokens = [token for token in word_tokens if token not in stops] # Stems are basic versions of words stemmer = PorterStemmer() stems = {token: stemmer.stem(token) for token in word_tokens} # Lemmas look at the meaning of the word lemmatizer = WordNetLemmatizer() lemmas = {token: lemmatizer.lemmatize(token) for token in word_tokens} tagged_sent = [nltk.pos_tag(sent) for sent in sent_tokens] tagged_words = nltk.pos_tag(word_tokens) ne_chunked = nltk.ne_chunk(tagged_words, binary=True) vader_analyzer = SentimentIntensityAnalyzer() polarity_scores = [vader_analyzer.polarity_scores(sent) for sent in sentences] top_tokens(word_tokens) top_stems(stems) top_lemmas(lemmas) top_nouns_verbs(tagged_sent) top_entities(ne_chunked) top_sentiment_sentence(polarity_scores) identify_weird_words(word_tokens) disp_text(text)
return conf # # training-data short_pos = open("short_reviews\positive.txt", "r").read() short_neg = open("short_reviews\\negative.txt", "r").read() documents, all_words = [], [] # J is adjective, R is adverb, and V is verb # allowed_word_types = ["J","R","V"] allowed_word_types = ["J"] for p in short_pos.split("\n"): documents.append((p, "pos")) words = word_tokenize(p) short_pos_words = nltk.pos_tag(words) for w in short_pos_words: if w[1][0] in allowed_word_types: all_words.append(w[0].lower()) for n in short_neg.split("\n"): documents.append((n, "neg")) words = word_tokenize(n) short_neg_words = nltk.pos_tag(words) for w in short_neg_words: if w[1][0] in allowed_word_types: all_words.append(w[0].lower()) documents_temp = open("pickled_algorithms/documents.pickle", "rb") documents = pickle.load(documents_temp) documents_temp.close()
parser = BeautifulSoup(html, 'html.parser') texts = parser.findAll('p', attrs={'class': 'body-text'}) # remove empty tags texts = [text for text in texts if len(text) > 0] # remove HTML tags texts = [re.sub(r"<.*?>", "", str(text)) for text in texts] texts = texts[0] #for text in texts: tokens = nltk.word_tokenize(texts) tokens_pos = nltk.pos_tag(tokens) chunks = nltk.ne_chunk(tokens_pos) tagged_chunks = [chunk for chunk in chunks if type(chunk) != tuple] #print(tagged_chunks) ''' Annotation guidelines: For annotation we looped through all tokens in the text and annotated each token if it is either (part of) a person (PERSON), location (GPE) or organization (ORGANIZATION) ''' ''' CoNLL2003 format: Token POS-tag Gold standard NER tag Actual Tag Poep N O Person ''' #Manually annotate and store (token_index, tag)
def extract_data(status): url = "traffic_data.txt" names = ['message', 'outcome'] dataset = pandas.read_csv(url, names=names) dataset_x = dataset["message"] dataset_y = dataset["outcome"] cv = TfidfVectorizer(min_df=1, stop_words='english') x_train, x_test, y_train, y_test = model_selection.train_test_split( dataset_x, dataset_y, test_size=0.8, random_state=2) x_train_cv = cv.fit_transform(x_train) x_test_cv = cv.transform(x_test) classifier = LogisticRegression() classifier.fit(x_train_cv, y_train) predictions = classifier.predict(x_test_cv) print(accuracy_score(y_test, predictions)) #Accuracy printing sentence = status sentence = sentence.lower() words = word_tokenize(sentence) #Make of NER code. filename = "place.txt" data_place = pandas.read_csv(filename, names=['place']) data_placename = data_place['place'] jam_place_integer = 0 #This will be needed for first and second phase jam_place2_integer = 0 #This will be needed for third phase jam_place3_integer = 0 #This will be needed for third phase because 'to' will have two places enter_to_second_phase = 0 enter_to_third_phase = 0 enter_to_fourth_phase = 0 if 'near' in words or 'at' in words or 'of' in words: for x in range(0, len(words)): if words[x] == 'near' or words[x] == 'at' or words[x] == 'of': jam_place_integer = x + 1 else: enter_to_second_phase = 1 if enter_to_second_phase == 1: if 'in' in words: for x in range(0, len(words)): if words[x] == 'in': if (str(words[x + 1]) == 'traffic' or str(words[x + 1]) == 'jam' or str(words[x + 1]) == 'grid' or str(words[x + 1]) == 'lock'): enter_to_third_phase = 1 else: pos_tag_of_next_word = nltk.pos_tag( word_tokenize(words[x + 1])) word, tag = zip(*pos_tag_of_next_word) pos_tag_of_next_word_str = str(''.join(tag)) if pos_tag_of_next_word_str == 'NN' or pos_tag_of_next_word_str == 'NNP': jam_place_integer = x + 1 else: enter_to_third_phase = 1 if enter_to_third_phase == 1: if 'to' in words: for x in range(0, len(words)): if words[x] == 'to': jam_place2_integer = x + 1 #Now finding if previous word is a place also pos_tag_of_previous_word = nltk.pos_tag( word_tokenize(words[x - 1])) word, tag = zip(*pos_tag_of_previous_word) pos_tag_of_previous_word_str = str(''.join(tag)) if pos_tag_of_previous_word_str == 'NN' or pos_tag_of_previous_word_str == 'NNP': jam_place3_integer = x - 1 else: enter_to_fourth_phase = 1 #This method creates problem which name has two separate parts. like Manik mia if enter_to_fourth_phase == 1: for w in words: for x in range(data_placename.count()): if str(w) == str(data_placename[x]): jam_place = w #This method creates problem which name has two separate parts. 
jam_place_final_result = '' if jam_place_integer != 0: jam_place = words[jam_place_integer] jam_place_final_result = jam_place if enter_to_third_phase == 1 and enter_to_fourth_phase == 0: if jam_place3_integer == 0: jam_place = words[jam_place2_integer] jam_place_final_result = jam_place if jam_place3_integer != 0: jam_place2 = words[jam_place2_integer] jam_place3 = words[jam_place3_integer] jam_place_final_result = jam_place3 + ' to ' + jam_place2 if enter_to_fourth_phase == 1: jam_place_final_result = jam_place #End of NER test_line_tfidf = cv.transform([sentence]) prediction = classifier.predict(test_line_tfidf) final_result = '' if str(prediction) == '[0]': final_result = 'There may be no traffic jam at ' + jam_place_final_result if str(prediction) == '[1]': final_result = 'There may be traffic jam at ' + jam_place_final_result if str(prediction) == '[2]': final_result = 'Someone is trying to know the road condition of ' + jam_place_final_result print(final_result)
# Import the nltk module for text analysis
import nltk

# Store the sentences to be preprocessed in string variables
sent1 = 'My only regret in life is that I did not drink more wine.'
sent2 = 'I drink to make other people more interesting.'
sent3 = 'An intelligent man is sometimes forced to be drunk to spend time with his fools.'

# Tokenize each sentence, POS-tag it, and print the result
print('POS tagging Sentence 1:')
tokens1 = nltk.word_tokenize(sent1)  # tokenize the sentence
print(nltk.pos_tag(tokens1))         # POS-tag the tokenized sentence and print it

print('POS tagging Sentence 2:')
tokens2 = nltk.word_tokenize(sent2)  # tokenize the sentence
print(nltk.pos_tag(tokens2))         # POS-tag the tokenized sentence and print it

print('POS tagging Sentence 3:')
tokens3 = nltk.word_tokenize(sent3)  # tokenize the sentence
print(nltk.pos_tag(tokens3))         # POS-tag the tokenized sentence and print it
def toke_n_tag(text):
    pos_tagged_text = pos_tag(word_tokenize(text))
    return (pos_tagged_text, text)
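# A short usage sketch for toke_n_tag above (assumes `from nltk import pos_tag, word_tokenize`
# and the standard punkt/tagger data are available).
if __name__ == "__main__":
    tagged, original = toke_n_tag("The quick brown fox jumps over the lazy dog.")
    print(tagged)    # list of (token, POS-tag) tuples
    print(original)  # the untouched input string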
def extractFeaturesAndWriteBio(READ_PATH,file_type): global ALL_poems,bio,cnt, start_time inp=0 sub_cnt=0 words_total=0 lines_total=0 pause_every = 0 for subdir, dirs, files in os.walk(READ_PATH): # RANDOM SELECT random.shuffle(files) for file in files: num_of_files = len(files)-1 # deduct the DS_store #print (num_of_files,'readDirectory',READ_PATH) if file_type in file and 'readme' not in file: # ID id=file.split(".")[0] #print "\n\n*********\nID:",id filenames.append(id) cnt+=1 # print('') #print('') # print('OPENED:',id) # print('') #print('') ############## # HOW MANY? # ############## sub_cnt+=1 if sub_cnt>=int(inp): if int(inp) != 0: end_time = time.time() es = end_time-start_time print sub_cnt, "poems,\n",lines_total,"lines,\n",words_total,"words \ngenerated in\n",("%.2f" % es),"seconds" words_total=0 lines_total=0 # RESTART sub_cnt=0 inp = raw_input("\n\n^^^^^^^^^^^^^^\n\nHow many poems do u want? ") if not inp: print "You entered nothing! 10 poems will be generated." inp=10 sleep_time = raw_input("\nSleep duration?") if not sleep_time: print "You entered no time! 10 second wait assigned." sleep_time=10 pause_every = raw_input("\nPause every 1 or 2 or ...?") if not pause_every: print "You entered nothing! Pause will occur every 10 poems." pause_every=10 print "\n\n^^^^^^^^^^^^^^^" start_time = time.time() print 'Poem #',sub_cnt poem_replaced = "" replacement_word = "" author="" titles="" title="" new_title="" replaced_ls =[] new_titles_ls = [] quit_language=0 ################################################################# # Load POEM TEXT FILE (based on id extracted from Alchemy JSON) # ################################################################# txt_fn_path = DATA_DIR + READ_TXT_PATH + id.split("_")[1]+".txt" #print "txt_fn_path:",txt_fn_path if os.path.isfile(txt_fn_path) and cnt>0: txt_data=open(txt_fn_path).read() # http://blog.webforefront.com/archives/2011/02/python_ascii_co.html # txt_data.decode('ISO-8859-2') .decode('utf-8') # unicode(txt_data) author=txt_data.split("****!****")[0].strip(' \t\n\r') title=txt_data.split("****!****")[1].strip(' \t\n\r') bio=txt_data.split("****!****")[2]#.strip(' \t\n\r') ###### CLEAN BIO bio.replace("\t","	") bio.replace("\n"," <br>") bio.replace("\r"," <br>") poem_replaced=bio #print poem_replaced ############################### # REPLACE AUTHOR NAME ############################## author_ln=author.split(" ")[-1] author_fn=author.split(" ")[:-1] # #poem_replaced = poem_replaced.replace(author_ln,"Jhave") ####################### # fake AUTHOR ####################### new_author= " ".join(random.choice(authors).split(" ")[1:-2])+" "+random.choice(authors).split(" ")[-2] ####################### # replace BOOK TITLES ####################### #print "TITLES"] new_title = getNewTitle("title").encode('utf-8') ############################ # replace years with another ############################ for w1 in poem_replaced.split("("): for w2 in w1.split(")"): if w2 is not None and w2.isdigit(): new_num = random.randint(int(w2)-5,int(w2)+5) #print "REPLACING #:",w2,new_num poem_replaced = poem_replaced.replace(w2,str(new_num)) replaced_ls.append(new_num) ################# # Load JSON # ################# response = loadJSONfile(READ_JSON_PATH+"poetryFoundation_"+id.split("_")[1]+"_Alchemy_JSON.txt") if response != "failed": if response.get('entities') is not None: for idx,entity in enumerate(response['entities']): #print idx ce = entity['text'].replace("0xc2"," ") ce = ce.replace("0xe2","'") ce = re.sub('(' + '|'.join(import_utilities.chars.keys()) 
+ ')', import_utilities.replace_chars, ce) ce = ce.encode('utf-8') try: content = ce.decode('utf-8').encode('ascii', 'xmlcharrefreplace') except UnicodeDecodeError: "AAAARGGGGHHH!!!!" if content in poem_replaced: ################################################ # Replace similar entities from other JSON # ################################################ replacement_entity = findSimilarEntityinRandomJSON(content,entity['type']) cr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, replacement_entity) poem_replaced = poem_replaced.replace(content,replacement_entity) replaced_ls.append(replacement_entity) ########################## # POS REPLACMENT # ########################## token_tuples = nltk.word_tokenize(poem_replaced) tt = nltk.pos_tag(token_tuples) ################# # ADJECTIVES # ################# for i in tt: if "/i" not in i[0] and len(i[0])>3 and i[0] != "died": origw = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, i[0]) origw =import_utilities.strip_punctuation(origw) if i[1]=='JJ' : JJr = random.choice(JJ) # # JJr = re.sub('(' + '|'.join(import_utilities.chars.keys()) + ')', import_utilities.replace_chars, JJr) # JJr = import_utilities.strip_punctuation(JJr) JJr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],JJr.lstrip().lstrip()) if i[0].istitle(): JJr = JJr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', JJr, poem_replaced,1)#poem_replaced.replace(i[0],JJr,1) replaced_ls.append(JJr) if i[1]=='RB': RBr = random.choice(RB) RBr = import_utilities.moveBeginAndEndPunctuationFromStrToString(i[0],RBr.lstrip().lstrip()) if i[0].istitle(): RBr = RBr.title() poem_replaced = re.sub(r'\b' + import_utilities.strip_punctuation(i[0]) + r'\b', RBr, poem_replaced,1) replaced_ls.append(RBr) ######################## # IS IT ENGLISH? # ######################## for line in poem_replaced.split('\n\r'): if len(line)>0 : if "english" not in import_utilities.get_language(line): quit_language+=1 #print "NOT english:",quit_language,line else: quit_language-=1 ######################### # SYNSET REPLACE # ######################### for idx,word in enumerate(poem_replaced.split(' ')): if "<br>" not in word and "	" not in word and len(word)>0 and "~~~~!~~~" not in word: words_total+=1 ######################### # PRONOUN ' VERB # ######################### if len(word.split("'"))>1: if word.split("'")[0] in personal_pronouns: replacement_word = random.choice(personal_pronouns)+"'"+word.split("'")[1]+' ' poem_replaced.replace(word,replacement_word) #print "word,",word,"replacement_word:",replacement_word #################################################### # Replacement of OTHERs # #################################################### elif not word.lower().strip(" \n\t\r") in stopwords.words('english'): # take off leading brackets, commas etc... 
word_punct_nopunct = import_utilities.strip_punctuation_bool(word) word_nopunct = word_punct_nopunct['word'].strip(" \n\t\r") word_punct = word_punct_nopunct['punct'] punct_bool = word_punct_nopunct['punct_bool'] ####################################################### # MAIN EXCHANGE PROCESS CALL >>>>>>> GET THE SYNSET # ####################################################### if word_nopunct[-4:].lower()=="here": similarterm=random.choice(import_utilities.heres) else: #print "WORD:",word_nopunct if len(word_nopunct)>2: similarterm = import_utilities.find_synset_word(word_nopunct)#(word.lstrip().rstrip()) ############################################ # manually get rid of some terrible choices ############################################ if similarterm == "ilk": ##print "like" similarterm = "like" if similarterm == "ope": ##print "doth" similarterm = "does" if similarterm == "information technology": ##print "doth" similarterm = "it" if similarterm == "Nox": ##print "doth" similarterm = "dark" ####################################### # abbreviations for f*****g states! # ####################################### if word_nopunct.upper() in import_utilities.state_abbrev and word_nopunct.lower() not in stopwords.words('english') and "me," not in word: tmp = similarterm if word_nopunct == "oh": similarterm = random.choice(import_utilities.exclaims) else: similarterm = random.choice(RESERVOIR) #print word_nopunct," replaced by", tmp, "replaced with:",similarterm, "in:",line ############## # hyphenated # ############## hyp =word.split("-") #print word,len(hyp) if len(hyp) >1: similarterm="" for w in hyp: if len(w) > 2: similarterm += import_utilities.find_synset_word(w)+"-" similarterm = import_utilities.strip_underscore(similarterm[:-1]) #print "hyphenated:",word,"replaced by: "+similarterm ######################################################### # is it a TRUNCATED VERB slang as in singin or wishin # ######################################################### if similarterm == word_nopunct and len(word)>2 and 'in' in word_nopunct[-2:]: similarterm = import_utilities.find_synset_word(word_nopunct+'g') ## #print "TRUNCATED SLANG word: '"+word+"'",similarterm interim = import_utilities.lemma(similarterm) ## #print interim similarterm = import_utilities.conjugate(interim, tense=import_utilities.PARTICIPLE, parse=True)[:-1] # # # #print word,"widx:",widx," line_pos_tags[widx][0]:",line_pos_tags[widx][0]," line_pos_tags[widx][1]:",line_pos_tags[widx][1] ################# # SWEAR WORD # ################# ##print "at the garden of if:", word if word_nopunct in import_utilities.curses: similarterm = random.choice(import_utilities.curses) ##print "SWEAR WORD word: '"+word+"'",similarterm if len(hyp) >1: replacement_word = similarterm else: replacement_word = word.replace(word_nopunct, similarterm) replacement_word = import_utilities.strip_underscore(replacement_word) replacement_word = import_utilities.replaceNumbers(replacement_word) ######################### # RESERVOIR_OF_WEIRDNESS # ######################### if word_nopunct.lower() in import_utilities.impera: replacement_word=random.choice(import_utilities.impera) #print word,"IMPERA:",replacement_word elif word_nopunct.lower() in import_utilities.conjuncts: replacement_word=random.choice(import_utilities.conjuncts) #print word," CONJUNCTION replaced with",replacement_word elif word_nopunct.lower() in import_utilities.indef_prono: replacement_word=random.choice(import_utilities.indef_prono) #print word," INDEF_prono replaced with",replacement_word elif 
word_nopunct.lower() in import_utilities.prepo: replacement_word=random.choice(import_utilities.prepo) #print word," prepo replaced with",replacement_word elif word_nopunct.lower() in import_utilities.rel_prono: replacement_word=word #print word," rel_prono LEAVE alone: ",replacement_word elif word_nopunct.lower()[-2:] =="ly": replacement_word=import_utilities.strip_underscore(import_utilities.find_synset_word(word))#(word[:-2]) #print word," ADVERB: ",replacement_word # if replacement_word[-2:] !="ly": # replacement_word +="ly" else: if len(hyp) <2 and "like" not in word_nopunct and import_utilities.singularize(word_nopunct) == import_utilities.singularize(replacement_word) and word_nopunct.lower() not in import_utilities.stopwords_ls: if word_nopunct not in RESERVOIR and quit_language<0 and import_utilities.countPunctuation(word)<1 and len(word_nopunct)>3 and not word_nopunct.istitle(): #print "ADDING",word,"to reservoir" RESERVOIR.append(word) replacement_word = random.choice(RESERVOIR) #print word_nopunct,"replaced from reservoir with", replacement_word # print "'"+word_nopunct+"' vs RESERVOIR replacement_word:",replacement_word #," new_line:",new_line if quit_language>1 and not word_nopunct.istitle(): #print quit_language, "Probably foreign language: make a word salad in english" replacement_word = random.choice(RESERVOIR) #print word_nopunct,"OTHER replaced from reservoir with", replacement_word # REPLACEMENT poem_ls = poem_replaced.split(' ') idx = poem_ls.index(word) # #print idx,",", poem_ls[idx],",", word ,",",replacement_word if poem_ls[idx]==word: poem_ls[idx]=replacement_word poem_replaced = " ".join(poem_ls) #poem_replaced = poem_replaced.replace(word,replacement_word) # CORRECT the "A" to "An" for idx,word in enumerate(poem_replaced.split(" ")): # poem_replaced = poem_replaced+"A organism" if len(word)>0 and word[0].lower() in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="a" : if poem_replaced.split(" ")[idx-1] =="a": old_str = "a "+poem_replaced.split(" ")[idx] new_str = "an "+poem_replaced.split(" ")[idx] else: old_str = "A "+poem_replaced.split(" ")[idx] new_str = "An "+poem_replaced.split(" ")[idx] poem_replaced = poem_replaced.replace(old_str,new_str) # poem_replaced = poem_replaced+"An consonant" if len(word)>0 and word[0].lower() not in the_vowels and poem_replaced.split(" ")[idx-1].lower() =="an" : if poem_replaced.split(" ")[idx-1] =="an": old_str = "an "+poem_replaced.split(" ")[idx] new_str = "a "+poem_replaced.split(" ")[idx] else: old_str = "An "+poem_replaced.split(" ")[idx] new_str = "A "+poem_replaced.split(" ")[idx] poem_replaced = poem_replaced.replace(old_str,new_str) #print "FOUND correction needed",old_str,new_str ######################### # WRITE SINGLE POEM # ######################### tmp_poem="" # poem_replaced.replace("\t","	") # poem_replaced.replace("\n"," <br>") # poem_replaced.replace("\r"," <br>") HTML_poem="" for line in poem_replaced.split("\n"): lines_total+=1 #print "LINE", line HTML_poem += line+"<br>" if len(response) >0 and len(id.split("_"))>1: # ALL_poems = ALL_poems_intro + " ".join(i for i in ALL_poems.split("</h2>.")[0:])+"<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem ALL_poems += "<br><br>~~~~~~~~~~~~~~~~~~~~~~~~~~<br>[ A poem generated from template : <b>"+ author+"</b>, <i>"+ title +"</i> ]<br><br><b>"+new_title+"<br><br></b>"+HTML_poem tmp_poem= "[A poem generated from template: "+ author+", '"+ title 
+"'']\n\n'"+new_title+"'\nby\n"+new_author+"\n\n"+poem_replaced ##################### # # # # # PAUSE IT # # # # # ##################### # sleep_time=0.03*sub_cnt #sleep_time=30.03*sub_cnt #print "sub_cnt=",sub_cnt # ,"sleep_time=",sleep_time if (int(sub_cnt)%int(pause_every) == 0 and int(sub_cnt) !=0): time.sleep(int(sleep_time)) # if sub_cnt>=1: # raw_input("Press Enter to continue...") ##################### # # # # # PRINT # # # # # ##################### print "\n******\n"+tmp_poem txt_fn = id.split("_")[1]+"_POEMs.txt" # WRITE_BIO_PATH = DATA_DIR+"generated/POEMS/POEMS_"+datetime.datetime.now().strftime('%Y-%m-%d_%H')+"/" # if not os.path.exists(WRITE_BIO_PATH): # os.makedirs(WRITE_BIO_PATH) txt_fn_path = GENERATED_DIR+txt_fn f_txt=open(txt_fn_path,'w') f_txt.write(tmp_poem)#.encode('utf-8')) f_txt.close(); #print "\nTXT file created at:",txt_fn_path # ####### # # write them all.... wasteful... but useful if run is interrupted.... # ########### # ALL_poems = ALL_poems.replace("$$datetime$$",datetime.datetime.now().strftime('%Y-%m-%d at %H:%M')) # ALL_poems = ALL_poems.replace("$$cnt$$",str(cnt)) # print "cnt",cnt # ALL_poems = ALL_poems.replace("$$gentime$$",str(time.time() - start_time)) # # ALL POEMS # txt_fn = datetime.datetime.now().strftime('%Y-%m-%d_%H')+"_poetryFoundation_generatedPOEMS_"+type_of_run+".html" # txt_fn_path = DATA_DIR+"generated/POEMS/"+txt_fn # f_txt=open(txt_fn_path,'w') # f_txt.write(ALL_poems+"</hmtl>") # f_txt.close(); # print "\nTXT file created at:",txt_fn_path else: "~~~~~~~~~~~~~~~~!!!!!!!!!! EMPTY response:", author
VBN verb, past participle
VBP verb, sing. present, non-3d
VBZ verb, 3rd person sing. present
WDT wh-determiner
WP wh-pronoun
WP$ possessive wh-pronoun
WRB wh-adverb"""

# Build a lookup from Penn Treebank tag to its plain-text description.
tag_to_text_dict = {}
for line in tag_to_text.split("\n"):
    line = line.strip()
    tag_to_text_dict[line.split()[0]] = " ".join(line.split()[1:])

sentence = input("Enter your sentence : ")
parts_of_speech = nltk.pos_tag(nltk.word_tokenize(sentence))
print("\nThe parts of speech in your sentence are : \n")
for tup in parts_of_speech:
    if tup[0] != '.' and tup[0] != ',':
        print(tup[0] + " : " + tag_to_text_dict[tup[1]])

print("\nEnter a word you would like synonyms and antonyms for : ")
word = Word(input())
synonyms = list(set(l.name() for syn in word.get_synsets() for l in syn.lemmas()))
antonyms = list(set(ant.name() for syn in word.get_synsets()
                    for l in syn.lemmas() for ant in l.antonyms()))
print("Synonyms : " + ", ".join(synonyms))
print("Antonyms : " + ", ".join(antonyms))
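NLTK also bundles these tag definitions: after nltk.download('tagsets'), nltk.help.upenn_tagset() prints the description of a Penn Treebank tag (or of every tag matching a regex such as 'NN.*'), which could stand in for the hand-built tag_to_text_dict. A small sketch; the example sentence is only illustrative.

import nltk  # requires the 'tagsets' data bundle: nltk.download('tagsets')

for word, tag in nltk.pos_tag(nltk.word_tokenize("The cat sat quietly.")):
    print(word, ":", tag)
    nltk.help.upenn_tagset(tag)   # prints the tagset entry for this tag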
import re

import nltk
import openpyxl

wb = openpyxl.load_workbook('training.xlsx')
ws = wb['training_set']  # get_sheet_by_name() is deprecated in current openpyxl

for i in range(2, 1785):
    essay = ws.cell(row=i, column=3).value
    # Keep letters only, lower-case, then tokenize and POS-tag the essay.
    letters_only = re.sub("[^a-zA-Z]", " ", essay)
    essay = letters_only.lower()
    tokens = nltk.word_tokenize(essay)
    tagged = nltk.pos_tag(tokens)
    # 'JJ' is the Penn Treebank tag for adjectives, so this counts adjectives per essay.
    adjectives = [word for word, pos in tagged if pos == 'JJ']
    ws.cell(row=i, column=10).value = len(adjectives)

wb.save('training.xlsx')
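The loop above counts adjectives in one spreadsheet column; the same idea generalises to a small helper that counts tokens whose tag starts with a given prefix. count_pos below is a hypothetical illustration, not part of the workbook script.

import re

import nltk

def count_pos(text, tag_prefix):
    """Count tokens whose Penn tag starts with tag_prefix ('JJ', 'NN', 'VB', ...)."""
    letters_only = re.sub("[^a-zA-Z]", " ", text).lower()
    tagged = nltk.pos_tag(nltk.word_tokenize(letters_only))
    return sum(1 for _, pos in tagged if pos.startswith(tag_prefix))

print(count_pos("The quick brown fox jumps over the lazy dog", "JJ"))  # adjective count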
        abnormal_lst.add(m.group(0))
        for i in range(1, 4):
            split_lst.append(m.group(i))

    for t in token_sets[j]:
        if t not in abnormal_lst:
            all_tokens.append(t)
        else:
            all_tokens.extend(split_lst)
    token_sets[j] = all_tokens

    for i in range(len(token_sets[j])):
        if token_sets[j][i] not in Specific_NN:
            token_sets[j][i] = token_sets[j][i].lower()

pos_tagged_tokens = [nltk.pos_tag(ts) for ts in token_sets]
for i in range(len(pos_tagged_tokens)):
    for j in range(len(pos_tagged_tokens[i])):
        if pos_tagged_tokens[i][j][0] in words2pos:
            pos_tagged_tokens[i][j] = list(pos_tagged_tokens[i][j])
            pos_tagged_tokens[i][j][1] = pos_tagged_tokens[i][j][0]
            pos_tagged_tokens[i][j] = tuple(pos_tagged_tokens[i][j])

interaction_collections = []

# MD + TO + VB + NN
for ts in pos_tagged_tokens:
    target_pos = ['MD', 'TO', 'VB', 'NN']
    interactions = extract_simple_sequences(ts, target_pos)
    if len(interactions) > 0:
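The fragment calls extract_simple_sequences, whose definition is not shown here. A plausible sketch of such a helper, under the assumption that it collects contiguous runs of tagged tokens whose tags match target_pos in order:

def extract_simple_sequences(tagged_tokens, target_pos):
    """Hypothetical sketch: return word runs whose tags equal target_pos, in order."""
    sequences = []
    n = len(target_pos)
    for start in range(len(tagged_tokens) - n + 1):
        window = tagged_tokens[start:start + n]
        if [tag for _, tag in window] == target_pos:
            sequences.append([word for word, _ in window])
    return sequences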
def text_tokens_2(
        text,
        lower_bound_percentage=0,
        higher_bound_percentage=1,
        minimal_word_length=0,
        remove_punctuations=False,
        remove_non_letter_characters=False,
        lemmatize_the_words=False,
        stemmer_the_words=False,
        part_of_speech_filter=False,
        english_text_filter=False,
        stop_words_filter=False,
        other_words_filter=False,
        remove_adjacent_tokens=False,
        tokens_form=True,
        stop_words=stop_words,
        some_other_words=some_other_words):
    text = text.lower()
    if remove_punctuations:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_non_letter_characters:
        text = re.sub(r'[^a-zA-Z]', " ", text)

    tokens = nltk.word_tokenize(text)
    howmany_tokens = len(tokens)
    tokens = tokens[int(howmany_tokens * lower_bound_percentage):
                    int(ceil(howmany_tokens * higher_bound_percentage))]

    if part_of_speech_filter:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word for (word, pos) in token_pos
                  if pos.startswith('N') or pos.startswith('J')]
    if english_text_filter:
        tokens = [token for token in tokens if token in Englishtext]
    if lemmatize_the_words:
        tokens = [lemmatizer().lemmatize(token) for token in tokens]
        stop_words = set([lemmatizer().lemmatize(word) for word in stopwords.words('english')])
        some_other_words = set([lemmatizer().lemmatize(word) for word in some_other_words])
    if stemmer_the_words:
        tokens = [SnowballStemmer_().stem(token) for token in tokens]
        stop_words = set([SnowballStemmer_().stem(word) for word in stopwords.words('english')])
        some_other_words = set([SnowballStemmer_().stem(word) for word in some_other_words])

    tokens = [token for token in tokens if len(token) >= minimal_word_length]
    if other_words_filter:
        tokens = [token for token in tokens if token not in some_other_words]

    p = nltk.pos_tag(tokens)
    grammar = r"""
    NP: {(<DT>|<JJ>*)<NN.*>+(<CC><NN.*>+)?}  # noun phrase chunks
    VP: {<TO>?<VB.*>}                        # verb phrase chunks
    PP: {<IN>}                               # prepositional phrase chunks
    CLAUSE: {<VP>?<NP>+}
    """
    cp = nltk.RegexpParser(grammar)
    if p:
        result = cp.parse(p)
        tree = result.subtrees()
        goodones = []
        badones = []
        for sub in tree:
            if sub.label() == 'CLAUSE':
                if len(list(sub)) >= 3:
                    goodones.append(sub)
                else:
                    badones.append(sub)
        tokens = []
        if goodones:
            for g in goodones:
                for w, po in g.leaves():
                    tokens.append(w)
        else:
            for b in badones:
                for w, po in b.leaves():
                    tokens.append(w)
        if stop_words_filter:
            tokens = [token for token in tokens if token not in stop_words]
        if remove_adjacent_tokens:
            remove_adjacent(tokens)
        if tokens_form:
            return tokens
        else:
            return ' '.join(tokens)
    else:
        return []
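The chunk grammar inside text_tokens_2 is easier to see in isolation. The demonstration below reuses the same patterns on one POS-tagged sentence; demo_grammar and the example sentence are illustrative additions, not part of the original module.

import nltk

demo_grammar = r"""
NP: {(<DT>|<JJ>*)<NN.*>+(<CC><NN.*>+)?}  # noun phrase chunks
VP: {<TO>?<VB.*>}                        # verb phrase chunks
PP: {<IN>}                               # prepositional phrase chunks
CLAUSE: {<VP>?<NP>+}
"""
tagged = nltk.pos_tag(nltk.word_tokenize("The engineer wrote unit tests for the parser."))
tree = nltk.RegexpParser(demo_grammar).parse(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() == 'CLAUSE'):
    print(subtree)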
    print(lStem.stem(str(i)))

# SnowballStemmer
sStem = SnowballStemmer('english')
print("SnowBall Stemming : \n")
for i in tokens[0:50]:
    print(sStem.stem(str(i)))

# PorterStemmer
pStem = PorterStemmer()
print("Porter Stemming : \n")
for i in tokens[0:50]:
    print(pStem.stem(str(i)))

# POS-tagging
print("Part of Speech Tagging :\n", pos_tag(word_tokenize(text)))

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatization :\n")
for tok in tokens[0:50]:
    print(lemmatizer.lemmatize(str(tok)))

# Trigram
print("Trigrams :\n")
# Word-level trigrams over the first 20 tokens (running ngrams on each token
# string would yield character trigrams instead).
trigram = list(ngrams(tokens[0:20], 3))
print(trigram)

# Named Entity Recognition
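The snippet is truncated at the NER heading and its original continuation is not shown. A typical NLTK follow-up (an assumption, reusing the text, word_tokenize and pos_tag names already in scope) would chunk the tagged sentence with ne_chunk:

from nltk import ne_chunk  # needs the 'maxent_ne_chunker' and 'words' resources

ner_tree = ne_chunk(pos_tag(word_tokenize(text)))
print("Named Entity Recognition :\n", ner_tree)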
def text_tokens(
        text,
        lower_bound_percentage=0,
        higher_bound_percentage=1,
        minimal_word_length=0,
        lower_case=False,
        remove_punctuations=False,
        remove_non_letter_characters=False,
        lemmatize_the_words=False,
        stemmer_the_words=False,
        add_pos_feature=False,
        url_filter=False,
        parentheses_filter=False,
        prime_s_filter=False,
        number_filter=False,
        part_of_speech_filter=False,
        english_text_filter=False,
        stop_words_filter=False,
        other_words_filter=False,
        remove_adjacent_tokens=False,
        tokens_form=True,
        stop_words=stop_words,
        some_other_words=some_other_words):
    if lower_case:
        text = text.lower()
    #Englishtext = set(w.lower() for w in W.words())
    text = re.sub(r'\n', "", text)
    if url_filter:
        url_pattern = re.compile(
            r'((http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-zA-Z0-9]+([\-\.]{1}[a-zA-Z0-9]+)*\.[a-zA-Z]{2,5}(:[0-9]{1,5})?(\/.*)?)'
        )
        text = re.sub(url_pattern, " ", text)
    if parentheses_filter:
        parentheses_pattern = re.compile(r'(\([^)]+\))')
        text = re.sub(parentheses_pattern, " ", text)
    if prime_s_filter:
        prime_s_pattern = r"('s|\?s)"
        text = re.sub(prime_s_pattern, "", text)
    if remove_punctuations:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_non_letter_characters:
        text = re.sub(r'[^a-zA-Z0-9]', " ", text)
    if number_filter:
        text = re.sub(r'[0-9]', " ", text)

    tokens = nltk.word_tokenize(text)
    howmany_tokens = len(tokens)
    if stop_words_filter:
        tokens = [token for token in tokens if token not in stop_words]
    tokens = tokens[int(howmany_tokens * lower_bound_percentage):
                    int(ceil(howmany_tokens * higher_bound_percentage))]

    if part_of_speech_filter:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word for (word, pos) in token_pos if pos.startswith('N')]
    if add_pos_feature:
        token_pos = nltk.pos_tag(tokens)
        tokens = [word + '_' + pos for (word, pos) in token_pos]
    if english_text_filter:
        if add_pos_feature:
            tokens = [token for token in tokens if token.split('_')[0] in Englishtext]
        else:
            tokens = [token for token in tokens if token in Englishtext]
    if lemmatize_the_words:
        if add_pos_feature:
            tokens = [
                lemmatizer().lemmatize(token.split('_')[0]) + '_' + token.split('_')[1]
                for token in tokens
            ]
        else:
            tokens = [lemmatizer().lemmatize(token) for token in tokens]
        #stop_words = set([lemmatizer().lemmatize(word) for word in stopwords.words('english')])
        some_other_words = set([lemmatizer().lemmatize(word) for word in some_other_words])
    if stemmer_the_words:
        if add_pos_feature:
            tokens = [
                SnowballStemmer_().stem(token.split('_')[0]) + '_' + token.split('_')[1]
                for token in tokens
            ]
        else:
            tokens = [SnowballStemmer_().stem(token) for token in tokens]
        #stop_words = set([SnowballStemmer_().stem(word) for word in stopwords.words('english')])
        some_other_words = set([SnowballStemmer_().stem(word) for word in some_other_words])

    if add_pos_feature:
        tokens = [token for token in tokens if len(token.split('_')[0]) >= minimal_word_length]
    else:
        tokens = [token for token in tokens if len(token) >= minimal_word_length]
    if other_words_filter:
        if add_pos_feature:
            tokens = [token for token in tokens if token.split('_')[0] not in some_other_words]
        else:
            tokens = [token for token in tokens if token not in some_other_words]
    if remove_adjacent_tokens:
        remove_adjacent(tokens)
    if tokens_form:
        return tokens
    else:
        return ' '.join(tokens)
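A usage sketch for text_tokens; the sample string and flag choices are illustrative, and the call assumes the module-level names the function relies on (stop_words, some_other_words, Englishtext, lemmatizer, SnowballStemmer_, remove_adjacent) are defined elsewhere in this module.

sample = "Visit https://example.com (the demo site) to see NLTK's tagger in 2024."
print(text_tokens(
    sample,
    lower_case=True,
    url_filter=True,
    parentheses_filter=True,
    number_filter=True,
    remove_punctuations=True,
    part_of_speech_filter=True,   # keep only tokens tagged as nouns
    stop_words_filter=True,
))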
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# ===== POS Tagging and NER using NLTK =====
sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and
Professor Menahem Ben-Sasson, President of HUJ signed the joint degree
agreement at NUS, in the presence of Ambassador of Israel to Singapore
Her Excellency Amira Arnon and about 30 invited guests, on Sept 25, 2013.
'''

# The input for the POS tagger needs to be tokenized first.
sent_pos = pos_tag(word_tokenize(sent))
sent_pos

# ===== NER using NLTK =====
# The input for the NE chunker needs to have POS tags.
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)

# ===== Now try creating your own named entity and noun phrase chunker =====
# We need to define the tag patterns to capture the target phrases and use
# RegexpParser to chunk the input with those patterns.
# Some minimal tag patterns are given here.
grammar = r"""
NE: {<NNP>+(<IN|CC|TO><NNP>)*}   # chunk sequences of proper nouns
NP: {<DT|CD><JJ>?<NNS|NN>}
"""
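The snippet stops right after defining the custom patterns; following its own comments, one way to finish the exercise is to chunk the tagged sentence with nltk.RegexpParser (the variable names below are illustrative):

cp = nltk.RegexpParser(grammar)
custom_chunks = cp.parse(sent_pos)
print(custom_chunks)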