def make_pos(target_tag, edit_rev):
    tags, srcs, dsts = edit_rev  # target_tag: a tag that appears in the sentence
    # Before POS tagging, strip the positions where delete/add tags occur from the sentence
    if target_tag == del_tag:
        sentence = dsts
    elif target_tag == add_tag:
        sentence = srcs
    if target_tag in tags:
        tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
        trimmed = sentence
        for tag_index in tag_indexes:
            trimmed = trimmed[:tag_index] + trimmed[tag_index+1:]
        posed = pos_tag(trimmed)
        pos = [w[1] for w in posed]
        for tag_index in tag_indexes:
            pos.insert(tag_index, u'')
        # debug
        none_indexes = [i for i, x in enumerate(pos) if x == u'']
        if tag_indexes != none_indexes:
            print >>sys.stderr, tag_indexes
            print >>sys.stderr, none_indexes
            print >>sys.stderr, tags
            print >>sys.stderr, pos
    else:
        posed = pos_tag(u' '.join(sentence).split())
        pos = [w[1] for w in posed]
    return pos
def number_of_exact_word_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a
                if token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b
                if token.lower().strip(punctuation) not in stop_words]
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
def number_of_noun_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a
                if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b
                if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    # Calculate Jaccard similarity
    #ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
    #return (ratio > 0.66)
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
def _process_simpleHash(self, simpleHash):
    # Extract entities from keys resulting from SimpleExtractor process_*
    entityHash = {}
    for data in simpleHash:
        occs = simpleHash[data]['occurences']
        proxLoc = simpleHash[data]['proxLoc']
        # Tokenize sentences
        for sent in tokenize_sentences(data):
            # Tokenize words
            tokens = tokenize_words(sent)
            # Tag words with Parts of Speech
            tagged = pos_tag(tokens)
            # Identify named entities
            entities = ne_chunk(tagged)
            for ent in entities:
                if isinstance(ent, NLTKParseTree):
                    # Is it a wanted type?
                    if ent.node in self.types:
                        # Should we keep the PoS tag?
                        if self.keepPos:
                            txts = ['/'.join(token) for token in ent.leaves()]
                        else:
                            txts = [token[0] for token in ent.leaves()]
                        txt = ' '.join(txts)
                        new = {txt: {'text': txt,
                                     'occurences': occs,
                                     'proxLoc': proxLoc[:]}}
                        entityHash = self._mergeHash(entityHash, new)
    return entityHash
def parse(body):
    contents = []
    if isinstance(body, basestring):
        contents.append(body)
    else:
        contents = body
    sentences = []
    for content in contents:
        sentences.extend([sentence for sentence in sent_tokenize(content)
                          if not str_helper.hasHTMLTag(sentence)])
    stop = stopword.get_stopwords()
    tokens = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop and not str_helper.hasNumbers(word) and not str_helper.hasPunctuation(word):
                word = stem.stemming(word)
                tokens.setdefault(word, 0)
                tokens[word] += 1
    wp = pos_tag(tokens.keys())
    words = [row[0] for row in wp]
    tags = [row[1] for row in wp]
    return words, tags
def extract_onlynouns(tokens):
    out = list()
    for token in tokens:
        pos = pos_tag(nltk.word_tokenize(token.lower()))[0][1]
        if (pos == "NN") or (pos == "NNP"):
            out.append(token)
    return out
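# A minimal usage sketch, assuming the extract_onlynouns function above is in scope and NLTK's
# punkt / averaged_perceptron_tagger data are installed. Because each token is lowercased and
# tagged in isolation, the exact output depends on the tagger model.
import nltk
from nltk.tag import pos_tag

print(extract_onlynouns(["Paris", "runs", "quickly", "dog"]))  # keeps only tokens tagged NN or NNP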
def get_final_query_with_tokens(self, query):
    """
    Get final query with keywords selected and processed

    Parameters
    ----------
    query : string, mandatory
        The initial query
    """
    result = {}
    result['query'] = query
    words = pos_tag(nltk.word_tokenize(query))
    #grammar = "NP: {<DT>?<JJ>*<NN>}"
    #cp = nltk.RegexpParser(grammar)
    #tree = cp.parse(words);
    #print(tree)
    #NPs = list(tree.subtrees(filter=lambda x: x.label()=='NP' or x.label().startswith('NN') or x.label()=='WP'))
    #print [' '.join(NP.leaves()[0]) for NP in NPs]
    #print [(word, pos) for word, pos in words if pos.startswith('NN') or pos == 'WP']
    result['select'] = self.get_select_tokens(words)
    # return query and required tokens
    return result

#qs = QueryTokenizer()
#inputs = ['s', 'what type of persons travels ?', 'who are travelling ?', 'what is the tour schedule ?', 'what is the cost for a tour?']
#for q in inputs:
#    print qs.get_final_query_with_tokens(q)
def main():
    wsjsubset = open("../corpus/wsjsubset", 'r').readlines()
    genia = open("../corpus/genia", 'r').readlines()
    words = []
    postag = []
    chunktag = []
    for line in wsjsubset:
        if len(line.split()) > 0:
            words.append(line.split()[0])
            postag.append(line.split()[1])
            chunktag.append(line.split()[2])
    postag_nltk = pos_tag(words)
    hits = 0
    fails = {}
    for i in xrange(len(postag_nltk)):
        if postag_nltk[i][1] == postag[i]:
            hits += 1
        else:
            fails[(postag[i], postag_nltk[i][1])] = fails.get((postag[i], postag_nltk[i][1]), 0) + 1
    accuracy = hits / float(len(postag))
    for fail in fails:
        fails[fail] = fails[fail] / float(len(postag) - hits)
    for key, value in fails.iteritems():
        print value, key
def extract_entities(words):
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        if hasattr(chunk, 'node'):
            performer = ' '.join(c[0] for c in chunk.leaves())
            entities.append(performer.lower())
    return entities
def tagging_ranks(theList):
    importantWords = []
    upTo = len(theList)
    for i in range(0, upTo):
        print theList[i][0]
        poS = tag.pos_tag(tokenize.word_tokenize(theList[i][0]))
        print poS
        theList[i].append(poS[0][1])
        if theList[i][1] < 4:
            break
    print theList
    for words in theList:
        if len(words) > 2:
            if words[2].startswith(('JJ', 'NN', 'RB', 'VB')):
                importantWords.append(words)
    print importantWords
    return "Passed"
def process_raw_text(text):
    """ First some code to standardize the formatting, then basic nlp. """
    # Remove breaks and tabs
    for char in ["\t", "\n"]:
        text = text.replace(char, " ")
    text = text.replace('."', '".')
    text = text.replace(".'", "'.")
    # Split special characters from words
    for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
        text = text.replace(char, " " + char + " ")
    # Magic to remove all multi-spaces
    text = ' '.join(text.split())
    # Get the words, sentences, POS tags, and chunks.
    chunks = [tuple([c.type for c in t.chunks]) for t in parsetree(text)]
    sentences = sent_tokenize(text)
    sentences = [word_tokenize(s) for s in sentences]
    sentences_tags = [tuple([(w, simplify_tag(t)) for w, t in pos_tag(s)]) for s in sentences]
    sentences = [tuple([w for w, _ in s]) for s in sentences_tags]
    tags = [tuple([t for _, t in s]) for s in sentences_tags]
    words = flatten(sentences)
    return tuple(words), tuple(sentences), tuple(tags), tuple(chunks)
def title_permutations(title_expanded):
    title_tagged = pos_tag(title_expanded.split())
    st = PorterStemmer()
    title_pos = [st.stem(word) for word, pos in title_tagged if pos != 'IN']
    title_perms = list(map("*".join, permutations(title_pos)))
    return title_perms
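# A minimal usage sketch, assuming title_permutations above is in scope, with nltk.pos_tag,
# PorterStemmer and itertools.permutations imported. Prepositions (IN) are dropped, the remaining
# words are stemmed, and every ordering is joined with '*'; exact tags are model-dependent.
from itertools import permutations
from nltk import pos_tag
from nltk.stem import PorterStemmer

print(title_permutations("chief of staff"))  # e.g. ['chief*staff', 'staff*chief']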
def processoFeatures(resposta):
    frases = tokenizerFrases.tokenize(resposta["corpo"])
    palavras = []
    palavrasTexto = {}
    for frase in frases:
        palavrasTemp = tokenizerPalavras.tokenize(frase)
        for palavra in palavrasTemp:
            palavras.append(palavra)  # collect the tokens so pos_tag below has input
            palavrasTexto[palavra] = True
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith("J"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith("V"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith("N"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith("R"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, "")
        if synsets is not None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (palavrasTexto, "positivo")
    elif negativo > positivo:
        return (palavrasTexto, "negativo")
    else:
        return (palavrasTexto, "neutro")
def GetContractPage(x):
    url = 'http://www.defense.gov/contracts/contract.aspx?contractid=%d' % x
    html = urllib.urlopen(url).read()
    if re.search("The Official Home of the Department of Defense", html):
        return
    soup = BeautifulSoup(html)
    p_tags = soup.findAll("p")
    p_tags_text_list = [tag.text for tag in p_tags]
    tokenized_list = []
    for text in p_tags_text_list:
        tokenized_list = tokenize.word_tokenize(text)
        tokenized_list.append(nltk_tag.pos_tag(tokenized_list))
    tagged_list = tokenized_list[-1]
    data = {"url": url}
    for token in tagged_list[1:]:
        if token[1] == "NNP":
            data['entity'] = token[0]
            break
    for token in tagged_list[1:]:
        if token[1] == "CD":
            data['Amount'] = token[0]
            break
    print data
def test_run():
    results = {}
    nouns = []
    product_list = {}
    for p in Post.query.all():
        tagged_sent = pos_tag(p.story.split())
        propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
        for n in propernouns:
            if n == "I’m" or n == "It’s" or n == "Can’t":
                continue
            results[n.replace('.', '')] = True
    for r in results.keys():
        nouns.append(r)
    for i in range(10):
        noun = random.choice(nouns)
        # print('Using "%s"', (noun,))
        for k in test_keywords:
            try:
                products = amazon.search(Keywords=noun, SearchIndex=k)
                for product in products:
                    product_list[product.title] = True
            except:
                continue
    for p in product_list.keys():
        print(" Found title: %s" % (p,))
def count_words_unigram_pos(input_filename, output_path=''):
    txt = get_file_text(input_filename)
    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    total_words = 0.
    matches = re.findall(word_regex, txt, re.M + re.S + re.U)
    for m in matches:
        word_frequency[m] = word_frequency.get(m, 0.) + 1.
        total_words += 1.
    sorted_words = sorted(word_frequency.iteritems(), key=operator.itemgetter(1))
    word_analysis = []
    for word in sorted_words:
        pos = pos_tag([word[0]])
        word_analysis.append([word[0], word[1], pos[0][1]])
    o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))
    o_file.close()
def generate_searches(posts):
    results = {}
    nouns = []
    for p in posts:
        tagged_sent = pos_tag(p.story.split())
        propernouns = []
        last_noun = False
        for word, pos in tagged_sent:
            if pos == 'NNP':
                if last_noun:
                    propernouns[-1] = propernouns[-1] + ' ' + word
                else:
                    propernouns.append(word)
                last_noun = True
            else:
                last_noun = False
        for n in propernouns:
            if n == "I’m" or n == "It’s" or n == "Can’t":
                continue
            results[n.replace('.', '')] = True
    for r in results.keys():
        nouns.append(r)
    return nouns
def lda_train(raw):
    stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    text_array = []
    for i in range(len(raw)):
        text = raw[i].lower()
        text = text.replace('\r\n', ' ')
        text = re.sub("[^a-z0-9]", " ", text)
        # Tokenization segments a document into its atomic elements.
        words = text.split()
        # Stop words
        # Certain parts of English speech, like (for, or) or the word "the", are meaningless to a topic model.
        # These terms are called stop words and need to be removed from our token list.
        words = [j for j in words if j not in stop]
        tokenized = nltk.word_tokenize(text)
        tagged_sent = pos_tag(words)
        words = [word for word, pos in tagged_sent if pos == 'NN']
        # Stemming words is another common NLP technique to reduce topically similar words to their root.
        # Stemming reduces those terms to their stem. This is important for topic modeling, which would
        # otherwise view those terms as separate entities and reduce their importance in the model.
        #words = [p_stemmer.stem(s) for s in words]
        text_array.append(words)
    dictionary = corpora.Dictionary(text_array)
    dictionary.save('dictionary.dic')
    corpus = [dictionary.doc2bow(text) for text in text_array]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=20)
    filename = 'finalized_model_15.sav'
    joblib.dump(ldamodel, filename)
    print(ldamodel.print_topics(num_topics=15, num_words=6))
    return ldamodel, dictionary
def filter_by_pos(sentence, pos):
    """Return the sentence keeping only words whose part of speech is in `pos`."""
    words_with_pos = pos_tag(word_tokenize(sentence))
    words_with_pos = filter(lambda word: word[1] in pos, words_with_pos)
    return ' '.join(map(lambda word_with_pos: word_with_pos[0], words_with_pos))
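# A minimal usage sketch, assuming filter_by_pos above is in scope and NLTK's punkt /
# averaged_perceptron_tagger data are available; which words survive depends on the tagger model.
from nltk import pos_tag, word_tokenize

accepted = {'NN', 'NNS', 'JJ'}  # keep common nouns and adjectives
print(filter_by_pos("The quick brown fox jumps over the lazy dog", accepted))
# e.g. "quick brown fox lazy dog"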
def keep_nouns(tf):
    n_tf = {}
    for k in tf:
        if pos_tag([k])[0][1].find('N') == 0:
            n_tf[k] = tf[k]
    return n_tf
def extract_tags(comment_file):
    result = {}
    fd = open(comment_file, "r")
    for s in fd:
        m = s.replace(",", ".").replace("and", ".").replace("or", ".").replace(":", ".").split(".")
        for f in m:
            d = wordpunct_tokenize(f)
            for index, t in enumerate(d):
                pos_str = ""
                if t in cellphone_attribute:
                    before = index - 10
                    if before < 0:
                        before = 0
                    end = index + 10
                    if end > len(d) - 1:
                        end = len(d) - 1
                    pos_result = pos_tag(d[before:end])
                    for pos_index, pos_sent in enumerate(pos_result):
                        seg_for_word = ""
                        adjust_word = ""
                        if pos_sent[1].find("JJ") != -1:
                            seg_for_word = ' '.join(d[index:pos_index + before + 1])
                            adjust_word = pos_sent[0]
                            if pos_index + before < index:
                                seg_for_word = ' '.join(d[pos_index + before:index + 1])
                            add_into_dict(result, t, adjust_word, seg_for_word)
    return result
def _get_nn(self, sentence):
    '''Get common nouns (words tagged NN) from the sentence.'''
    tagged_sent = pos_tag(sentence)
    propernouns = [word for word, pos in tagged_sent if pos == 'NN']
    regex = re.compile('[^a-zA-Z]')
    propernouns = [regex.sub('', i) for i in propernouns]
    return propernouns
def extractNounListFromTweetFile(filePath, fileName):
    fileTarget = open(filePath + "\\" + fileName, 'r')
    fileName = "processedTweets.txt"
    processedTweetsFileTarget = open(filePath + "\\" + fileName, 'w')
    allNouns = []
    i = 0
    for line in fileTarget:
        lineContent = line.split("\t")
        userId = lineContent[0]
        tweet = lineContent[1]
        #latitude = int(lineContent[2])
        #longitude = int(lineContent[3].strip("\n"))
        processedTweet, hashTags = preprocessTweet(tweet)
        taggedTweet = pos_tag(processedTweet.split())
        nounsInTweet = [word for word, pos in taggedTweet if pos == 'NNP' or pos == 'NN']
        allNouns.extend(nounsInTweet)
        processedTweetsFileTarget.write(str(i) + "\t" + userId + "\t" + processedTweet + "\t" +
                                        str(nounsInTweet) + "\t" + str(hashTags) + "\n")
        i += 1
        if i == 2000:
            break
    fileTarget.close()
    processedTweetsFileTarget.close()
    return allNouns
def extract(query):
    sentence = query
    tagged_sent = pos_tag(sentence.split())
    propernouns = [word for word, pos in tagged_sent if pos == 'NN']
    return propernouns

#extract("I want to buy a car and a dog and plane")
def extract_pos(tokens, simple=True):
    """
    Simple parts of speech are:
        VERB - verbs (all tenses and modes)
        NOUN - nouns (common and proper)
        PRON - pronouns
        ADJ  - adjectives
        ADV  - adverbs
        ADP  - adpositions (prepositions and postpositions)
        CONJ - conjunctions
        DET  - determiners
        NUM  - cardinal numbers
        PRT  - particles or other function words
        X    - other: foreign words, typos, abbreviations
        .    - punctuation

    :param tokens:
    :return:
    """
    tokens_pos = pos_tag(tokens)
    pos = [p for t, p in tokens_pos]
    if simple:
        # translate larger set of part of speech tags into small, simpler set
        pos_dict = nltk.tagset_mapping('en-ptb', 'universal')
        pos = [pos_dict[p] for p in pos]
    return pos
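# A minimal usage sketch, assuming extract_pos above is in scope; nltk.tagset_mapping maps
# Penn Treebank tags to the universal tagset. Exact tags depend on the tagger model.
import nltk
from nltk import pos_tag, word_tokenize

tokens = word_tokenize("The cat sat on the mat")
print(extract_pos(tokens))                 # e.g. ['DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN']
print(extract_pos(tokens, simple=False))   # e.g. ['DT', 'NN', 'VBD', 'IN', 'DT', 'NN']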
def analiseSentimento(resposta):
    texto = resposta['corpo']
    frases = sentencesTokenizer.tokenize(texto)
    palavras = []
    for frase in frases:
        palavras.extend(wordsTokenizer.tokenize(frase))
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith('J'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith('V'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith('N'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith('R'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, '')
        if synsets is not None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (resposta, 'positivo')
    elif negativo > positivo:
        return (resposta, 'negativo')
    else:
        return (resposta, 'neutro')
def __init__(self, sentence):
    assert type(sentence) == str
    assert len(sentence) > 0
    self.raw_text = sentence.strip()
    self.lower = sentence.lower()
    self.normalised = sentence.replace('"', '``').strip()
    self.tokens, self.postags = zip(*pos_tag(word_tokenize(sentence)))
def returnPOSTaggedWords(text):
    output = {"CC": 0, "CD": 0, "DT": 0, "EX": 0, "FW": 0, "IN": 0, "JJ": 0, "JJR": 0, "JJS": 0,
              "LS": 0, "MD": 0, "NN": 0, "NNP": 0, "NNPS": 0, "NNS": 0, "PDT": 0, "POS": 0,
              "PRP": 0, "PRP$": 0, "RB": 0, "RBR": 0, "RBS": 0, "RP": 0, "SYM": 0, "TO": 0,
              "UH": 0, "VB": 0, "VBD": 0, "VBG": 0, "VBN": 0, "VBP": 0, "VBZ": 0, "WDT": 0,
              "WP": 0, "WP$": 0, "WRB": 0, "#": 0, "$": 0, "''": 0, "(": 0, ")": 0, ",": 0,
              ".": 0, ":": 0, "-NONE-": 0, "``": 0}
    tokens = wordpunct_tokenize(text)
    tagged = pos_tag(tokens)
    for word, pos in tagged:
        output[pos] = output[pos] + 1
    return output
def find_catalog(product_name, catalog_of_products):
    temp_catalog = list(catalog_of_products)
    tagged_text = pos_tag(product_name.split())
    output = nltk.ne_chunk(tagged_text)
    for subtree in output.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leave in subtree.leaves():
            temp_catalog.append(leave[0])
    return temp_catalog
def get_mistake_nouns(self):
    mistakes_id = get_all_mistakes_id()
    for id in mistakes_id:
        phrase = get_mistake_noun(id)
        tagged_sent = pos_tag(phrase.split())
        nouns = [word for word, pos in tagged_sent if pos[0] == 'N']
        for noun in nouns:
            yield noun.strip('.')
# In[288]:

ps = PorterStemmer()  # initialize Porter Stemmer object
ps_stems = []
for w in test_post_words:
    ps_stems.append(ps.stem(w))
print(' '.join(ps_stems))  # add all the stemmed words to one string


# In[293]:

# parts of speech tagging
token_tag = pos_tag(test_post_words)
token_tag[:10]


# In[294]:

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
from nltk import word_tokenize
from nltk.tag import pos_tag

V = ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']
N = ['NN', 'NNS', 'NNP', 'NNPS']
ADV = ['RB', 'RBR', 'RBS']
ADJ = ['JJ', 'JJR', 'JJS']

wLen = []    # number of words
vLen = []    # number of verbs
advLen = []  # number of adverbs
adjLen = []  # number of adjectives
vLen, nLen, advLen, adjLen, wLen = ([] for i in range(5))

for fileid in newcorpus.fileids():
    tokens = word_tokenize(newcorpus.raw(fileid))
    words = [t for t in tokens if t.isalpha()]
    taggedW = pos_tag(words)
    verbs, nouns, advs, adjs = ([] for i in range(4))
    for (w, tag) in taggedW:
        if tag in V:
            verbs.append(w)
        elif tag in N:
            nouns.append(w)
        elif tag in ADV:
            advs.append(w)
        elif tag in ADJ:
            adjs.append(w)
    wLen.append(len(words))
    vLen.append(len(verbs))
    nLen.append(len(nouns))
    advLen.append(len(advs))
    adjLen.append(len(adjs))

plotData0 = [(wLen, vLen), (wLen, nLen), (wLen, adjLen)]
yaxisLabels = ['V x 1000', 'N x 1000', 'ADJ x 1000']
# Named Entity Recognition

# Importing libraries
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Importing the data
dataset = """Abraham Lincoln was an American statesman and lawyer who served as the 16th President of the United States"""

# Tokenization and POS tagging
dataset_tag = pos_tag(word_tokenize(dataset))

# Apply NER
dataset_ner = ne_chunk(dataset_tag)
print(dataset_ner)

# Tree Diagram
dataset_ner.draw()
# app = FlaskAPI(__name__)

f = open('about2.txt', 'r', errors='ignore')
raw = f.read()
# raw = raw.lower()
# nltk.download('punkt')  # first-time use only
# nltk.download('wordnet')  # first-time use only
# nltk.download('averaged_perceptron_tagger')  # once
sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
word_tokens = nltk.word_tokenize(raw)  # converts to list of words
propernouns = [word for word, pos in pos_tag(nltk.word_tokenize(raw)) if pos == 'NNP']
# print('propernouns')
# print(propernouns)
# print(word_tokens)
# new_word_tokens = []
# for w in word_tokens:
#     if w in propernouns:
#         new_word_tokens.append(w)
#     else:
#         new_word_tokens.append(w.lower())
# word_tokens = new_word_tokens
# print(word_tokens)
def _build_matrix(chunk=15):
    # daily_updown = DailyStock.objects.all().order_by("diff_yesterday")
    # daily_updown = daily_updown.values('year', 'month', 'date', 'diff_yesterday')
    #
    # minval = abs(min(map(itemgetter('diff_yesterday'), daily_updown)))
    # maxval = abs(max(map(itemgetter('diff_yesterday'), daily_updown)))
    # if minval > maxval:
    #     daily_updown = [dict(d, diff_yesterday=(
    #         d['diff_yesterday']) / minval) for d in daily_updown]
    # else:
    #     daily_updown = [dict(d, diff_yesterday=(
    #         d['diff_yesterday']) / maxval) for d in daily_updown]
    documents = []

    up_days = list(DailyStock.objects.order_by('diff_yesterday')[:chunk])
    down_days = list(DailyStock.objects.order_by('-diff_yesterday')[:chunk])
    max_diff = max(up_days[0].diff_yesterday, abs(down_days[0].diff_yesterday))
    daily_updown = up_days + down_days
    for daily in daily_updown:
        daily.diff_yesterday = daily.diff_yesterday / max_diff

    text_dict = {}
    days_text_list = []
    for daily in daily_updown:
        # date conversion
        print(daily)
        LINK = 'https://news.google.com/rss/search?q=samsung+electronics+when:{}-{:02d}-{:02d}&hl=en-US&gl=US&ceid=US:en'.format(
            daily.year, daily.month, daily.date,
        )
        print(LINK)
        xmldoc = minidom.parse(urllib.request.urlopen(LINK, timeout=10))
        itemlist = xmldoc.getElementsByTagName('item')
        today_text = ''
        # for newsitem in itemlist:
        doccount = 0
        for items in itemlist:
            if doccount > 10:
                break
            print(items)
            singlelink = items.getElementsByTagName(
                'link')[0].firstChild.nodeValue
            pubdate = items.getElementsByTagName(
                'pubDate')[0].firstChild.nodeValue
            date = date_parser.parse(pubdate).strftime("%Y%m%d")
            try:
                response = get(singlelink, timeout=10)
            except Exception as e:
                print('here')
                pass
            else:
                try:
                    extractor = Goose()
                    article = extractor.extract(raw_html=response.content)
                    text_str = article.cleaned_text
                    today_text += text_str
                    doccount += 1
                except TypeError:
                    print('this')
                    pass
        days_text_list.append(today_text)
        DailyDocument.objects.create(doc=today_text, is_up=daily.diff_yesterday > 0)

    scripts = days_text_list
    lemmatizer = WordNetLemmatizer()
    for sen in range(0, len(scripts)):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(scripts[sen]))
        # Remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Converting to Lowercase
        document = document.lower()
        # Lemmatization
        document = document.split()
        doc = pos_tag(document)
        final_doc = []
        for i in range(len(doc)):
            if doc[i][1] in ['NN', 'NNP', 'NNS', 'NNPS']:
                document[i] = lemmatizer.lemmatize(document[i], 'n')
                final_doc.append(document[i])
            elif doc[i][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
                document[i] = lemmatizer.lemmatize(document[i], 'v')
                final_doc.append(document[i])
            elif doc[i][1] in ['JJ', 'JJR', 'JJS']:
                document[i] = lemmatizer.lemmatize(document[i], 'a')
                final_doc.append(document[i])
            elif doc[i][1] in ['RB', 'RBR', 'RBS', 'RP']:
                document[i] = lemmatizer.lemmatize(document[i], 'r')
                final_doc.append(document[i])
            #else:
            #    final_doc.append(document[i])
        pre_document = ' '.join(final_doc)
        documents.append(pre_document)

    vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b', max_df=0.9)
    bag_of_words = vectorizer.fit_transform(documents)
    value_word = pd.DataFrame(bag_of_words.toarray()).mul(list(
        map(lambda d: d.diff_yesterday, daily_updown)), axis=0).sum(axis=0)
    string_word = vectorizer.get_feature_names()

    word_dict = {
        string_word[i]: value_word[i]
        for i in range(len(string_word))
    }
    with open('word_dict_pickle', 'wb') as f:
        pickle.dump(word_dict, f)
    return word_dict
# print(node_ids)

edge_id = 0
for s, t, l in js:
    try:
        gjson.setdefault('edges', []).append({'id': edge_id,
                                              'from_id': node_ids[s],
                                              'label': l,
                                              'trg_id': node_ids[t],
                                              'relxn': 'inferred'})
    except Exception as e:
        print(t, s, t)
        print(str(e))
    edge_id += 1

# print(
#     json.dumps(gjson, indent=2, sort_keys=False)
# )

# https://stackoverflow.com/questions/17966554/in-python-nltk-i-am-trying-to-get-parts-of-speech-of-a-word-by-using-pos-tag-bu
# https://becominghuman.ai/natural-language-processing-in-python-3-using-nltk-fd0ff4a0da9b
for t in all_nodes:
    gjson.setdefault('nodes', []).append({'id': node_ids[t],
                                          'term': t,
                                          'pos': pos_tag([t])[0][1]})

print(
    json.dumps(gjson, indent=2, sort_keys=False)
)
da['targetDescription_subjectivity'] = data['targetDescription'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
da['targetParagraphs_polarity'] = data['targetParagraphs'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
da['targetParagraphs_subjectivity'] = data['targetParagraphs'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
da['targetTitle_polarity'] = data['targetTitle'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
da['targetTitle_subjectivity'] = data['targetTitle'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)


# In[34]:

# count of each Penn Treebank POS tag per post text
pos_di = {}
tagdict = load('help/tagsets/upenn_tagset.pickle')
for pos in list(tagdict.keys()):
    pos_di[pos] = []
for snt in data['postText']:
    di = Counter([j for i, j in pos_tag(word_tokenize(snt))])
    for pos in list(tagdict.keys()):
        pos_di[pos].append(di[pos])
da = pd.concat([da, pd.DataFrame(pos_di)], axis=1)

# number of stop words
stp_wds = set(stopwords.words('english'))
da['postText_number_of_stop_words'] = data['postText'].apply(lambda x: len(stp_wds.intersection(word_tokenize(str(x)))))

# number of punctuations
da['postText_num_of_unique_punctuations'] = data['postText'].apply(lambda x: len(set(x).intersection(set(string.punctuation))))


# In[35]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import tree2conlltags
from pprint import pprint

# Read text file
text = open("Dataset/train.txt")  # change the path of train.txt / valid.txt / test.txt
text = text.read()

# Convert text to words
word_token = word_tokenize(text)

# Perform POS tagging
word_pos = pos_tag(word_token)

# Define pattern for noun-phrase chunks
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Chunk the POS-tagged words
cp = nltk.RegexpParser(pattern)
cs = cp.parse(word_pos)

# IOB tagging on the chunked POS tags
iob_tagged = tree2conlltags(cs)
print(iob_tagged)

NER_List = []
def adapted_lesk(word, sentence, context_window_size=3, pos=None):
    """Performs word sense disambiguation using the Adapted Lesk Algorithm,
    due to Banerjee and Pedersen.

    Arguments:
        *word* (str) -- the target word to be disambiguated
        *sentence* (str) -- the context in which the target word occurs
        *context_window_size* (int) -- the number of words from the left and right
            of the target word to be taken into analysis
        *pos* (str) -- the part of speech of the target word

    Returns:
        Synset type -- the WordNet sense of the disambiguated word
    """
    # Tokenize input sentence, remove punctuation and stopwords
    sentence = utils.remove_stopwords(utils.remove_punctuation(w_tok(sentence)))

    # Perform lemmatization on sentence
    lemmatizer = WordNetLemmatizer()
    tagged_sentence = pos_tag(sentence)
    sentence = [lemmatizer.lemmatize(tup[0], utils.get_wordnet_pos(tup[1])) for tup in tagged_sentence]

    # Perform lemmatization on target word
    if pos is None:
        tagged_word = pos_tag([word])
        word = lemmatizer.lemmatize(tagged_word[0][0], utils.get_wordnet_pos(tagged_word[0][1]))
        pos = utils.get_wordnet_pos(tagged_word[0][1])
    else:
        word = lemmatizer.lemmatize(word, pos)

    # Extract the context window from the sentence
    if word in sentence:
        word_index = sentence.index(word)
        if word_index - context_window_size < 0:
            window_words = sentence[0:word_index + context_window_size + 1]
        else:
            window_words = sentence[word_index - context_window_size:word_index + context_window_size + 1]

        # Take the Synsets of the target word
        senses = wn.synsets(word)
        best_sense = senses[0]
        best_score = 0
        for sense in senses:
            if sense.pos() == pos:
                # Only take the current sense into account if it is the correct pos
                score = 0
                for w in window_words:
                    if w != word:
                        w_senses = wn.synsets(w)
                        for w_sense in w_senses:
                            score += similarity(sense, w_sense, pos)
                if score > best_score:
                    best_score = score
                    best_sense = sense
    else:
        # If target word is not in context after lemmatizing, return first wordnet sense
        f = open('logs/guessed.txt', 'a')
        line = "word: " + word + " in sentence: " + ' '.join(sentence)
        f.write(line + '\n')
        f.close()
        return wn.synsets(word)[0]
    return best_sense
""" Created on Tue Apr 16 10:33:57 2019 @author: [email protected] Information extraction https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da """ import nltk from nltk.tokenize import word_tokenize, sent_tokenize from nltk.tag import pos_tag ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices' sent = word_tokenize(ex) sent = pos_tag(sent) #we get tuples of words with their respective pos tags. But the funny tihng with pos tags is that #they change for the same words depending on the occurance of the word in a sentence. #Chunking to extract ners "Lets create a chunk NP for extracting noun phrases" pattern = "NP:{<DT>?<JJ>*<NN>}" #lets test it cp = nltk.RegexpParser(pattern) cs = cp.parse(sent) print(cs)
def extract_NNPs(text):
    tagged = pos_tag(text.split())
    NNPs = [word for word, pos in tagged if pos == 'NNP']
    return NNPs
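# A minimal usage sketch, assuming extract_NNPs above is in scope and NLTK's
# averaged_perceptron_tagger data is installed; splitting on whitespace keeps punctuation attached.
from nltk import pos_tag

print(extract_NNPs("Barack Obama visited Berlin in July"))  # e.g. ['Barack', 'Obama', 'Berlin', 'July']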
def factSequenceAnalysis(path):
    tree = ET.parse(path)
    root = tree.getroot()
    yRange = 0
    #fil = open("output.txt", 'w')
    result = []
    f = 0
    old_rev = []
    count_rev = 0
    total_facts = []
    for rev in root.find(
            '{http://www.mediawiki.org/xml/export-0.10/}page').findall(
            '{http://www.mediawiki.org/xml/export-0.10/}revision'):
        count_rev += 1
        '''
        if(count_rev==3):
            break
        '''
        text = rev.find('{http://www.mediawiki.org/xml/export-0.10/}text').text
        if (not text):
            total_facts.append(0)
            continue
        tags = ["NNP", "NNPS"]
        tagged_sent = pos_tag(word_tokenize(text))
        #fil.write("TimeStamp:" + rev.find("{http://www.mediawiki.org/xml/export-0.10/}timestamp").text + "\n")
        current_rev = []
        if (f == 0):
            yRev = []
            count_y = 1
            for tagged_word in tagged_sent:
                if hasNumbers(tagged_word[0]) == False and hasPunctuations(
                        tagged_word[0]) == False and len(
                        tagged_word[0]) > 1:  # to remove words like ",", "132" etc.
                    if (tagged_word[1] in tags):
                        #fil.write(tagged_word[0] + " , ")
                        old_rev.append(str(tagged_word[0]))
                        yRev.append(count_y)
                        count_y += 1
            #fil.write("\n=====================================================================\n")
            #print(old_rev)
            total_facts.append(len(old_rev))
            if (len(old_rev) > yRange):
                yRange = len(old_rev)
            result.append(yRev)
            f = 1
        else:
            for tagged_word in tagged_sent:
                if hasNumbers(tagged_word[0]) == False and hasPunctuations(
                        tagged_word[0]) == False and len(
                        tagged_word[0]) > 1:  # to remove words like ",", "132" etc.
                    if (tagged_word[1] in tags):
                        #fil.write(tagged_word[0] + " , ")
                        current_rev.append(str(tagged_word[0]))
            new_list = find_diff(old_rev, current_rev)
            total_facts.append(len(current_rev))
            #print(current_rev)
            yRev = new_list[0]
            if (len(current_rev) > yRange):
                yRange = len(current_rev)
            result.append(yRev)
            old_rev = current_rev
            #print("one revision completed!!")
            #fil.write("\n=====================================================================\n")
    result.append(total_facts)
    result.append(count_rev)
    return result
review += arr.strip()  # adds it to var review
arr = fileReview.read()
fileReview.close()

print "Sentence tokenization..."
review_dict = sent_tokenize(review)  # tokenizes sentences
arr_pos = []
removed = []

print "POS tagging for words..."
#arr_sent = pos_tag(review_dict)  # tagging words for semantic annotation
for sent in review_dict:  # adding individual sentences after tagging
    arr_pos.extend([pos_tag(sent.split())].__iter__())

################################################################################
################################################################################

print "Loading Parser..."
#t = npc.parse(tmp_arr_pos[0])
print "Finished loading..."
#print len(t)
#t.draw()
#help(t)

sentCount = 1
sentScore = []  # tuple with (Subj-Obj, Verb-P, )
totalS = []
swlist = stopwords.words('english')
stemmer = PorterStemmer()
pos_corpus = []
neg_corpus = []

for f in pos:
    # match any special character and remove it (other than _)
    processed_content = re.sub(r'\W+', ' ', f.lower())
    # text into tokens
    words = word_tokenize(processed_content)
    # Attaching part of speech to each word
    pos_words = pos_tag(words)  # returns a list
    clean_words = []
    for w in pos_words:
        if w[0] in swlist or len(w[0]) <= 3 or w[1] not in (
                'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'ADJ', 'ADV', 'VBN', 'VBG'):
            continue
        clean_words.append(stemmer.stem(w[0]))
    pos_content = ' '.join(clean_words)
    pos_corpus.append(pos_content)

for f in neg:
    processed_content = re.sub(r'\W+', ' ', f.lower())
    words = word_tokenize(processed_content)
    neg_words = pos_tag(words)
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

sentence = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
print(ne_tree)

ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(ex)
sent

pattern = 'NP: {<DT>?<JJ>*<NN>}'

import shanepy
import shanepy as spy
from shanepy import *

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# Stemming
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

print("\n\n")
print("STEMMING")
for WORD in WORDS:
    stemmer = PorterStemmer()
    print(WORD, stemmer.stem(WORD))

# POS
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

print("\n\n")
print("POS-TAG")
for WORD in WORDS:
    print(WORD, pos_tag([WORD]))  # wrap the word in a list; pos_tag on a bare string tags each character

# LEMMATIZATION
from nltk.stem import WordNetLemmatizer

print("\n\n")
print("LEMMATIZATION")
lemmatizer = WordNetLemmatizer()
for WORD in WORDS:
    print(WORD, lemmatizer.lemmatize(WORD))

# TRIGRAM
from nltk import word_tokenize
from nltk.util import ngrams

print("\n\n")
print("TRIGRAM")
input_list = []