def demo():
    # split a paragraph into sentences using the Punkt sentence tokenizer
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)  # `paragraphs` holds the raw input text

    # split a sentence into tokens (words + punctuation)
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    #  'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    PunktWordTokenizer().tokenize(s)
    # ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    #  'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    #  (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    # split the paragraph into sentences
    nltk.sent_tokenize(s)
    # split a sentence into words and punctuation
    nltk.word_tokenize(s)
    # POS tagging
    nltk.pos_tag(nltk.word_tokenize(s))
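# NOTE: PunktWordTokenizer was removed from NLTK's public API in 3.0, so the
# middle of the demo above no longer runs on current NLTK. A rough modern
# equivalent (an assumption, not part of the original demo; it presumes
# NLTK >= 3.0 with the punkt model downloaded) is sketched below.
# TreebankWordTokenizer keeps "$3.88" together much like PunktWordTokenizer did.
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

def demo_modern():
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    tokens = TreebankWordTokenizer().tokenize(s)
    # span_tokenize yields (start, end) character offsets into the original string
    spans = list(WordPunctTokenizer().span_tokenize(s))
    print(tokens)
    print(spans)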
def __get_extra_wiki_description(mesh_text, wiki_text, tfidf):
    mesh_sents = sent_tokenize(mesh_text)
    wiki_sents = sent_tokenize(wiki_text)
    mesh_tfidf_list = __sentences_to_tfidf_vecs(mesh_sents, tfidf)
    wiki_tfidf_list = __sentences_to_tfidf_vecs(wiki_sents, tfidf)
    extra_description = ''
    for i, wiki_tfidf_vec in enumerate(wiki_tfidf_list):
        have_similar = False
        for j, mesh_tfidf_vec in enumerate(mesh_tfidf_list):
            sim_val = tfidf.sim(wiki_tfidf_vec, mesh_tfidf_vec)
            if sim_val > 0.95:
                # print sim_val, 'SIMILAR:'
                # print mesh_sents[j]
                # print wiki_sents[i]
                have_similar = True
                break
        if not have_similar:
            extra_description += ' ' + wiki_sents[i]
    if len(extra_description) > 1:
        extra_description = extra_description[1:]  # drop the leading space
        if extra_description[-1].isalpha():
            extra_description += '.'
        elif extra_description[-1] == ':':
            extra_description = extra_description[:-1] + '.'
        return extra_description
    return ''
def postroot():
    if 'text' in request.forms:
        text = request.forms['text']
        sentences = sent_tokenize(text)
        result = " ".join(w + '/' + t
                          for s in sentences
                          for (w, t) in pos_tag(word_tokenize(s)))
    else:
        text = 'Type your text here'
        result = ''
    return template("""
<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>My Part of Speech Tagger</title>
  </head>
  <body>
    <h1>My Part of Speech Tagger</h1>
    <p>Type or paste your text below</p>
    <form method="post">
      <textarea name="text" rows="10" cols="50">{{text}}</textarea>
      <input type="submit"/>
    </form>
    <hr>
    <p>The tagged text is</p>
    <p>{{tagged}}</p>
  </body>
</html>
""", text=text, tagged=result)
def cosineReadable(sentences):
    # FIRST CHECK - we need at least 3 sentences for this method to be worth it
    if len(nltk.sent_tokenize(sentences)) <= 2:
        return sentences
    # we have enough sentences to do a readability overhaul
    wordDimensions = []  # this gives every word an assigned dimension in the vector
    for sent in nltk.sent_tokenize(sentences):
        for word in nltk.word_tokenize(sent):
            if word not in wordDimensions:  # no duplicates
                wordDimensions.append(word)
    sentlist = nltk.sent_tokenize(sentences)
    sentenceVectors = []  # a vector for every sentence in the summary
    for i in range(len(sentlist)):  # turn every sentence into a vector
        vec = makeSentVector(sentlist[i], wordDimensions)
        sentenceVectors.append(vec)
    # cosine distance of each sentence from the first sentence
    sentScores = {}
    firstSentVec = sentenceVectors[0]
    for x in range(1, len(sentlist)):
        sent = sentlist[x]
        sentScores[sent] = spatial.distance.cosine(firstSentVec, sentenceVectors[x])
    ranked = sorted(sentScores, reverse=True, key=sentScores.get)
    summary = str(sentlist[0]) + "\n"
    for otherSent in ranked:
        summary += str(otherSent).strip() + "\n"
    return summary.strip()
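# NOTE: `makeSentVector` is defined elsewhere in this project. A minimal sketch
# of what it presumably does -- a bag-of-words count vector over the
# `wordDimensions` vocabulary (an assumption, not the original author's code):
import nltk

def makeSentVector(sentence, wordDimensions):
    counts = [0] * len(wordDimensions)
    for word in nltk.word_tokenize(sentence):
        if word in wordDimensions:
            counts[wordDimensions.index(word)] += 1
    return counts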
def get_summaries_and_articles(coll):
    '''
    INPUT: mongo collection object
    OUTPUT: list of summaries, list of articles

    Runs through the MongoDB and extracts all of the newser.com summaries
    with their corresponding articles.
    '''
    summary_list = []
    article_list = []
    for doc in list(coll.find()):
        if doc['full_text'] != ' ':
            summary_list.append(doc['summary'])
            article_list.append(doc['full_text'])
    # each full_text is stored as a list of chunks; join them into one string
    for i in xrange(len(article_list)):
        text = ''
        for article in article_list[i]:
            text += article
        article_list[i] = text
    summary_test = np.unique([summary_list[i] for i in xrange(len(summary_list))
                              if article_list[i] != '' and article_list[i] != ' '
                              and len(sent_tokenize(article_list[i])) > 10])
    # the original tested `article_list[i] != ' '` here, reusing a stale index
    article_test = np.unique([article for article in article_list
                              if article != '' and article != ' '
                              and len(sent_tokenize(article)) > 10])
    return summary_test, article_test
def print_summary(indexes, doc, extract_n, doc_index):
    if len(indexes) < extract_n:
        extract_n = len(indexes)

    reference = "reference/task" + str(doc_index) + "_englishReference" + str(doc_index) + ".txt"
    reference_output = io.open(reference, "w", encoding='utf8')
    tips = sent_tokenize(doc.tip)
    for tip in tips:
        reference_output.write(tip + "\n")
    reference_output.close()

    sentences = sent_tokenize(doc.review)
    system = "system/task" + str(doc_index) + "_englishSyssum" + str(doc_index) + ".txt"
    system_output = io.open(system, "w", encoding='utf8')
    for i in range(0, extract_n):
        system_output.write(sentences[indexes[i]] + "\n")
    system_output.close()
def refineText(infp, outfp):
    stringlist = []
    textline = ""
    size = ""
    for line in infp:
        current = line.strip().replace('  ', ' ')
        if current.startswith("<size>"):
            # a new font size marks a paragraph boundary: flush the buffer
            if current != size and size != "":
                for sentence in nltk.sent_tokenize(''.join(stringlist)):
                    for token in MyTokenizer().tokenize(sentence):
                        token = token.replace("“", "").replace("”", "")
                        outfp.write(token + " ")
                    outfp.write('\n')
                stringlist = []
                outfp.write('\n')
            stringlist.append(textline)
            size = current
        elif current == '':
            continue
        elif current[-1] == '-':
            textline = current[0:-1]  # undo end-of-line hyphenation
        else:
            textline = current + ' '
    # flush whatever is left in the buffer
    for sentence in nltk.sent_tokenize(''.join(stringlist)):
        for token in MyTokenizer().tokenize(sentence):
            token = token.replace("“", "").replace("”", "")
            outfp.write(token + " ")
        outfp.write('\n')
def readD(txtdoc):
    import os, nltk
    # find basename
    base = os.path.basename(txtdoc)
    # read file
    with open(txtdoc, "r") as myfile:
        text = myfile.readlines()
    # extract relevant text from dataset and write document
    f = open(base + ".ready", "w")
    a = 0  # counts loops
    for line in text:
        if line.startswith("<bestanswer>"):
            cleansentence = (line[12:-13].replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
        if line.startswith("<answer_item>"):
            cleansentence = (line[13:-14].replace("\n", " ").replace(";", ".")
                             .replace("<br />\n", "").replace("\n", " ")
                             .replace("...", ".").replace("<", " ")
                             .replace("<.br />.", ""))
            # split line into sentences
            sentences = nltk.sent_tokenize(cleansentence)
            s = len(sentences)
            # write into document
            x = 0
            while x < (s - 1):
                f.write(sentences[x] + "\n")
                a += 1
                x += 1
            f.write(sentences[s - 1])
            a += 1
            print(str(a), end='\r')
    f.close()  # the original had `f.close` without parentheses, which is a no-op
def print_instance(relations, finlist, is_train):
    # flatten the word tokens of all sentences in each argument span
    # (on Python 3 this needs `from functools import reduce`)
    arg1 = reduce(lambda x, y: x + y,
                  [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[0])])
    arg2 = reduce(lambda x, y: x + y,
                  [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[1])])
    if len(relations) > 1:
        return
    # if is_train:
    for relation in relations:
        fw.write(json.dumps({'Arg1': arg1, 'Arg2': arg2, 'Sense': relation}) + '\n')
def percentage_long_sent(text):
    long_sentence = 0
    sentence_list = nltk.sent_tokenize(text)
    sentence_all = len(sentence_list)
    for sentence in sentence_list:
        word_count = len(nltk.word_tokenize(sentence))
        if word_count > 15:
            long_sentence += 1
    return long_sentence / sentence_all
def create_summary(text):
    text = re.sub(r'\s\s+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    # keep 3 sentences for short texts, 2 otherwise
    num = 3 if len(sentences) < 10 else 2
    summarizer = SimpleSummarizer()
    return nltk.sent_tokenize(summarizer.summarize(text, num))
def featurize():
    n = 100  # number of articles per topic
    employer = request.form['user_input']
    ftopic = df[df['company'] == employer].head(n)

    # the original duplicated this block for 'pros' and 'cons'; factored out here
    def make_cloud(column, suffix):
        text = " ".join(list(ftopic[column].values))
        text = re.sub(r'[^\w\s]+', ' ', text).replace('\n', ' ')
        # tokenize into words
        tokens = [word.lower() for sent in sent_tokenize(text)
                  for word in word_tokenize(sent)]
        # remove stopwords, plus some extra stop words not present in NLTK's list
        stop = stopwords.words('english')
        stop += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one',
                 'two', 'buy', 'much', 'take', 'might', 'say', 'new', 'year',
                 'many', 'etc', 'll', 've']
        # the original used `stop += str(employer)`, which appends each
        # *character* of the name rather than the name itself
        stop.append(str(employer))
        tokens = [token for token in tokens if token not in stop]
        # drop single-character tokens
        tokens = [word for word in tokens if len(word) >= 2]
        joined = " ".join(tokens)
        wordcloud = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(joined)
        plt.figure(figsize=(50, 30))
        plt.imshow(wordcloud)
        plt.axis("off")
        name = 'static/' + str(employer) + suffix
        plt.savefig(name, bbox_inches='tight', transparent=True)
        return name

    name = make_cloud('pros', '-pros.png')
    name2 = make_cloud('cons', '-cons.png')
    return render_template('template_wordcloud.html',
                           pic_pro=name, pic_con=name2, employer=employer)
def main():
    tagged = getTagged(corpusdir)
    featureSet = [(getFeatures(feature), tag) for (feature, tag) in tagged]
    trainSet = featureSet[:]
    testSet = featureSet[:100]
    classifier = nltk.NaiveBayesClassifier.train(trainSet)
    fileList = os.listdir(corpusdir)
    visited = []
    for (stem, tag) in [(f[:-4], f[-3:]) for f in fileList]:
        if stem in visited:
            continue
        visited.append(stem)
        print stem
        # the original opened the _neg file twice; once is enough
        f_pos = open(corpusdir + "/" + stem + "_pos")
        f_neg = open(corpusdir + "/" + stem + "_neg")
        raw_pos, raw_neg = f_pos.read(), f_neg.read()
        sent_pos, sent_neg = sent_tokenize(raw_pos), sent_tokenize(raw_neg)
        f_pos.close()
        f_neg.close()
        falseNeg = falsePos = trueNeg = truePos = 0
        for sent in sent_pos:
            guess = classifier.classify(getFeatures(sent))
            if guess == "POS":
                truePos += 1
            else:
                falseNeg += 1
        for sent in sent_neg:
            guess = classifier.classify(getFeatures(sent))
            if guess == "NEG":
                trueNeg += 1
            else:
                falsePos += 1
        posTags = len(sent_pos)
        negTags = len(sent_neg)
        print "False negatives: %.2f%%" % (float(falseNeg) / posTags * 100)
        print "False positives: %.2f%%" % (float(falsePos) / negTags * 100)
        print ""
    print "Accuracy: %f" % nltk.classify.accuracy(classifier, testSet)
def _shuffle_text(self, text, times, label_func):
    from random import shuffle
    origin_sents = sent_tokenize(text)
    assert len(origin_sents) > 1
    sents = sent_tokenize(text)
    res = []
    for i in range(times):
        shuffle(sents)
        label = label_func(sents, origin_sents)
        res.append((' '.join(sents[:-1]), label))
    return res
def content(self, title, text):
    """Set the title and text of the content to be parsed."""
    self._title = title
    self._text = text
    self._sepText = text.split('\n')
    self._tokens = nltk.word_tokenize(self._text)  # not using regex for tokenization
    self._textSents = nltk.sent_tokenize(self._text)
    self._textSents = [s.strip() for s in self._textSents]  # strip all sentences
    self._sepTextSents = []
    for pp in self._sepText:
        self._sepTextSents.append(nltk.sent_tokenize(pp))
def get_crowdd500_data(set_type):
    """
    Returns documents and keywords in either train or test sets of
    Crowd500 [Marujo2012].
    """
    path = 'data/500N-KPCrowd-v1.1/CorpusAndCrowdsourcingAnnotations/' + set_type + '/'
    files = [f[:-4] for f in os.listdir(path) if re.search('\.key', f)]
    documents = []
    all_keywords = []
    if set_type == 'test':
        # scraped webpages in test set
        documents = pickle.load(open(path + 'scraped_testdata.pkl', 'rb'))
        # these webpages no longer exist, cannot find source text
        skip_these = [3, 7, 14, 19, 26, 27, 32, 33, 43, 45]
    for file_idx in xrange(len(files)):
        if set_type == 'train':
            # original text
            f = open(path + files[file_idx] + '.txt', 'r')
            text = f.read()
            f.close()
            # encoding issues in Crowd500
            try:
                text = text.encode('utf-8')
                sentences = nltk.sent_tokenize(text.lower())
            except:
                text = text.decode('utf-8')
                sentences = nltk.sent_tokenize(text.lower())
            documents.append(text)
            # keywords
            keywords = []
            with open(path + files[file_idx] + '.key', 'r') as f:
                for line in f:
                    keywords.append(line.strip('\n'))
            keywords = [remove_punctuation(k.lower()) for k in keywords]
            all_keywords.append(keywords)
        else:
            if file_idx not in skip_these:
                keywords = []
                with open(path + files[file_idx] + '.key', 'r') as f:
                    for line in f:
                        keywords.append(line.strip('\n'))
                keywords = [remove_punctuation(k.lower()) for k in keywords]
                all_keywords.append(keywords)
    return {'documents': documents, 'keywords': all_keywords}
def parse_and_tag(corpus):
    boring_tags = ['CC', 'DT', ',', 'IN', 'PRP', 'PRP$', 'VBZ', 'TO', 'POS',
                   ':', '(', ')', 'AT', '.', "''"]
    if isinstance(corpus, basestring):
        with lorecorpus.open(corpus) as fin:
            sents = nltk.sent_tokenize(fin.read().strip())
    else:
        sents = nltk.sent_tokenize(corpus.raw().strip())
    tagged_text_unmerged = [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in sents]
    # make a list of tuples, not a list of lists of tuples
    tagged_text = [item for sublist in tagged_text_unmerged for item in sublist]
    all_word_tuples = [(word[0].lower(), word[1]) for word in tagged_text
                       if word[1] not in boring_tags]
    # turn the tuples into lists
    all_word_lists = [list(word) for word in all_word_tuples]
    all_words = [a for (a, b) in all_word_tuples]
    return all_word_lists
def extractNames(li):
    finList = []
    # loop through the list that has the HTML page content
    for a in li:
        # tokenize the HTML text into sentences
        for sent in nltk.sent_tokenize(str(a)):
            smLi = []
            # tokenize each sentence into words, then add a Part-of-Speech (POS) tag
            for index, chunk in enumerate(nltk.pos_tag(nltk.word_tokenize(sent))):
                # if the POS tag is NNP (proper noun)
                if 'NNP' in chunk[1]:
                    # keep words with more than 2 alphanumeric characters
                    if len(' '.join(e for e in chunk[0] if e.isalnum())) > 2:
                        # store the word index, the tagged chunk, and the link
                        smLi.append([index, chunk, a[1]])
            finList.append(smLi)
    nameLi = []
    for f in finList:
        if len(f) > 0:
            strName = ''
            for index, i in enumerate(f):
                # if strName is blank, start it with the current word
                if strName == '':
                    strName = i[1][0]
                # if index+1 is not at the end of the list, continue
                if (index + 1) < len(f):
                    # consecutive word indexes belong to the same name
                    if i[0] + 1 == f[index + 1][0]:
                        strName = strName + ' ' + f[index + 1][1][0]
                    # otherwise flush strName (with the article link) and reset
                    else:
                        if ' ' in strName:
                            nameLi.append([strName, i[2]])
                        strName = ''
    return nameLi
def editText(filename):
    global subject
    subject = get_subj(filename)
    f = open(filename, 'r')
    text = f.read()
    f.close()
    new_file_name = subject + 'B.txt'
    g = open(new_file_name, 'w')
    sents = sent_tokenize(text)
    # replace masculine pronouns (followed by a space, period, or comma)
    # with the subject's name
    for i in range(len(sents)):
        for p in ('he', 'him', 'himself', 'his'):
            for punct in (' ', '.', ','):
                sents[i] = sents[i].replace(' ' + p + punct,
                                            ' ' + subject + punct)
        sents[i] = sents[i].replace('He ', subject + ' ')
        # the original used replace('His ', subject), dropping the trailing space
        sents[i] = sents[i].replace('His ', subject + ' ')
        sents[i] = simplify_sent(sents[i])
        g.write(sents[i] + ' ')
    g.close()
    return new_file_name
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation
    # is caught as its own token
    tokens = [word
              for sent in nltk.sent_tokenize(
                  ' '.join(re.findall(r'[\w]+', text, re.UNICODE)),
                  language='spanish')
              for word in nltk.word_tokenize(sent, language='spanish')]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.match('[a-zA-Z]', token):
            filtered_tokens.append(token.lower())
    return filtered_tokens

# strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
# def strip_proppers_POS(text):
#     tagged = nltk.tag.pos_tag(text.split())  # use NLTK's part of speech tagger
#     non_propernouns = [word for word, pos in tagged if pos != 'NNP' and pos != 'NNPS']
#     return non_propernouns

# def wordFrequency(tokens, stopwords):
#     dictFreq = {}
#     for token in tokens:
#         if not token in stopwords:
#             dictFreq[token] = tokens.count(token)
#     ans = sorted(dictFreq, key=dictFreq.__getitem__, reverse=True)
#     return ans
def beginnerText(txt):
    sentences = sent_tokenize(txt)
    for sentence in sentences:
        sent = word_tokenize(sentence)
        sent = [word for word in sent if word.isalpha() and word not in ['amp']]
        speak(' '.join(sent))
        time.sleep(0.35)
def getThreads(subreddit, num_comments=10, max_threads=5000, max_comments=100,
               min_comments=10, verbose=False):
    comment_counter = 0
    already_done = []  # keep track of threads you've already seen (you can get them twice)
    subred = r.get_subreddit(subreddit)  # get a subreddit
    comments = []
    questionComment = []
    for sub in subred.get_hot(limit=max_threads):
        if sub.id not in already_done and comment_counter < num_comments:
            already_done.append(sub.id)
            sub.replace_more_comments(limit=None, threshold=1)
            flat_comments = praw.helpers.flatten_tree(sub.comments)
            for comment in flat_comments:
                diff_comment = True
                for sentence in sent_tokenize(comment.body.encode('utf-8')):
                    if '[deleted]' in sentence:
                        break
                    comments.append(sentence)
                    # collect question sentences, pairing them with the
                    # sentence that preceded them
                    if '?' in sentence and not diff_comment:
                        s = {}
                        s['Request'] = comments[-2] + ' ' + sentence
                        s['id'] = comment.id
                        s['score'] = comment.score
                        questionComment.append(s)
                        comment_counter += 1
                        print 'Added question. Comment counter', comment_counter
                    diff_comment = False
            if comment_counter > num_comments:
                return [comments, questionComment]
    return [comments, questionComment]
def genTokens(self):
    text = self.ensureText()
    # set off w/space any entities that are butted up to preceding data
    text = re.sub(r'(?<!\s)(?P<entityref>%s)' % Tokenizer.entityRE, ' \g<entityref>', text)
    # set off w/space any entities that are butted up to following data
    text = re.sub(r'(?P<entityref>%s)(?!\s)' % Tokenizer.entityRE, '\g<entityref> ', text)
    for (entities, segment) in self.genSegments(text):
        segment = segment.strip()
        if entities:
            for entity in re.split(r'\s+', segment):
                yield entity
        else:
            sentences = nltk.sent_tokenize(segment)
            # correct for any embedded newlines (irrelevant?)
            sentences = [re.sub(r'[\n\t]+', ' ', sent).strip() for sent in sentences]
            # inexplicably, NLTK thinks big,red should be a single token
            sentences = [re.sub(r'\b,\b', ', ', sent) for sent in sentences]
            for sentence in sentences:
                for tok in nltk.word_tokenize(sentence):
                    yield tok
def num_words(filename):
    lengths = []
    with open(filename) as f:
        lines = f.read()
    for l in nltk.sent_tokenize(lines):
        lengths.append(len(nltk.word_tokenize(l)))
    return (sum(lengths) - 0.0) / len(lengths)
def process_query(query_str):
    """
    Tokenize and stem the query words and compute the tf weight of each
    term in the query.

    Arguments:
        query_str     string of query words

    Returns:
        query_weight  a dictionary mapping each stemmed term to its
                      log-scaled frequency in the query
    """
    query_list = []
    sentences = nltk.sent_tokenize(query_str)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for word in words:
            normalized = text_processing.normalize(word)
            if normalized is not None:
                query_list.append(normalized)
    # count the frequency of each term
    query_count = Counter(query_list)
    # set the tf value for each term
    query_weight = {}
    for query_term, term_count in query_count.items():
        query_weight[query_term] = 1 + math.log10(term_count)
    return query_weight
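# NOTE: `text_processing.normalize` is an external helper. A plausible sketch,
# consistent with how process_query() uses it (an assumption: lowercase, drop
# non-alphabetic tokens and stopwords, then Porter-stem):
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()
_stopwords = set(stopwords.words('english'))

def normalize(word):
    """Hypothetical normalizer: return a stemmed lowercase term, or None."""
    word = word.lower()
    if not word.isalpha() or word in _stopwords:
        return None
    return _stemmer.stem(word)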
def get_xmen_text(soup):
    # nltk.clean_html was removed in NLTK 3.x; use BeautifulSoup's get_text() there
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    # the original passed the whole list to ne_chunk; it must be called per sentence
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]

    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees(lambda s: s.node == node)]

    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people for subsublist in sublist
              for subsubsublist in subsublist for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person != 'Magneto')
    fd.plot(50)
def capitalize(self, text):
    """Capitalizes the whole text, sentence by sentence."""
    return " ".join(
        self.capitalize_sentence(sent) for sent in sent_tokenize(text)
    )
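# NOTE: `capitalize_sentence` is not shown here. A minimal sketch of the
# helper this method assumes (upper-case the first character, keep the rest;
# str.capitalize() is avoided because it lowercases everything else):
def capitalize_sentence(self, sentence):
    return sentence[0].upper() + sentence[1:] if sentence else sentence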
def poem_tokenizer(poem):
    lines = open(poem).readlines()
    tokens = []
    for l in lines:
        # clean annoying punctuation, but keep apostrophes
        m = re.sub(r"[^0-9a-zA-Z\s']", '', l)
        tokens.append([word for sent in nltk.sent_tokenize(m)
                       for word in nltk.word_tokenize(sent)])
    return tokens
def test3():
    import nltk
    from nltk.corpus import conll2000
    from urllib import urlopen

    fname = 'data/dummy/webpages/Abby_Watkins/raw/002/index.html'
    doc = urlopen(fname).read()
    raw = nltk.clean_html(doc)
    decoded = raw.decode('utf-8', errors='ignore')
    raw = decoded.encode('utf-8')
    print raw
    sentences = nltk.sent_tokenize(raw)
    sentences = [s.replace('\n', '').replace('\r', '').strip() for s in sentences]
    sentences = [nltk.word_tokenize(s) for s in sentences]
    sentences = [nltk.pos_tag(s) for s in sentences]
    #porter = nltk.PorterStemmer()
    #sentences = [[(porter.stem(w[0]), w[1]) for w in s] for s in sentences]
    #sentences = [[w[0] for w in s] for s in sentences]
    #sentences = [['%s_%s' % w for w in s] for s in sentences]
    lexicon = []
    #for s in sentences:
    #    print nltk.ne_chunk(s, binary=True)
    #    lexicon.extend(s)
    fdist = nltk.FreqDist(lexicon)
def preprocess_email(filename, puncts, stemmer):
    fin = open(filename, 'rb')
    text = fin.read()
    fin.close()
    # lowercase
    text = text.lower()
    # strip all HTML
    text = re.sub("<[^<>]+>", "", text)
    # handle numbers
    text = re.sub("[0-9]+", "number", text)
    # handle URLs
    text = re.sub("(http|https)://[^\s]*", "httpaddr", text)
    # handle email addresses
    text = re.sub("[^\s]+@[^\s]+", "emailaddr", text)
    # handle $ sign
    text = re.sub("[$]+", "dollar", text)
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # get rid of punctuation
            if word in puncts:
                continue
            # remove non-alpha chars
            word = re.sub("[^a-zA-Z0-9]", "", word)
            # stem the word
            word = stemmer.stem(word)
            # skip word if too short (currently a NOOP)
            if len(word) < 1:
                continue
            yield word
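# Usage sketch for the generator above. The filename is hypothetical; `puncts`
# can be any collection of punctuation strings and `stemmer` any NLTK stemmer.
# (Note the 'rb' mode above implies Python 2 string handling; on Python 3,
# open the file in text mode so the regexes apply to str, not bytes.)
import string
from nltk.stem.porter import PorterStemmer

words = list(preprocess_email('spam_sample.txt', set(string.punctuation), PorterStemmer()))
print(words[:20])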
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet  # wordnet for using synsets

text = input()
sent = sent_tokenize(text)
tokens = []
for i in sent:
    token = word_tokenize(i)
    print(token)
    tokens.extend(token)  # the original filtered only the last sentence's tokens

stop = set(stopwords.words("english"))
filteredSentence = []
for w in tokens:
    if w not in stop:
        filteredSentence.append(w)
print("filtered sentence: ", filteredSentence)

# create a database of all the symptoms and compare it with the tokens and stemmed tokens
# if the comparison score is more than 80%, consider them; else don't
# ask the user to verify the symptoms
# if the user adds something, filter the sentence and compare it with the database
# if the match is >80%, add it; otherwise google it and add the symptoms
def DNN_lm():
    """
    http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial
    -part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/
    """
    vocabulary_size = 26
    unknown_token = "UNKNOWN_TOKEN"
    sentence_start_token = "SENTENCE_START"
    sentence_end_token = "SENTENCE_END"
    path = os.path.dirname(os.path.realpath(__file__))
    with open(path + '/data/reddit-comments-2015-08.csv', 'r') as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)  # skip the header row
        # split full comments into sentences
        sentences = itertools.chain(
            *[nltk.sent_tokenize(x[0].lower()) for x in reader])
        # append SENTENCE_START and SENTENCE_END
        sentences = [
            "%s %s %s" % (sentence_start_token, x, sentence_end_token)
            for x in sentences
        ]
    # tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    # replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [
            w if w in word_to_index else unknown_token for w in sent
        ]
    print("\nExample sentence: '%s'" % sentences[0])
    print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])
    # create the training data: y is X shifted by one word
    X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                          for sent in tokenized_sentences])
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                          for sent in tokenized_sentences])

    def gen_input(x):
        # one-hot encode a sentence of word indices
        sentence = x
        d = np.zeros((len(sentence), vocabulary_size))
        d[np.arange(len(sentence)), sentence] = 1
        return d

    l1 = Layer(100, Tansig())
    l2 = Layer(vocabulary_size, Softmax())
    net = Network((l1, l2))
    net.connect(1, 2, D=0)
    net.connect(1, 1, D=1)  # recurrent connection with delay 1
    net.load(vocabulary_size, 1, D=0)
    sentence10 = gen_input(X_train[10])
    o = net.forward(sentence10)
    print(o)
    training_input = [gen_input(x) for x in X_train[:2]]
    training_output = [gen_input(x) for x in y_train[:2]]
    net.train(training_input, training_output)
    o = net.forward(sentence10)
    print(o)
def convert(files_raw_data):
    files_raw_data = files_raw_data.replace("\n", " ")
    files_raw_data = files_raw_data.lower()
    sentences = sent_tokenize(files_raw_data)
    return sentences
def process_corpus(corpus_name):
    print("Corpus to examine: " + corpus_name)
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)

    sentences = []
    words = []
    all_pos = []
    pos_results = open(corpus_name + "-pos.txt", 'w+')
    for entry in corpus_contents:
        sentences.append(nltk.sent_tokenize(entry))
    for story in sentences:
        for sent in story:
            word_sent = nltk.word_tokenize(sent)
            words.extend(word_sent)
            cur_sentence = nltk.pos_tag(word_sent)
            all_pos.extend(cur_sentence)
            for pair in cur_sentence:
                pos_results.write(pair[0] + "/" + pair[1])
                pos_results.write('\n')
        pos_results.write('\n')

    print("Number of words: " + str(len(words)))
    words = [word.casefold() for word in words]
    print("The vocabulary size is: " + str(len(set(words))))

    most_common = nltk.FreqDist(pos for (word, pos) in all_pos)
    freq_list = most_common.most_common()
    print("The most common part of speech is " + str(freq_list[0][0]) +
          " which occurs " + str(freq_list[0][1]) + " times.")
    print("")

    word_dist = nltk.FreqDist(word for word in words)
    word_freq = word_dist.most_common()
    freq_results = open(corpus_name + "-word-freq.txt", 'w+')
    for pair in word_freq:
        freq_results.write(str(pair))
        freq_results.write('\n')

    chart_freq = nltk.ConditionalFreqDist(
        (word.casefold(), tag) for (word, tag) in all_pos)
    con_freq = nltk.ConditionalFreqDist(
        (tag, word.casefold()) for (word, tag) in all_pos)
    # redirect stdout so tabulate() output lands in a file
    copy = sys.stdout
    sys.stdout = open(corpus_name + "-pos-word-freq.txt", 'w+')
    chart_freq.tabulate()
    sys.stdout = copy

    common_words_by_pos = [
        con_freq['NN'].most_common()[0],
        con_freq['VBD'].most_common()[0],
        con_freq['JJ'].most_common()[0],
        con_freq['RB'].most_common()[0]
    ]
    text_words = nltk.Text(words)
    print("The most common Noun is " + common_words_by_pos[0][0] + ". Similar words include:")
    text_words.similar(common_words_by_pos[0][0])
    print("The most common Past Tense Verb is " + common_words_by_pos[1][0] + ". Similar words include:")
    text_words.similar(common_words_by_pos[1][0])
    print("The most common Adjective is " + common_words_by_pos[2][0] + ". Similar words include:")
    text_words.similar(common_words_by_pos[2][0])
    print("The most common Adverb is " + common_words_by_pos[3][0] + ". Similar words include:")
    text_words.similar(common_words_by_pos[3][0])
    print("")
    print("The found collocations are:")
    text_words.collocations()
def train_splitter(data_generator):
    while True:
        # get the batch triplet
        query, pos_docs, neg_docs = next(data_generator)
        # tokenization
        query = tokenizer.texts_to_sequences(query)
        if queries_sw is not None:
            # the original reassigned the loop variable here, which filtered nothing
            query = [[token for token in q if token not in queries_sw] for q in query]

        new_pos_docs = []
        new_neg_docs = []
        new_pos_extra_features = []
        new_neg_extra_features = []

        # sentence splitting
        if mode == 4:
            for b in range(len(pos_docs)):
                new_pos_docs.append([])
                new_neg_docs.append([])
                _temp_pos_docs = nltk.sent_tokenize(pos_docs[b]["text"])
                _temp_pos_docs = tokenizer.texts_to_sequences(_temp_pos_docs)
                if docs_sw is not None:
                    _temp_pos_docs = [[t for t in sent if t not in docs_sw]
                                      for sent in _temp_pos_docs]
                # skip batch with empty pos_docs
                if all([len(sentence) == 0 for sentence in _temp_pos_docs]):
                    break  # try a new resampling; NOTE: this is an easy fix, please redo it
                _temp_neg_docs = nltk.sent_tokenize(neg_docs[b]["text"])
                _temp_neg_docs = tokenizer.texts_to_sequences(_temp_neg_docs)
                if docs_sw is not None:
                    _temp_neg_docs = [[t for t in sent if t not in docs_sw]
                                      for sent in _temp_neg_docs]

                # compute extra features (disabled in the original)
                # extra_features_pos_doc = compute_extra_features(query[b], _temp_pos_docs, idf_from_id_token)
                # extra_features_neg_doc = compute_extra_features(query[b], _temp_neg_docs, idf_from_id_token)
                # extra_features_pos_doc.append(pos_docs[b]["score"])  # add the bm25 score
                # extra_features_neg_doc.append(neg_docs[b]["score"])
                # new_pos_extra_features.append(extra_features_pos_doc)
                # new_neg_extra_features.append(extra_features_neg_doc)

                # split by exact matching: keep the sentences that contain each query term
                for t_q in query[b]:
                    # entry for the query-term
                    new_pos_docs[-1].append([])
                    new_neg_docs[-1].append([])
                    for pos_sent in _temp_pos_docs:
                        if t_q in pos_sent:
                            new_pos_docs[-1][-1].append(pos_sent)
                    for neg_sent in _temp_neg_docs:
                        if t_q in neg_sent:
                            new_neg_docs[-1][-1].append(neg_sent)
        else:
            raise NotImplementedError("Missing implementation for mode " + str(mode))

        if len(new_pos_docs) == len(pos_docs):  # if the batch is complete
            yield query, new_pos_docs, new_pos_extra_features, new_neg_docs, new_neg_extra_features
raw_document_text = 'Federer is married to former Women\'s Tennis Association ' \
    'player Mirka Vavrinec. He met her while both were competing for Switzerland in ' \
    'the 2000 Sydney Olympics. Couple of years later Vavrinec retired from the tour because of a ' \
    'foot injury.[35] They were married at Wenkenhof Villa in Riehen near Basel on ' \
    '11 April 2009, surrounded by a small group of close friends and family.[36] ' \
    'In July 2009, Mirka gave birth to identical twin girls, Myla Rose and Charlene ' \
    'Riva.[37] The Federers had another set of twins in 2014, this time boys whom ' \
    'they named Leo and Lennart,[38] called Lenny.[39]'

doc = Document.Document('RFWiki', '2016/11/22', raw_document_text)
json_str = json.dumps(doc, default=lambda o: o.__dict__)
tokens = nltk.sent_tokenize(doc.raw_text)
sentences = []
classes = ['money', 'percent', 'date', 'time']
index = 0
for sent in tokens:
    sent_object = Document.Sentence(sent, index, doc.document_id)
    index = index + 1
    tags1 = NERTagger1.tag(sent_object.raw_sentence.split())
    for i, t in enumerate(tags1):
        sent_object.words[GetASCIIString(t[0]) + '_' + str(i)].ne_tag = GetASCIIString(t[1])
    tags2 = NERTagger2.tag(sent_object.raw_sentence.split())
    for i, t in enumerate(tags2):
def allowed(word):
    return len(word) > 1 and set(word) <= accepted_chars

# load corpus into RAM
print('Loading corpus')
#txt = open(CorpusFName, 'rU').read()
txt = codecs.open(CorpusFName, encoding='utf-8').read()

# select word tokenizer
#tokenizer = TreebankWordTokenizerNoContract()

# split text into sentences
print('Splitting to sentences')
sents = nltk.sent_tokenize(txt, language=lang)

# init count
Counts = defaultdict(int)
lines = 0
nlines = len(sents)
for s in sents:
    tokens = nltk.word_tokenize(s)
    # drop tokens that are punctuation symbols only
    keep = []
    for t in tokens:
        if not set(t) <= punctiation:  # sic: `punctiation` is defined earlier in the script
            keep.append(t.lower())
    if verbose:
import string
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('popular', quiet=True)  # for downloading packages
# uncomment the following only the first time
#nltk.download('punkt')    # first-time use only
#nltk.download('wordnet')  # first-time use only

# Reading in the corpus
with open('chatbot.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read().lower()

# Tokenisation
sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

# Preprocessing
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(
def make_dictionaries(file_dir, m=2):
    from re import sub, findall
    from nltk import sent_tokenize
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    # read in input file
    try:
        with open(file_dir, 'r') as f:
            text = f.read()
    except:
        print("Unable to open input .txt file. Rerun with correct path to input .txt.")
        return

    # ---------------- data cleaning -----------------
    # split into sentences (one per item in series)
    df = pd.Series(sent_tokenize(text))
    del text
    # lowercase everything
    df = df.str.lower()
    # remove stray apostrophes, quotes, and parentheses
    df = df.apply(lambda x: sub(r"((?<=\s)'|'(?!\w))", '', x))
    df = df.apply(lambda x: sub('\"', '', x))
    df = df.apply(lambda x: sub(r"[\(\)\[\]]", '', x))
    df = df.apply(lambda x: sub(r"(?<=[a-zA-Z]),", ' commaplaceholder ', x))

    def sub_endline(x):
        # replace the trailing punctuation with a sentence-end token
        endline = findall(r"\W*$", x)[0]
        # the original tested '\!' and '\?' (literal backslash + mark), which never match
        if '!' in endline:
            return sub(r"\W*$", ' eendline', x)
        elif '?' in endline:
            return sub(r"\W*$", ' qendline', x)
        else:
            return sub(r"\W*$", ' pendline', x)

    df = df.apply(sub_endline)

    # ---------- create data structures ------------------
    data_structs = [None] * (m + 1)
    # probabilities for the starting word
    data_structs[0] = df.str.split(n=1).str[0].value_counts()
    data_structs[0] = data_structs[0] / data_structs[0].sum()
    # distributions for subsequent words
    for i in range(1, m + 1):
        vect = CountVectorizer(token_pattern=r"(?u)\b[^\s]+\b",
                               analyzer='word',
                               ngram_range=(i + 1, i + 1))
        vect.fit(df)
        # get occurrences out of vect
        pairs = pd.Series(np.asarray(vect.transform(df).sum(axis=0)).reshape(-1),
                          index=vect.get_feature_names(),
                          name='freq')
        pairs.index.name = 'tokens'
        pairs = pairs.reset_index()
        # expand to 2 columns (prompt, response)
        pairs = pd.concat([
            pairs['tokens'].str.rsplit(n=1, expand=True).rename(
                columns={0: 'prompt', 1: 'response'}),
            pairs['freq']
        ], axis=1)
        # undo endline/comma substitutions
        pairs['prompt'] = pairs['prompt'].apply(
            lambda x: sub(r"\s*commaplaceholder", ',', x))
        pairs['response'] = pairs['response'].apply(
            lambda x: sub(r"\s*commaplaceholder", ',', x)).replace(
                'pendline', '.').replace('qendline', '?').replace('eendline', '!')
        # store results in a dictionary keyed by prompt
        doubles = {}
        for token, group in pairs.groupby('prompt'):
            doubles[token] = {
                'prob': (group['freq'] / group['freq'].sum()).values,
                'token': group['response'].values
            }
        data_structs[i] = doubles

    # export data
    import pickle
    with open('data.pkl', 'wb') as f:
        pickle.dump(data_structs, f)
# Read in the corpus
# with open('chatbot_de.txt', 'r', encoding='utf8', errors='ignore') as bockwurst:
#     raw = bockwurst.read().lower()
with open("new.txt", 'r', encoding='utf8', errors='ignore') as tweet1file:
    raw = tweet1file.read().lower()
# note: this second read overwrites `raw` from the first file
with open(os.path.join("json", "trump_data_file.txt"), 'r', encoding='utf8', errors='ignore') as tweet2file:
    raw = tweet2file.read().lower()

# Tokenization
# sent_tokens converts the text into a list of sentences
sent_tokens = nltk.sent_tokenize(raw)
# word_tokens converts the text into a list of words (not used further)
word_tokens = nltk.word_tokenize(raw)

# Preprocessing
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
"zipcode", "city", "neighbourhood_cleansed", "market", "smart_location", "room_type", # =Private room "bedrooms", # =1 "beds", # =1 "price" ] df = pd.read_csv(file_object, usecols=lambda x: x in VIBE_FIELDS) sentences = [] for document in df['neighborhood_overview'][1:100]: try: sentences.extend(nltk.sent_tokenize(document)) except: pass file = open('sentence2label.csv', "w") sentence2label = [] for sentence in sentences: print sentence try: labels = raw_input("Enter labels. ex. green,diverse :") except ValueError: labels = None file.write(sentence + "," + labels + "\n") #sentence2label = [(sentence,labels)] #print sentence2labels #store['airbnb_vibes_raw'] = df
import nltk
from nltk.corpus import reuters

sentences = nltk.sent_tokenize(reuters.raw('test/21131')[:1000])
print("#sentences={0}\n\n".format(len(sentences)))
for sent in sentences:
    print(sent, '\n')
def ie_preprocess(document):
    # drop stopwords before sentence segmentation
    document = ' '.join([i for i in document.split() if i not in stop])
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences  # the original built the tagged sentences but never returned them
nif = rdflib.Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
data = sys.argv[1]
for arg in sys.argv:
    lang = arg  # the last command-line argument selects the language
count = 0
for filename in os.listdir('Files/Input' + lang + '/'):
    if count < int(data):
        graph2 = rdflib.Graph()
        graph2.parse('Files/Input' + lang + '/' + filename, format='nt')
        g = Graph()
        name = filename.split(".")[0]
        for s, p, o in graph2:
            if type(o) == rdflib.term.Literal and nif.isString in p:
                sentences = nltk.sent_tokenize(o)
                for i in sentences:
                    try:
                        # note: index() finds the first occurrence, so repeated
                        # sentences all map to the same offsets
                        BI = o.index(i)
                        EI = o.index(i) + len(i)
                        uri = rdflib.term.URIRef(
                            "http://dbpedia.org/resource/" + name +
                            "?dbpv=2016-10&nif=sentence_" + str(BI) + "_" + str(EI))
                        g.add([uri, RDF.type, nif.Sentence])
                        g.add([uri, nif.beginIndex, rdflib.term.Literal(str(BI))])
                        g.add([uri, nif.endIndex, rdflib.term.Literal(str(EI))])
                        g.add([uri, nif.anchorOf, rdflib.term.Literal(i)])
                        g.add([uri, nif.referenceContext, rdflib.term.URIRef(
                            "http://dbpedia.org/resource/" + name + "?dbpv=2016-10&nif=context")])
                    except:
                        pass
        g.bind("nif", nif)
        #print(g.serialize(format="turtle"))
        g.serialize(destination='Files/Sentence/' + filename, format="turtle")
        count = count + 1
history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much."""

dataSet = nltk.sent_tokenize(paragraph)
for i in range(len(dataSet)):
    dataSet[i] = dataSet[i].lower()
    dataSet[i] = re.sub(r'\W', ' ', dataSet[i])
    dataSet[i] = re.sub(r'\s+', ' ', dataSet[i])

word2count = {}
for data in dataSet:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1
# CHUNKGRAMS
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

text = """
Apple's iPhone revenue for the holiday quarter fell 15% from the same period
a year ago, the company said after the markets closed Tuesday. CEO Tim Cook
blamed the sales decline on a mix of factors, including a slowdown in China,
foreign exchange rates, a popular battery replacement program and reduced
smartphone subsidies from carriers.
"""

keywords = set()
sentences = nltk.sent_tokenize(text)
try:
    for i in sentences:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        # print(tagged)
        # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        # use pipe "|" to combine more patterns
        # use plus "+" to match one or more of a tag
        chunkGram = r"""NE: {<NNP>+<NNP>?|<NNP|NN>+<CC.*|NN.*>+<NNP>}
                            {<NNP>}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        NE = [" ".join(w for w, t in ele) for ele in chunked
              if isinstance(ele, nltk.Tree)]
        for i in NE:
            keywords.add(i)
except Exception as e:
def get_data():
    url = "http://letsrant.azurewebsites.net/api/values"
    reader = codecs.getreader("utf-8")
    obj = json.load(reader(urlopen(url)))

    b = []
    place_nnp = 'None'
    issue = 'none'

    # toy sentiment classifier: word-presence features over tiny seed vocabularies
    def word_feats(words_):
        return dict([(wor, True) for wor in words_])

    positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific',
                      'good', 'nice', 'great', ':)']
    negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
    neutral_vocab = ['movie', 'the', 'sound', 'was', 'is', 'actors',
                     'did', 'know', 'words', 'not']
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
    train_set = negative_features + positive_features + neutral_features
    # train once, outside the loop (the original retrained per tweet)
    classifier = NaiveBayesClassifier.train(train_set)

    for i in range(0, len(obj)):
        data = obj[i]['Tweet']

        # tokenize and drop stopwords
        tokenizer = RegexpTokenizer(r'\w+')
        stopWords = set(stopwords.words('english'))
        words = tokenizer.tokenize(data)
        wordsFiltered = [w for w in words if w not in stopWords]
        tagged = nltk.pos_tag(wordsFiltered)

        # sentiment analysis: classify each word and count the votes
        neg = 0
        pos = 0
        sentence = obj[i]['Tweet'].lower()
        words_ = sentence.split(' ')
        for wor in words_:
            classResult = classifier.classify(word_feats(wor))
            if classResult == 'neg':
                neg = neg + 1
            if classResult == 'pos':
                pos = pos + 1
        pos = str(float(pos) / len(words_))
        neg = str(float(neg) / len(words_))

        # coordinates assigning
        coordinates = obj[i]['coordinates']

        # noun extraction
        sentences = nltk.sent_tokenize(data)
        d = []
        for sent in sentences:
            d = d + nltk.pos_tag(nltk.word_tokenize(sent))
        for word in d:
            if 'NNS' in word[1]:
                issue = word
            if 'NNP' in word[1]:
                place_nnp = word

        tweetid = obj[i]['TweetID']
        place = obj[i]['PlaceName']
        # fall back to the extracted NNP when no place name is given
        # (the original tested `place not in obj[i]`, which is almost always true)
        if not place:
            placename = place_nnp
        else:
            placename = place

        # JSON return
        b.append({'tweetid': tweetid, 'place': placename, 'issue': issue,
                  'sentpos': pos, 'sentneg': neg, 'coordinates': coordinates})
    return b
def sentence_tokenize(text):
    """Tokenize text into sentences after simple normalization."""
    return sent_tokenize(prenormalize(text))
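# NOTE: `prenormalize` is defined elsewhere. A plausible sketch (an assumption:
# normalize curly quotes and collapse whitespace so Punkt segments cleanly):
import re

def prenormalize(text):
    text = text.replace('“', '"').replace('”', '"').replace('’', "'")
    return re.sub(r'\s+', ' ', text).strip()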
def prepare_text(input):
    sentences = nltk.sent_tokenize(input)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences
from timex import tag
from timex import ground
import datetime
from datetime import date
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

# read and store diary content as nltk text
summary = open('d47sum.txt', 'r')
rawSum = summary.read()
rawSum = rawSum.encode('ascii', errors='ignore')
sentences = nltk.sent_tokenize(rawSum)

# tokenize and tag content
tokens = [nltk.word_tokenize(sent) for sent in sentences]
tagged = [nltk.pos_tag(sent) for sent in tokens]
chunked = nltk.ne_chunk_sents(tagged, binary=True)

# extract named entities from content
# returns an nltk.tree.Tree object which needs to be traversed:
# the Tree is a list, chunks are subtrees, and non-chunked words are plain strings
def extract_entity_names(t):
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE' or t.label() == 'NP':
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk import WordNetLemmatizer

host = '127.0.0.1'
port = 1232
warnings.filterwarnings('ignore')
nltk.download('popular', quiet=True)

# open the data library, i.e. the raw data file
with open('data/bot.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raws = fin.read().lower()

# tokenize the raw data
tokenSent = nltk.sent_tokenize(raws)
tokenWord = nltk.word_tokenize(raws)

# preprocess the text / raw data
lmr = WordNetLemmatizer()

def lmTokens(tokens):
    return [lmr.lemmatize(token) for token in tokens]

removePunctDictionary = dict(
    (ord(punct), None) for punct in string.punctuation)

def lmNormalize(text):
line = line.replace("Mr. Weasley", "Arthur") line = line.replace("Mrs.", "Mistress") line = line.replace("Mr.", "Mister") text += line """neuralcoref doesn't do posession correctly. insert ['s] to concatenate later to character's name.""" i = 0 word_list = text.split() for word in word_list: if word in posessive_pronouns: word_list.insert(i + 1, "['s]") i += 1 text = " ".join(word_list) text = removePunctFromQuotes(text) tokens = nltk.sent_tokenize(text) sentences = [] for t in tokens: sentences.append(t) """anaphora resolution at single sentence level""" for sentence in sentences: doc = nlp(sentence) doc._.has_coref line = returnPunctFromQuotes(doc._.coref_resolved) f1.write(f"{line} \n") f1.close() print("single sentence completed") final_file = f"harrypotter{hp_num}_final.txt"
])  # use modified lexicon
nrc.drop_duplicates('word', inplace=True)
nrc['value'] = nrc['positive'] - nrc['negative']

# Load data from Mongo
mongo = Mongo('facebook', 'comments')
docs = [doc for doc in mongo.collection.find()]
mongo.close()
mongo_ids = [doc.pop('_id', None) for doc in docs]  # exclude mongo generated ids
docs = d_to_df(docs)
docs['created_time'] = pd.to_datetime(docs['created_time'],
                                      format="%Y-%m-%dT%H:%M:%S+0000")
docs.set_index('created_time', inplace=True)
docs.drop_duplicates(['message', 'user.name', 'post_id'], inplace=True)
docs['n_sents'] = docs.message.apply(lambda x: len(sent_tokenize(x)))
docs['n_words'] = docs.message.apply(lambda x: len(tokenize.word_tokenize(x)))
docs = docs[docs['n_sents'] != 0].copy()

mongo = Mongo('facebook', 'posts')
posts = [doc for doc in mongo.collection.find()]
mongo.close()
mongo_ids = [post.pop('_id', None) for post in posts]  # exclude mongo generated ids
posts = d_to_df(posts)
posts['created_time'] = pd.to_datetime(posts['created_time'],
                                       format="%Y-%m-%dT%H:%M:%S+0000")
posts.set_index('created_time', inplace=True)

# Calculating post title and message sentiment
posts['article_title'].fillna('', inplace=True)
def sentence_tokenizer(text):
    token_list = nltk.sent_tokenize(text, "english")
    return token_list
np.random.seed(42)

BATCH_SIZE = 128
NUM_EPOCHS = 20

lines = []
fin = open("../data/alice_in_wonderland.txt", "rb")
for line in fin:
    line = line.strip().decode("ascii", "ignore").encode("utf-8")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(num_words=5000)  # use top 5000 words only
tokenizer.fit_on_texts(sents)  # fit_on_texts returns None; the original assigned it to `tokens`
vocab_size = len(tokenizer.word_index) + 1

w_lefts, w_centers, w_rights = [], [], []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts.extend([x[0] for x in triples])
    w_centers.extend([x[1] for x in triples])
    w_rights.extend([x[2] for x in triples])

ohe = OneHotEncoder(n_values=vocab_size)
Xleft = ohe.fit_transform(np.array(w_lefts).reshape(-1, 1)).todense()
strong not only as a military power but also as an economic power. Both must go hand-in-hand. My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material. I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. I see four milestones in my career"""

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review
              if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
import nltk

# Source text: a news story about Obama leaving office
news_content = '''At noon on Friday, 55-year old Barack Obama became a federal retiree. His pension payment will be $207,800 for the upcoming year, about half of his presidential salary. Obama and every other former president also get seven months of "transition" services to help adjust to post-presidential life. The ex-Commander in Chief also gets lifetime Secret Service protection as well as allowances for things such as travel, office expenses, communications and health care coverage. All those extra expenses can really add up. In 2015 they ranged from a bit over $200,000 for Jimmy Carter to $800,000 for George W. Bush, according to a government report. Carter doesn't get health insurance because you have to work for the federal government for five years to qualify.
'''

# Tokenize, POS-tag, NER-tag, and score each sentence
results = []
for sent_no, sentence in enumerate(nltk.sent_tokenize(news_content)):
    no_of_tokens = len(nltk.word_tokenize(sentence))
    # POS tagging
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # count the number of nouns in the sentence
    no_of_nouns = len([word for word, pos in tagged if pos in ["NN", "NNP"]])
    # use NER to tag the named entities
    ners = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False)
    no_of_ners = len([chunk for chunk in ners if hasattr(chunk, 'label')])
    score = (no_of_ners + no_of_nouns) / float(no_of_tokens)
    results.append((sent_no, no_of_tokens, no_of_ners, no_of_nouns, score, sentence))

# print the sentences in order of importance (highest score first)
for sent in sorted(results, key=lambda x: x[4], reverse=True):
    print(sent[5])
from nltk import sent_tokenize
from urllib import request
import random

url = "https://www.gutenberg.org/files/61236/61236-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
sentence = sent_tokenize(raw)
# print a randomly chosen sentence from the book
print(random.choice(sentence))
data = [x.strip() for x in data]

jar = 'stanford-ner.jar'
model = 'english.all.3class.distsim.crf.ser.gz'
st = StanfordNERTagger(model, jar, encoding='utf8')
#tokenized_sents = [[nltk.word_tokenize(str(sent)) for sent in nltk.sent_tokenize(str(line))] for line in lines]
#classified_text = st.tag_sents(tokenized_sents)

tokenized_sents = []
persons = []
entities = []
for line in data:
    sentences = nltk.sent_tokenize(line)  # tokenize sentences
    for sentence in sentences:
        # tokenize each sentence separately (the original tokenized
        # str(sentences), the string form of the whole list)
        tokenized_sents.append(nltk.word_tokenize(sentence))
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if pos == 'NNS':
                if word not in entities:
                    entities.append(word)

classified_text = st.tag_sents(tokenized_sents)
for item in classified_text:
    for x, y in item:
        if y == 'PERSON':
            if str(x) not in persons:
                persons.append(str(x))