def getDomainUnigram(self, directory=None):
    collocations = set()   # collocation items
    ewordlists = list()    # list of lists of words

    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption == True:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocations for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:
        # use the mapped essays to calculate the candidate bigrams
        # (mapessay must be called first)
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption == True:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocations for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)

    # get the collection of all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)

    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))

    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))

    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)

    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
def do_it(self, sources):
    for source in sources:
        words = nltk.wordpunct_tokenize(source.headline)
        words.extend(nltk.wordpunct_tokenize(source.summary))
        lowerwords = [x.lower() for x in words if len(x) > 1]
        self.ct += 1
        print self.ct, "TITLE", source.headline
        self.corpus.append(lowerwords)
        self.titles.append(source.headline)
        self.links.append(source.url)

    [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords, doc, self.corpus)]
     for doc in self.corpus]

    self.ct = -1
    for doc in self.corpus:
        self.ct += 1
        print self.ct, "KEYWORDS", " ".join(self.top_keywords(self.nkeywords, doc, self.corpus))

    for document in self.corpus:
        vec = []
        [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0)
         for word in self.key_word_list]
        self.feature_vectors.append(vec)

    self.n = len(self.corpus)
    mat = numpy.empty((self.n, self.n))
    for i in xrange(0, self.n):
        for j in xrange(0, self.n):
            mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],
                                                          self.feature_vectors[j])

    Z = linkage(mat, 'single')
    dendrogram(Z, color_threshold=self.t)

    clusters = self.extract_clusters(Z, self.t, self.n)
    stories = []
    for key in clusters:
        print "============================================="
        story = Story()
        for id in clusters[key]:
            story.add_source(sources[id])
            print id, self.titles[id], sources[id].url
        stories.append(story)
    return stories
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    gram_list = []

    # WORD GRAMS
    if wgram == 1:      # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:    # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # result
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3:    # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # result
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list

    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:      # uni-chargram
        cgram_list = [line[i:i + 1] for i in range(len(line) - 1)]
    elif cgram == 2:    # bi-chargram
        cgram_list = [line[i:i + 2] for i in range(len(line) - 1)]
    elif cgram == 3:    # tri-chargram
        cgram_list = [line[i:i + 3] for i in range(len(line) - 1)]

    # RESULT: map the category label to an integer class
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print category, "ERROR"
def getArticleKeywords(articles, maxLength=3):
    """ Parse titles of a number of articles and extract keywords that occur in them.
    A keyword is defined as a grouping of several words, with punctuation and stopwords
    (*nltk.corpus.stopwords.words('english')*) removed. Will also add keywords from every
    input Article into the corresponding entry in the articles list.

    Arguments
    ----------
    articles - a list of Articles.
    maxLength - int, the largest number of tokens per keyword.

    Returns
    ----------
    2-tuple with numpy.ndarrays of shape (len(articles),) with
        * strings of keywords
        * ints with the number of occurrences of the given keyword in all titles

    Example
    ----------
    "A general theory of the plasma of an arc" would return keywords:
        ['A', 'general', 'theory', 'of', 'the', 'plasma', 'of', 'an', 'arc',
         'A general', 'general theory', 'theory of', 'of the', 'the plasma',
         'plasma of', 'of an', 'an arc', 'A general theory', 'general theory of',
         'theory of the', 'of the plasma', 'the plasma of', 'plasma of an', 'of an arc']
    Out of these, ['A', 'of', 'the', 'an', 'of the', 'of an'] would be filtered out.
    """
    # Identify keywords.
    tokens = []
    for title in [art.Title for art in articles]:
        tokens.extend(nltk.wordpunct_tokenize(title))

    # Filter out meaningless words and punctuation.
    tokens = filter(lambda s: not s.lower() in nltk.corpus.stopwords.words('english')
                    and not s in string.punctuation, tokens)

    # Find keywords (length 1, 2, or 3) and how often they occur in all the titles.
    keywords, frequencies = findNGrams(tokens, lengths=range(1, maxLength + 1))

    keywords = numpy.array(keywords)
    frequencies = numpy.array(frequencies)
    sortedIndices = frequencies.argsort()[::-1]  # Go in descending order of frequencies.
    frequencies = frequencies[sortedIndices]
    keywords = keywords[sortedIndices]

    # Assign keywords to Articles.
    for i in range(len(articles)):
        artTitleTokens = nltk.wordpunct_tokenize(articles[i].Title)  # The tokens of this article's title.
        # Filter out meaningless words and punctuation.
        artTitleTokens = filter(lambda s: not s.lower() in nltk.corpus.stopwords.words('english')
                                and not s in string.punctuation, artTitleTokens)
        # Use the same algorithm but for this article only.
        artKeywords, artFreq = findNGrams(artTitleTokens, lengths=[1, 2, 3])
        articles[i].Keywords = artKeywords

    return keywords, frequencies
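# The helper findNGrams used above is not shown in this snippet. Below is a minimal
# sketch of what it plausibly does, built on nltk.ngrams and collections.Counter; this
# is an assumption for illustration, not the original implementation.
from collections import Counter

import nltk


def find_ngrams_sketch(tokens, lengths=(1, 2, 3)):
    """Count space-joined n-grams of the given lengths over a token list."""
    counts = Counter()
    for n in lengths:
        for gram in nltk.ngrams(tokens, n):
            counts[' '.join(gram)] += 1
    if not counts:
        return [], []
    keywords, frequencies = zip(*counts.items())
    return list(keywords), list(frequencies)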
def product_features(product):
    name = nltk.FreqDist(normalize_words(nltk.wordpunct_tokenize(product['name'])))
    desc = nltk.FreqDist(normalize_words(nltk.wordpunct_tokenize(product['description'])))

    feats = {}
    for word in name.keys():
        feats['name(%s)' % word] = True
    for word in desc.keys():
        feats['description(%s)' % word] = True
    return feats
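# Hedged usage sketch for product_features: normalize_words is not defined above, so a
# simple lowercasing stand-in is assumed here purely for illustration.
import nltk


def normalize_words(tokens):
    # assumed stand-in for the original helper
    return [t.lower() for t in tokens]


product = {'name': 'Acme Coffee Grinder',
           'description': 'Grinds coffee beans quickly and quietly.'}
feats = product_features(product)
# feats now holds boolean indicator features such as 'name(coffee)' and
# 'description(quietly)', a shape that nltk classifiers accept directly.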
def do_it(self):
    for feed in self.feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
            words = nltk.wordpunct_tokenize(self.clean_html(e['description']))
            words.extend(nltk.wordpunct_tokenize(e['title']))
            lowerwords = [x.lower() for x in words if len(x) > 1]
            self.ct += 1
            print self.ct, "TITLE", e['title']
            self.corpus.append(lowerwords)
            self.titles.append(e['title'])
            self.links.append(e['link'])

    [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords, doc, self.corpus)]
     for doc in self.corpus]

    self.ct = -1
    for doc in self.corpus:
        self.ct += 1
        print self.ct, "KEYWORDS", " ".join(self.top_keywords(self.nkeywords, doc, self.corpus))

    for document in self.corpus:
        vec = []
        [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0)
         for word in self.key_word_list]
        self.feature_vectors.append(vec)

    self.n = len(self.corpus)
    mat = numpy.empty((self.n, self.n))
    for i in xrange(0, self.n):
        for j in xrange(0, self.n):
            mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],
                                                          self.feature_vectors[j])

    Z = linkage(mat, 'single')
    dendrogram(Z, color_threshold=self.t)

    clusters = self.extract_clusters(Z, self.t, self.n)
    for key in clusters:
        print "============================================="
        for id in clusters[key]:
            print id, self.titles[id]
def jaccard(sen_1, sen_2):
    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_1))
    words = [word for word, pos in tagged_sent
             if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]
    sen_set_1 = set(words)

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_2))
    words = [word for word, pos in tagged_sent
             if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]
    sen_set_2 = set(words)

    jaccard_value = jaccard_distance(sen_set_1, sen_set_2)
    return jaccard_value
def main():
    stem = nltk.stem.LancasterStemmer()
    # strip whitespace and lowercase before stemming
    cleanword = lambda w: stem.stem(w.strip().lower())

    bib = btparse.load(sys.argv[1])

    aid = np.random.randint(len(bib))
    while ('abstract' in bib[aid].keys()) == False:
        aid = np.random.randint(len(bib))

    abstract = nltk.wordpunct_tokenize(bib[aid]['abstract'] + " " + bib[aid]['title'])
    q_vec0 = sorted([x[0] for x in nltk.pos_tag(abstract) if x[1] in ("NN")])
    q_vec = []
    q_val = []
    for w in q_vec0:
        w = cleanword(w)
        if len(w) > 2 and w not in ignore_list and re.search('\\\\', w) == None:
            if (w in q_vec) == False:
                q_vec.append(w)
                q_val.append(1)
            else:
                q_val[-1] += 1
    q_val = np.array(q_val) / np.sqrt(np.dot(q_val, q_val))

    prob = np.zeros(len(bib))
    if pytools:
        progress = pytools.ProgressBar("Analysing", len(bib))
        progress.draw()

    for ind, entry in enumerate(bib):
        if ind != aid and ('abstract' in bib[ind].keys()):
            abstract = nltk.wordpunct_tokenize(bib[ind]['abstract'] + " " + bib[ind]['title'])
            r_vec = sorted([x[0] for x in nltk.pos_tag(abstract) if x[1] in ("NN")])
            r_val = np.zeros(len(q_val))
            for w in r_vec:
                w = cleanword(w)
                if w in q_vec:
                    r_val[q_vec.index(w)] += 1
            mod = np.dot(r_val, r_val)
            if mod > 0:
                prob[ind] = np.dot(r_val / np.sqrt(mod), q_val)
        if pytools:
            progress.progress()
    if pytools:
        print ""

    # sort based on probability (best first)
    inds_sort = np.argsort(prob)[::-1]

    print 'similar papers to:\n\t%s\n\t\tby: %s\n' % (bib[aid]['title'], bib[aid]['author'])
    for i in range(10):
        best = inds_sort[i]
        print '%3d.\t%s\n\t\tby: %s\n\t\tid = %3d, prob = %f\n' % (
            i + 1, bib[best]['title'], bib[best]['author'], best, prob[best])
def feedTech(request):
    corpus = []
    titles = []
    ct = -1
    for feed in feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
            words = nltk.wordpunct_tokenize((e['description']))
            words.extend(nltk.wordpunct_tokenize(e['title']))
            lowerwords = [x.lower() for x in words if len(x) > 1]
            ct += 1
            print(ct, "TITLE", e['title'])
            corpus.append(lowerwords)
            titles.append(e['title'])
    return render(request, 'dash/feeds.html')
def tag_files_for_cross_validation(file_list, tmp_models):
    # first clean the CV files folders
    if os.path.exists(CV_FILES_PATH_DEFAULT):
        shutil.rmtree(CV_FILES_PATH_DEFAULT)
    if os.path.exists(CV_FILES_PATH_PUNCT):
        shutil.rmtree(CV_FILES_PATH_PUNCT)
    if os.path.exists(CV_FILES_PATH_LOWER):
        shutil.rmtree(CV_FILES_PATH_LOWER)
    if os.path.exists(CV_FILES_PATH_LOWER_PUNCT):
        shutil.rmtree(CV_FILES_PATH_LOWER_PUNCT)

    # then create new CV folders
    os.makedirs(CV_FILES_PATH_DEFAULT)
    os.makedirs(CV_FILES_PATH_PUNCT)
    os.makedirs(CV_FILES_PATH_LOWER)
    os.makedirs(CV_FILES_PATH_LOWER_PUNCT)

    for file_name in file_list:
        path = ORIGINAL_STORIES + '/' + file_name + '.txt'
        if not os.path.isfile(path):
            print('File ' + path + ' does not exist!')
            continue

        content = get_content(path)
        content_lower = content.lower()

        tokenized_content = nltk.wordpunct_tokenize(content)
        tokenized_content_punct = nltk.word_tokenize(content)
        tokenized_content_lower = nltk.wordpunct_tokenize(content_lower)
        tokenized_content_lower_punct = nltk.word_tokenize(content_lower)

        tagged_content = tag_tokens_with_model(tokenized_content, tmp_models.default,
                                               lowercase=False, message=False)
        tagged_file_path = CV_FILES_PATH_DEFAULT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_punct, tmp_models.punct,
                                               lowercase=False, message=False)
        tagged_file_path = CV_FILES_PATH_PUNCT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_lower, tmp_models.lower,
                                               lowercase=True, message=False)
        tagged_file_path = CV_FILES_PATH_LOWER + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)

        tagged_content = tag_tokens_with_model(tokenized_content_lower_punct, tmp_models.lower_punct,
                                               lowercase=True, message=False)
        tagged_file_path = CV_FILES_PATH_LOWER_PUNCT + '/' + file_name + '.tsv'
        write_tagged_content_to_file(tagged_content, tagged_file_path, message=False)
def main():
    text = open('holmes.txt').read()
    tokens = nltk.wordpunct_tokenize(text)

    charList = []
    for word in tokens:
        for char in word:
            charList.append(char)

    fDistChars = nltk.FreqDist(charList)
    fDistWords = nltk.FreqDist(tokens)

    print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(
        len(fDistChars), sorted(fDistChars)))
    print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(
        len(fDistWords), sorted(fDistWords)))

    bigramChars = nltk.bigrams(charList)
    trigramChars = nltk.trigrams(charList)
    print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(charList), most_common(bigramChars), most_common(trigramChars)))

    bigramWords = nltk.bigrams(tokens)
    trigramWords = nltk.trigrams(tokens)
    print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(tokens), most_common(bigramWords), most_common(trigramWords)))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scoredPMI = finder.score_ngrams(bigram_measures.pmi)
    scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
    print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}".format(
        scoredPMI[:20], scoredCHI[:20]))

    print("\nSpearmans correlation = {}".format(
        nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
def calculate_language_scores(text):
    """
    Calculate the likelihood that the given text is written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in the analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    # Count, for each language included in NLTK, the unique stopwords seen in the text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
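# Hedged usage sketch: assuming calculate_language_scores above and the NLTK stopwords
# corpus downloaded, the language with the highest stopword overlap can be picked with max().
sample = "Ceci est un petit exemple de texte pour la detection de la langue."
scores = calculate_language_scores(sample)
print(max(scores, key=scores.get))  # expected to favour 'french' for this sample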
def descripsToWords(dataFrameOfWords):
    # initial parsing
    tokens = nltk.wordpunct_tokenize(' '.join(dataFrameOfWords))
    text = nltk.Text(tokens)
    words = [w.lower() for w in text]
    vocab = sorted(set(words))

    # remove words in the removeWords list and punctuation
    removeWords = {'rosario'}  # just an example
    filtered_words = [word for word in words if word not in removeWords]
    filtered_words = [w for w in filtered_words if w.isalnum()]
    words = filtered_words

    # check for valid English
    import enchant
    d = enchant.Dict("en_US")
    wordsValid = []
    for w in words:
        if d.check(w):
            wordsValid.append(w)
        else:
            wordsValid.append(d.suggest(w)[0])
    words = wordsValid

    return words, vocab
def get_words(text, min_length=None, max_length=None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)
    """
    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower()
        for word in wordpunct_tokenize(text)
        if (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }
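# Hedged usage example for get_words (assumes `from nltk import wordpunct_tokenize`):
sample = "The quick brown fox jumps over the lazy dog."
print(get_words(sample, min_length=4, max_length=8))
# -> a set such as {'quick', 'brown', 'jumps', 'over', 'lazy'} (set order is arbitrary)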
def statScore(self, text, d_index):
    # `self` added to the signature: the body reads self.stat_lte, so this is
    # presumably a method whose first parameter was dropped.
    tokens = nltk.wordpunct_tokenize(text)
    val = 0
    for token in tokens:
        w_index = vocabulary.index(token)
        val = val + self.stat_lte[w_index][d_index]
    return val
def translateHinglishTweets(tweets_text):
    counter = 0
    tweets_text_translated = []
    n = len(tweets_text)

    open_file = open("dictionary.pickle", "rb")
    dictionary = pickle.load(open_file)
    open_file.close()

    english_stopwords_set = set(stopwords.words('english'))

    for i in range(n):
        text = tweets_text[i]
        translated_text = ""
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        for word in words:
            if word in english_stopwords_set:
                translated_text = translated_text + " " + word
            elif word in dictionary:
                # print word + "-" + dictionary[word]
                translated_text = translated_text + " " + dictionary[word]
                counter = counter + 1
            else:
                translated_text = translated_text + " " + word
        tweets_text_translated.append(translated_text)

    # print counter
    return tweets_text_translated
def convert_to_weka(src, des, voc):
    stemmer = nltk.LancasterStemmer()
    word_reg = re.compile('[0-9A-Za-z]+')

    des.write('@relation review_rate\n')
    des.write('\n')
    for word in voc:
        des.write('@attribute ' + word + ' real\n')
    des.write('@attribute rate {s1,s2,s3,s4,s5}\n')
    des.write('\n')
    des.write('@data\n')

    for line in iter(src.readline, ''):
        feature_vector = []
        try:
            rate, title, review = [item.strip() for item in line.split('\t')[5:8]]
        except (IndexError, ValueError):
            continue

        ws = set([])
        for w in nltk.wordpunct_tokenize(title + ' ' + review):
            m = word_reg.match(w)
            if m:
                ws.add(stemmer.stem(m.group(0).lower()))

        for w in voc:
            if w in ws:
                feature_vector.append('1')
            else:
                feature_vector.append('0')

        des.write(','.join(feature_vector) + ',' + 's' + str(int(math.ceil(float(rate)))) + '\n')
    return
def tokenize(text):
    """This handles tokenizing and normalizing everything."""
    return [
        token.lower()
        for token in nltk.wordpunct_tokenize(text)
        if token.isalnum()
    ]
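# Quick usage example: punctuation is dropped by isalnum(), everything else is lowercased.
print(tokenize("Hello, World! It's 2021."))
# -> ['hello', 'world', 'it', 's', '2021']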
def feature_extractor(data):
    """Extract features from a relation for the classifier."""
    features = dict()
    lmtzr = WordNetLemmatizer()
    h2, h3, paragraph = data

    features['h2_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['h2word_' + word.lower()] = True

    features['h_' + h2.lower()] = True
    for word in h2.split(' '):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features['hword_' + word.lower()] = True

    if h3 != None:
        features['h3_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['h3word_' + word.lower()] = True

        features['h_' + h3.lower()] = True
        for word in h3.split(' '):
            if word.lower() not in stopwords.words('english') and len(word) > 1:
                features['hword_' + word.lower()] = True

    for word in nltk.wordpunct_tokenize(paragraph):
        if word.lower() not in stopwords.words('english') and len(word) > 1:
            features[word] = True
            features['lower_' + word.lower()] = True
            features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True

    return features
def get_vocabulary(utterances):
    token_list = []
    for utt in utterances:
        utt_content = utt[0]
        token_list += nltk.wordpunct_tokenize(utt_content)
    token_set = set(token_list)
    return token_set
def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
def write_to_mod_html_file(sentences, locs, tex):
    global count
    g_dic = group_locs_by_sentences(locs)
    ll = []
    for l in g_dic.keys():
        ll.append(l)
    ll.sort(cmp=cmp_by_ind)
    for (x, y) in ll:
        l = g_dic[(x, y)]
        sen = sentences[x]
        slash_n_split = sen.splitlines()
        wds = reg_remove_special_chars.sub(r' ', slash_n_split[y])
        words = nltk.wordpunct_tokenize(wds)
        l.sort(cmp=cmp_by_ind)
        for (h, k) in l:
            words[h] = """<i style="color:red">""" + words[h]
            words[k] = words[k] + '</i>'
        words = ' '.join(words)
        slash_n_split[y] = words
        sentences[x] = '\n'.join(slash_n_split)
    t = '\n'.join(sentences)
    f = open('html/%d_mod.html' % count, "w")
    t = reg_replace_slashn.sub(r'<br/>', t)
    f.write(t)
    f.close()
    count += 1
def word_feats(words):
    feats = {}
    words = words.strip()
    hasbadw = 0
    hasyou = 0
    sentences = 0
    for sentense in re.split(r' *[\.\?!]["\)\]]* *', words):
        sentences += 1
        for word in nltk.wordpunct_tokenize(sentense):
            for curse in badwords:
                if word.lower().endswith(curse.lower()) or word.lower().startswith(curse.lower()):
                    hasbadw += 1
                    break
            if word.lower() in ("you", "u", "ur", "your", "urs", "urz", "yours"):
                hasyou += 1
    feats["you"] = hasyou
    feats["badw"] = hasbadw
    feats["length"] = len(words)
    feats["caps"] = len(re.findall('[A-Z]', words))
    feats["smalls"] = len(re.findall('[a-z]', words))
    feats["sentences"] = sentences
    feats["capsratio"] = float(feats["caps"]) / len(words)

    featslist = []
    for k, v in feats.iteritems():
        featslist.append(v)
    return featslist
def requirementAnalysis(fileArchimate=None):
    if fileArchimate is None:
        fileArchimate = u"/Users/morrj140/Documents/SolutionEngineering/Archimate Models/DVC v38.archimate"

    al = ArchiLib(fileArchimate)

    conceptsFile = fileConceptsRequirements

    searchTypes = list()
    searchTypes.append(u"archimate:Requirement")
    nl = al.getTypeNodes(searchTypes)

    logger.info(u"Find Words in Requirements...")
    concepts = Concepts(u"Requirement", u"Requirements")

    n = 0
    for sentence in nl:
        n += 1
        logger.debug(u"%s" % sentence)

        c = concepts.addConceptKeyType(u"Document" + str(n), u"Document")
        d = c.addConceptKeyType(sentence, u"Sentence" + str(n))

        if True and sentence is not None:
            cleanSentence = ' '.join([word for word in sentence.split(u" ") if word not in stop])
            for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(cleanSentence)):
                if len(word) > 1 and pos[0] == u"N":
                    e = d.addConceptKeyType(word, u"Word")
                    f = e.addConceptKeyType(pos, u"POS")

    Concepts.saveConcepts(concepts, conceptsFile)
    logger.info(u"Saved : %s" % conceptsFile)

    chunks = Chunks(concepts)
    chunks.createChunks()
def parse(self):
    # Creates a single string from list_strings
    self.word_list.append(' '.join(self.list_strings))

    # Splits the single string into sentences
    self.word_list = nltk.sent_tokenize(str(self.word_list[0]))

    # Separates punctuation
    for sentence in self.word_list:
        self.word_list = nltk.wordpunct_tokenize(sentence.lower())

    # Remove all stop words in big_string
    self.word_list = [w for w in self.word_list if w not in self.stop_words]
    # print "List with stopwords removed: " + str(self.word_list)

    # The stemmer is used to normalize adjectives, adverbs, and verbs, and to make
    # sure that plural and singular forms of a word become the same token.
    stemmer = nltk.PorterStemmer()
    self.word_list = [stemmer.stem(word) for word in self.word_list]

    # Removes the unicode formatting produced by the stemmer
    self.word_list = [str(word) for word in self.word_list]

    # Creates a frequency distribution based on words in self.word_list
    fdist = nltk.FreqDist(self.word_list)
    fdist = fdist.most_common(self.top_n)
    return fdist
def findWinners(tweeters, categories):
    awardResult = {}
    THRESHOLD = 200
    awardPat = re.compile("best .*", re.IGNORECASE)
    winnerPat = re.compile(".*win.*", re.IGNORECASE)

    for twtr in tweeters:
        tweets = twtr.tweets
        for tweet in tweets:
            if winnerPat.match(tweet.text):
                cleanTweet = sanitizeTweet(tweet.text)
                award = awardPat.search(cleanTweet)
                if award:
                    properNoun = []
                    firstHalfOfTweet = re.search("(?i).*(?=win)", cleanTweet)
                    tokenizedText = nltk.wordpunct_tokenize(firstHalfOfTweet.group())
                    if tokenizedText:
                        properNoun = extractProperNouns(tokenizedText)
                    award = sanitizeAwardName(award.group())
                    mostSimilarAward = findSimilarCategory(award, categories)
                    if mostSimilarAward in awardResult:
                        awardResult[mostSimilarAward] += properNoun
                    else:
                        awardResult[mostSimilarAward] = properNoun
                    THRESHOLD = THRESHOLD - 1
                    if THRESHOLD < 1:
                        print("THRESHOLD MET")
                        break
    sanitizeAwardResult(awardResult)
def findBestWorstDress(tweeters):
    possibleBestDress = []
    possibleWorstDress = []
    bestDressPat = re.compile(".*best dress.*", re.IGNORECASE)
    worstDressPat = re.compile(".*worst dress.*", re.IGNORECASE)
    pat = ""

    for twtr in tweeters:
        for twt in twtr.tweets:
            properNoun = []
            if bestDressPat.match(twt.text):
                pat = "best"
            elif worstDressPat.match(twt.text):
                pat = "worst"
            else:
                continue

            firstHalfOfTweet = re.search("(?i).*(?=%s)" % pat, twt.text)
            tokenizedText = nltk.wordpunct_tokenize(firstHalfOfTweet.group())
            if tokenizedText:
                properNoun = extractProperNouns(tokenizedText)
            for pn in properNoun:
                if len(pn.split()) == 2:
                    if pat == 'best':
                        possibleBestDress.append(pn)
                    else:
                        possibleWorstDress.append(pn)

    bestData = collections.Counter(possibleBestDress)
    worstData = collections.Counter(possibleWorstDress)

    print("\n\nList of Best Dressed:\n========================")
    for host in bestData.most_common()[0:5]:
        print(host[0])

    print("\n\nList of Worst Dressed:\n========================")
    for host in worstData.most_common()[0:5]:
        print(host[0])
def _calculate_languages_ratios(text):
    """
    Calculate the likelihood that the given text is written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in the analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in nltk, compute the number of unique stopwords
    # appearing in the analyzed text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def get_bigram_dict(filename):
    input_file = codecs.open(filename, 'r', encoding='utf8')
    content = input_file.read()
    dic = {}
    tokens = nltk.wordpunct_tokenize(content)
    finder = BigramCollocationFinder.from_words(tokens)
    return finder.ngram_fd
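# Hedged usage sketch: 'corpus.txt' is a hypothetical UTF-8 file path. finder.ngram_fd is
# an nltk FreqDist, so it can be iterated like a dict of (word1, word2) -> count.
bigram_freqs = get_bigram_dict('corpus.txt')
for (w1, w2), freq in sorted(bigram_freqs.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(w1, w2, freq)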
def findPresenters(twtrs):
    possiblePresenters = {}
    patterns = ["presenting an award", "presenting for best", "presenting best",
                "presents .* best", "presenting at the", "presents at the",
                "is presenting"]

    for twtr in twtrs:
        for twt in twtr.tweets:
            text = twt.text
            for pattern in patterns:
                rePat = re.compile(".* %s .*" % pattern, re.IGNORECASE)
                if rePat.match(text):
                    cleanText = re.search("(?i).*(?=%s)" % pattern, text).group()
                    cleanText = sanitizeTweetForPresenters(cleanText)
                    if cleanText:
                        properNouns = extractProperNouns(nltk.wordpunct_tokenize(cleanText))
                        for properNoun in properNouns:
                            properNoun = sanitizeSlang(properNoun)
                            if len(properNoun.split()) >= 2 and not properNoun.isupper():
                                if properNoun not in possiblePresenters:
                                    possiblePresenters[properNoun] = twtr.score
                                else:
                                    possiblePresenters[properNoun] = possiblePresenters[properNoun] + twtr.score
                    break

    sorted_presenters = OrderedDict(sorted(possiblePresenters.items(),
                                           key=lambda possiblePresenters: possiblePresenters[1],
                                           reverse=True))

    print("\n\nList of Presenters:\n========================")
    for presenter in sorted_presenters.keys():
        if sorted_presenters[presenter] > 0:
            print(presenter, sorted_presenters[presenter])
def classify(url):
    try:
        url = url.replace("*", "/")
        complete_data = []
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html5lib')
        for script in soup(["script", "style"]):
            script.extract()
        raw_data = soup.get_text()

        words = set(nltk.corpus.words.words())
        raw_data = " ".join(w for w in nltk.wordpunct_tokenize(raw_data) if w.lower() in words)

        nlp = spacy.load("en")
        file_text = nlp(raw_data)
        words = [
            token.lemma_ for token in file_text
            if not token.is_punct and not token.like_num
            and not token.is_space and not token.is_stop
        ]
        strip_data = [
            token.lower() for token in words
            if not len(token.strip()) < 2 and not len(token.strip()) > 15
        ]

        if len(strip_data) > 30:
            frequencies_words = FreqDist(strip_data).most_common(100)
            words_most_frequent = [word[0] for word in frequencies_words]
            untokenize_data = TreebankWordDetokenizer().detokenize(strip_data)
            complete_data.append(untokenize_data)

            vocabalary = pickle.load(open(configuration.vocabulary_path, "rb"))
            data = vocabalary.transform(complete_data)
            with open(configuration.classifier_model_path, 'rb') as fid:
                model_load = cPickle.load(fid)
            y_predict = model_load.predict(data)
            array_percentage = model_load.predict_proba(data)
            array_percentage = array_percentage * 100
            print(array_percentage[:].round(2))

            file = open(configuration.website_category_path, "r+")
            output = file.read()
            dic = json.loads(output)
            file.close()

            target_dict = {}
            category_url_list = []
            for key, value in dic.items():
                target_dict[int(key)] = value
                category_url_list.append(value)
            print(target_dict)
            print(category_url_list)

            result_percent = array_percentage[:, y_predict[0] - 1][0]
            result_percent = result_percent.round(2)

            category_hint = ("You can add your own category for your website. If the name of the "
                             "category is present in the list below, use that name. Otherwise create "
                             "your own in this format: http://127.0.0.1:8000/(url)?category=(category). "
                             "For example: http://127.0.0.1:8000/https:**www.mdpi.com*journal*agriculture?category=agriculture")

            if result_percent > 30:
                if len(strip_data) < 500:
                    return (
                        str(target_dict[y_predict[0]]),
                        "Note: Classification result may be inaccurate due to minimal content in the website and its accuracy is "
                        + str(result_percent) + " %",
                        category_hint,
                        category_url_list,
                        words_most_frequent)
                else:
                    return (
                        target_dict[y_predict[0]],
                        "Accuracy of the classification is " + str(result_percent) + " %",
                        category_hint,
                        category_url_list,
                        words_most_frequent)
            else:
                return (
                    "Given website is not related to space, job portal, adult, animals, news category",
                    "It may be related to " + str(target_dict[y_predict[0]]) + " and its accuracy is "
                    + str(result_percent) + " %",
                    category_hint,
                    category_url_list,
                    words_most_frequent)
        else:
            return (
                "Can't extract content from the website",
                "Site may be invalid or unavailable or have very little content",
                "Not available", "Not available", "Not available")
    except Exception as e:
        return ("Facing error while parsing the website", e,
                "Not available", "Not available", "Not available")
# Encode / decode
type(html)
html = html.decode()
type(html)

# In[34]:
# Clean the corpus to get plain text.
# raw = nltk.clean_html(html)  -- no longer works; use the function provided by bs4 instead
# http://www.crummy.com/software/BeautifulSoup
raw = BeautifulSoup(html).get_text()
tokens = nltk.wordpunct_tokenize(raw)
# print(tokens)
# print(type(tokens))
raw1 = raw[750:23506]
# print(raw1)
text = nltk.Text(tokens)
words = [w.lower() for w in text]
vocab = sorted(set(words))
print('word:', vocab, len(words), '++', len(vocab))
print('token:', tokens, len(tokens))

# In[19]:
tokens = tokens[96:399]
import nltk
from nltk import word_tokenize, wordpunct_tokenize
from nltk.util import ngrams

classifier = nltk.data.load("classifiers/plusminus.pickle")

openfile = open('hello.txt', 'r')
twe = openfile.read()

words = wordpunct_tokenize(twe)
feats = dict([(word, True) for word in words + ngrams(words, 2)])
xa = classifier.classify(feats)

if xa == 'pos':
    classifier = nltk.data.load("classifiers/happyfunny.pickle")
    posi = classifier.classify(feats)
    if posi == 'pos':
        openfile = open('mood.js', 'w')
        openfile.write('var md=1;')
    elif posi == 'neg':
        openfile = open('mood.js', 'w')
        openfile.write('var md=2;')
elif xa == 'neg':
    classifier = nltk.data.load("classifiers/sadangry.pickle")
    posi = classifier.classify(feats)
    if posi == 'pos':
        openfile = open('mood.js', 'w')
        openfile.write('var md=3;')
    elif posi == 'neg':
        openfile = open('mood.js', 'w')
        openfile.write('var md=4;')
def tokenize_text(input_str: str = "") -> list: return nltk.wordpunct_tokenize(input_str)
from nltk import wordpunct_tokenize, pos_tag, ne_chunk
import nltk

nltk.download('maxent_ne_chunker')
nltk.download('words')

f = open('input.txt', 'r', encoding='utf-8')
input = f.read()

stokens = nltk.sent_tokenize(input)
for i in stokens:
    print(ne_chunk(pos_tag(wordpunct_tokenize(i))))
stopwords = [
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
    'is', 'are', 'be', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
    'but', 'if', 'or', 'as', 'of', 'at', 'by', 'for', 'with', 'about',
    'above', 'below', 'into', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
    'off', 'over', 'under', 'then', 'here', 'there', 'when', 'where', 'all',
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', 'should', 'i', 'me', 'my', 'myself', 'you', 'your',
    'yourself', 'we', 'us', 'ourselves', 'ourself', 'll', 've', 'd', 're', 'm'
]

# preprocessing
tokens = nltk.wordpunct_tokenize(engltext.decode('utf8'))
text = nltk.Text(tokens)
englwords = [w.lower() for w in text if w.isalpha()]
englwords = [w for w in englwords if w not in stopwords]
englwords = nltk.pos_tag(englwords)

# lemmatization of the input text
wordnet_lemmatizer = WordNetLemmatizer()
engltextlemmas = []
for w, pos in englwords:
    if get_wordnet_pos(pos):
        engltextlemmas.append(wordnet_lemmatizer.lemmatize(w, get_wordnet_pos(pos)))
    else:
        engltextlemmas.append(wordnet_lemmatizer.lemmatize(w))
def stemWords(text):
    tokens = nltk.wordpunct_tokenize(text)
    return ' '.join([self.stemmer.stem(t).lower() for t in tokens])
def rmStopWords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.wordpunct_tokenize(text)
    nostop = [t for t in tokens if t not in stopwords]
    return ' '.join(nostop)
def is_english(self, text):
    text = text.lower()
    words = set(nltk.wordpunct_tokenize(text))
    return len(words & self.ENGLISH_STOPWORDS) > len(words & self.NON_ENGLISH_STOPWORDS)
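# The two stopword sets used by is_english are not shown above; a plausible way to build
# them from the NLTK stopwords corpus (an assumption, not the original class attributes):
from nltk.corpus import stopwords

ENGLISH_STOPWORDS = set(stopwords.words('english'))
NON_ENGLISH_STOPWORDS = set(stopwords.words()) - ENGLISH_STOPWORDS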
from nltk import ngrams, ne_chunk, wordpunct_tokenize, pos_tag

with open('output.txt', 'r', encoding='utf-8') as f:
    raw = f.read()

# Tokenization
wtokens = nltk.word_tokenize(raw)
words = [word.lower() for word in wtokens if word.isalpha()]
print(words)

# Adding POS tags
print(nltk.pos_tag(words))

lStem = LancasterStemmer()
print("Lancaster Stemming :----------------------------------------------------- \n")
for tok in words:
    print(lStem.stem(str(tok)))

lemmatizer = WordNetLemmatizer()
print("Lemmatization ------------------------------------------------------------:\n")
for tok in words:
    print(lemmatizer.lemmatize(str(tok)))

print("Trigrams --------------------------------------------:\n")
trigram = []
x = 0
trigram.append(list(ngrams(words, 3)))
print(trigram)

print("NER-------------------------------------\n")
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(words)))))
def do_process(file_list):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    docs = {}
    i = 1
    for file in file_list:
        lowercased = file.lower()
        # words = re.findall(r'[a-zA-Z]+[\w\']+', lowercased)
        words = nltk.wordpunct_tokenize(lowercased)
        # words = [w for w in words if (w.isalpha() and len(w) != 1 and w not in stop_words)]
        words = [w for w in words if (w.isalpha() and len(w) != 1)]
        words_set = sorted(set(words))
        word_dic = {}
        for word in words_set:
            word_dic[word] = words.count(word)
        word_sorted = collections.OrderedDict(sorted(word_dic.items()))
        # word_sorted = sorted(word_hashmap.items(), key=operator.itemgetter(0))
        # headers = ['term', 'frequency']
        # print(tabulate(word_hashmap_sorted, headers=headers))
        doc_name = "document%d" % i + ":"
        i += 1
        docs[doc_name] = word_sorted
        # print("\n## " + doc_name)
        # print("## full length %d" % len(words))
        # print("## set length %d" % len(words_set) + "\n")

    f = open('answers.txt', 'w+')
    # index 140323F
    k = 23
    docs = collections.OrderedDict(sorted(docs.items()))

    f.write("140323F\n")

    f.write('1\n')
    for key, value in docs.items():
        s = key + "%d" % len(value)
        f.write(s + "\n")
    f.write("\n")

    f.write('2\n')
    for key, value in docs.items():
        kth_key = list(value)[k]
        s = key + "" + kth_key + ",%.3f" % tf_w(value[kth_key])
        f.write(s + "\n")
    f.write("\n")

    f.write('3\n')
    for key, value in docs.items():
        kth_key = list(value)[k]
        s = key + "" + kth_key + ",%.3f" % idf(kth_key, docs)
        f.write(s + "\n")
    f.write("\n")

    f.write('4\n')
    for key, value in docs.items():
        sorted_tf_idf_tuple_list = sorted_by_tf_idf(value, docs)
        i = 1
        s = key
        for item in sorted_tf_idf_tuple_list:
            if i < 11:
                s += item[0]
                if i != 10:
                    s += ","
                i += 1
                # print(item[0], item[1])
            else:
                continue
        f.write(s + "\n")
    f.close()

    f = open('answers.txt', 'r')
    written = f.read()
    f.close()
    print("wrote to file\n\n" + written)
def dummy_reader():
    article = dummy_articles().next()
    reader = Mock()
    nltkwrapper.PlaintextCorpusReader = Mock(return_value=reader)
    reader.words.return_value = nltk.wordpunct_tokenize(article)
    reader.sents.return_value = [nltk.wordpunct_tokenize(article)
                                 for sent in nltk.sent_tokenize(article)]
def tokenizator(html):
    page_content = BeautifulSoup(html).get_text()
    result = list(nltk.wordpunct_tokenize(page_content))
    result = minus_znak_prep(result)
    result = list(filter(minus_incorrect_sym, result))
    return result
data = """When forty winters shall besiege thy brow, And dig deep trenches in thy beauty's field, Thy youth's proud livery so gazed on now, Will be a totter'd weed of small worth held: Then being asked, where all thy beauty lies, Where all the treasure of thy lusty days; To say, within thine own deep sunken eyes, Were an all-eating shame, and thriftless praise. How much more praise deserv'd thy beauty's use, If thou couldst answer 'This fair child of mine Shall sum my count, and make my old excuse,' Proving his beauty by succession thine! This were to be new made when thou art old, And see thy blood warm when thou feel'st it cold.""" cut_tokens = nltk.wordpunct_tokenize(data) tokens = list() stop_words.add('.') stop_words.add('?') stop_words.add(')') stop_words.add(').') stop_words.add('(') stop_words.add('.(') stop_words.add(',') stop_words.add('/') stop_words.add('-') stop_words.add('_') stop_words.add('+') stop_words.add('$') stop_words.add('&') stop_words.add('!')
def remove_non_english_words(text):
    words = set(nltk.corpus.words.words())
    result = " ".join(w for w in nltk.wordpunct_tokenize(text)
                      if w.lower() in words or not w.isalpha())
    return result
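# Hedged usage example (assumes the NLTK 'words' corpus is downloaded). Alphabetic tokens
# whose lowercase form is not in the NLTK word list (e.g. gibberish like 'qzxwv') are
# dropped, while numbers and punctuation survive via the `not w.isalpha()` clause.
print(remove_non_english_words("some qzxwv text from 2019"))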
def has_blog_candidate(description):
    words = set(nltk.wordpunct_tokenize(description))
    return len(blog_keywords & words) > 0
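# blog_keywords is not defined in this snippet; a plausible stand-in set, purely an
# assumption for illustration:
blog_keywords = {'blog', 'blogger', 'blogging', 'wordpress', 'tumblr'}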
def tokenize(data): return nltk.wordpunct_tokenize(data)
def extract(self, text):
    tokens = nltk.wordpunct_tokenize(text)
    result = []
    for t in tokens:
        result.append((0, 0, self._stemmer.stem(t), 1.0))
    return result
for row in reader:
    documents.append(row[3])

documents = documents[:100]

filtered_text = []
stop_words = stopwords.words("russian")
stop_words.extend(['rt'])
ps = PorterStemmer()
wnl = WordNetLemmatizer()

for d in documents:
    d = strip_all_entities(strip_links(d.lower()))
    tokens = nltk.wordpunct_tokenize(d)
    filtered_tokens = [
        w for w in tokens if (w not in stop_words and not di.check(w))
    ]
    stemm_tokens = [(ps.stem(w)) for w in filtered_tokens]
    filtered_text.append(stemm_tokens)

words = []
words.append("")
words_count = []
words_count.append(0)

for tokens in filtered_text:
    for token in tokens:
        if token not in words:
            words.append(token)
def get_LDA(self):
    articles = self.petitionDocs
    cur = articles.find({"GT"}, no_cursor_timeout=True)
    cur1 = cur.sort('user_id', 1)
    cursor = cur1.limit(int(self.Eighty))

    doc_complete = []
    doc_clean = []
    doc_completeT = []
    doc_cleanT = []
    tList = []
    tListT = []
    count = 0

    exclude = set(string.punctuation)
    stop = set(stopwords.words('english'))
    lines = open("stop3").read().splitlines()
    for word in lines:
        print word
        stop.add(word)

    mysqlStop = ["a", "about", "above", "above", "across", "after", "afterwards", "again",
                 "against", "all", "almost", "alone", "along", "already", "also", "although",
                 "always", "am", "among", "amongst", "amoungst", "amount", "an", "and",
                 "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere",
                 "are", "around", "as", "at", "back", "be", "became", "because", "become",
                 "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
                 "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom",
                 "but", "by", "call", "can", "cannot", "cant", "co", "con", "could",
                 "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due",
                 "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
                 "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything",
                 "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire",
                 "first", "five", "for", "former", "formerly", "forty", "found", "four",
                 "from", "front", "full", "further", "get", "give", "go", "had", "has",
                 "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby",
                 "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how",
                 "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into",
                 "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
                 "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill",
                 "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my",
                 "myself", "name", "namely", "neither", "never", "nevertheless", "next",
                 "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now",
                 "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or",
                 "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over",
                 "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same",
                 "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she",
                 "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
                 "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
                 "still", "such", "system", "take", "ten", "than", "that", "the", "their",
                 "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
                 "therefore", "therein", "thereupon", "these", "they", "thickv", "thin",
                 "third", "this", "those", "though", "three", "through", "throughout",
                 "thru", "thus", "to", "together", "too", "top", "toward", "towards",
                 "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
                 "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
                 "whence", "whenever", "where", "whereafter", "whereas", "whereby",
                 "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
                 "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
                 "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves",
                 "the"]
    for word in mysqlStop:
        stop.add(word)

    for p in cursor:
        # strip HTML tags from the petition text
        strip = strip_tags(p['title'].encode('UTF-8') + " " + p['overview'].encode('UTF-8'))
        doc_complete.append(strip)
        strip1 = TextCleaning.cleanURLEmailMention(self, strip)
        words = set(nltk.corpus.words.words())
        EnCleanedDoc = " ".join(w for w in nltk.wordpunct_tokenize(strip1)
                                if w.lower() in words or not w.isalpha())
        EnCleanedDoc = unicode(EnCleanedDoc, errors='ignore')
        text = nltk.word_tokenize(EnCleanedDoc)
        posTag = nltk.pos_tag(text)
        countAdjAdv = 0
        countNounVerb = 0
        for cat in posTag:
            if cat[1] in verbNoun:
                countNounVerb += 1
            elif cat[1] in adverbAdjectives:
                countAdjAdv += 1
        if countNounVerb == 0:
            expressivness = 0
        else:
            expressivness = float(countAdjAdv) / float(countNounVerb)
        cleaneddoc = TextCleaning.clean(self, EnCleanedDoc, stop, exclude)
        doc_clean.append(cleaneddoc)
        tList.append(p['petition_id'])
        count += 1
        self.petitionDocs.update({"petition_id": p['petition_id']},
                                 {"$set": {"LDA_cleanedDescription": EnCleanedDoc,
                                           "expressivness": expressivness}},
                                 False, False)

    # list of tokenized documents
    doc_tok = [doc.split() for doc in doc_clean]

    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_tok)

    # Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_tok]

    # Prepare the testing dataset
    countT = 0
    cur = articles.find({}, no_cursor_timeout=True)
    cur1 = cur.sort('user_id', -1)
    cursor = cur1.limit(int(self.Twenty))
    for p in cursor:
        # strip HTML tags from the petition text
        strip = strip_tags(p['title'].encode('UTF-8') + " " + p['overview'].encode('UTF-8'))
        doc_completeT.append(strip)
        strip1 = TextCleaning.cleanURLEmailMention(self, strip)
        words = set(nltk.corpus.words.words())
        EnCleanedDoc = " ".join(w for w in nltk.wordpunct_tokenize(strip1)
                                if w.lower() in words or not w.isalpha())
        EnCleanedDoc = unicode(EnCleanedDoc, errors='ignore')
        text = nltk.word_tokenize(EnCleanedDoc)
        posTag = nltk.pos_tag(text)
        countAdjAdv = 0
        countNounVerb = 0
        for cat in posTag:
            if cat[1] in verbNoun:
                countNounVerb += 1
            elif cat[1] in adverbAdjectives:
                countAdjAdv += 1
        if countNounVerb == 0:
            expressivness = 0
        else:
            expressivness = float(countAdjAdv) / float(countNounVerb)
        cleaneddoc = TextCleaning.clean(self, EnCleanedDoc, stop, exclude)
        doc_cleanT.append(cleaneddoc)
        tListT.append(p['petition_id'])
        countT += 1
        self.petitionDocs.update({"petition_id": p['petition_id']},
                                 {"$set": {"LDA_cleanedDescription": EnCleanedDoc,
                                           "expressivness": expressivness}},
                                 False, False)

    # list of tokenized test documents
    doc_tokT = [doc.split() for doc in doc_cleanT]

    # Creating the term dictionary of the test corpus, where every unique term is assigned an index.
    dictionaryT = corpora.Dictionary(doc_tokT)

    # Converting the list of test documents into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrixT = [dictionaryT.doc2bow(doc) for doc in doc_tokT]

    # Creating the object for the LDA model using the gensim library
    Lda = gensim.models.ldamodel.LdaModel

    # Running LDA with different numbers of topics and keeping the lowest perplexity
    topics = [10, 30, 40, 50, 80, 100]
    perplexity = []
    for top in topics:
        # Running and training the LDA model on the document-term matrix.
        ldamodel = Lda(doc_term_matrix, num_topics=top, id2word=dictionary, passes=50)
        LDAOut = ldamodel.print_topics(num_topics=top, num_words=10)
        perplex = Lda.bound(ldamodel, doc_term_matrixT)
        with open('LDAout' + str(top) + 'topics.txt', 'w') as f:
            print >> f, "-----------Run for " + str(top) + " topics ---------------------------------------------"
            print >> f, LDAOut
            print >> f, "-------------------------------------------------------------------------"
            print >> f, "perplexity =" + str(perplex)
            print >> f, "-----------Run for " + str(top) + " topics ---------------------------------------------"
        f.close()
        print "-----------Run for " + str(top) + " topics ---------------------------------------------"
        print LDAOut
        print "-------------------------------------------------------------------------"
        print "perplexity =" + str(perplex)
        print "-----------Run for " + str(top) + " topics ---------------------------------------------"

        # record the perplexity for this number of topics
        perplexity.append(perplex)

        plen = count
        for it in range(0, plen):
            print 'saving tweets topics probability distribution for topic ' + str(top)
            try:
                post = {}
                # Prepare LDA topic scores for each petition
                post['LDA_topic' + str(top)] = ldamodel[doc_term_matrix[it]]
                print ''
                # Update topic score in the database
                self.petitionDocs.update({'petition_id': tList[it]}, {"$set": post}, upsert=False)
                str1 = ''.join(str(post['LDA_topic' + str(top)]))
                print str(tList[it]) + ' has topics ' + str1
            except Exception as e:
                print 'error in setting LDA for tweet: ' + str(tList[it])
                print(e)

        # testing
        plen = countT
        for it in range(0, plen):
            print 'saving tweets topics probability distribution for topic ' + str(top)
            try:
                post = {}
                # Prepare LDA topic scores for each petition
                post['LDA_topic' + str(top)] = ldamodel[doc_term_matrixT[it]]
                print ''
                # Update topic score in the database
                self.petitionDocs.update({'petition_id': tListT[it]}, {"$set": post}, upsert=False)
                str1 = ''.join(str(post['LDA_topic' + str(top)]))
                print str(tListT[it]) + ' has topics ' + str1
            except Exception as e:
                print 'error in setting LDA for tweet: ' + str(tListT[it])
                print(e)

    # Prepare the data
    xArr = np.array(topics)
    yArr = np.array(perplexity)

    # Plot the data
    fig = plt.figure()
    plt.plot(xArr, yArr, label='linear')
    fig.suptitle('Held-out per-word perplexity', fontsize=20)
    plt.xlabel('Number of Topics', fontsize=16)
    plt.ylabel('Perplexity', fontsize=16)

    # Show the plot
    plt.show()
    fig.savefig('perp')
# Write tag's fields to the output
def save(output, tag):
    output.write('{0} {1}\n'.format(tag[0], tag[1]))


if __name__ == '__main__':
    # Read text
    f = open('./data/input.txt', 'r')
    source = f.read()

    # Create a stopwords set from nltk
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    tokens = nltk.wordpunct_tokenize(source)

    # Filter out punctuation and stop words
    tokens = [
        i for i in tokens
        if (i not in string.punctuation and i.lower() not in stop_words)
    ]

    # Analyze each word
    morph = pymorphy2.MorphAnalyzer()

    # Convert tokens to tags
    tags = [morph.parse(i) for i in tokens]

    # Create an ordered dict: key=tag, value=freq
    ordered = collections.OrderedDict(
        sorted(
            # Convert each token to normal form and count frequency
            collections.Counter([t[0].normal_form for t in tags]).items(),
    'http://feeds.reuters.com/reuters/technologyNews',
    'http://www.tweaktown.com/news-feed/'
]

import feedparser
import nltk
from bs4 import BeautifulSoup

corpus = []
titles = []
ct = -1
for feed in feeds:
    d = feedparser.parse(feed)
    for e in d['entries']:
        soup = BeautifulSoup(e['description'])
        words = nltk.wordpunct_tokenize(soup.get_text())
        words.extend(nltk.wordpunct_tokenize(e['title']))
        lowerwords = [x.lower() for x in words if len(x) > 1]
        ct += 1
        print ct, "TITLE", e['title']
        corpus.append(lowerwords)
        titles.append(e['title'])

import math
from operator import itemgetter


def freq(word, document):
    return document.count(word)
body = soup.find('div', {'class': 'mw-parser-output'})
file2.write(str(body.text))

with open('input.txt', 'r', encoding="utf8") as inputData:
    TextData = inputData.read().replace('\n', '')

'''with open('input.txt', encoding='utf8') as data:
    text = data.read().strip()'''

tokens = word_tokenize(TextData)
pos = nltk.pos_tag(tokens)
print(tokens[1:10])
print(pos[1:10])

from nltk.stem import PorterStemmer
ps = PorterStemmer()
for w in tokens:
    print(w, ":", ps.stem(w))

from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
for m in tokens:
    print(m, ":", lem.lemmatize(m))

from nltk import ngrams
trigram = ngrams(TextData.split(), 3)
for gram in trigram:
    print(gram)
print(str(trigram))

from nltk import wordpunct_tokenize, pos_tag, ne_chunk
print(ne_chunk(pos_tag(wordpunct_tokenize(TextData))))
def tokenization(self): self.__sentence = nltk.wordpunct_tokenize(self.__sentence)
def remove_short_words(self, str, length=3):
    """Removes any word shorter than `length` characters (3 by default)."""
    return " ".join(w for w in nltk.wordpunct_tokenize(str) if len(w) >= length)
def remove_stop_words(self, str):
    return " ".join(w for w in nltk.wordpunct_tokenize(str)
                    if w not in self.stpwrds)
import os
import pip
import nltk
from nltk.corpus import wordnet as wn
from collections import defaultdict

# Install googlesearch if not yet installed
if 'googlesearch' not in map(lambda x: x.project_name, pip.get_installed_distributions()):
    os.system('sudo pip install googlesearch')
from googlesearch import GoogleSearch as gs

googleResults = lambda key: gs('%s and' % key).top_results()

sentenceToWords = lambda sent: [w.lower() for w in
                                nltk.Text(nltk.wordpunct_tokenize(sent))
                                if w.isalpha() and len(w) > 1]

# Get the adjectives
adjs = [synset.lemma_names()[0] for synset in list(wn.all_synsets(wn.ADJ))]

'''
Get the words that follow the key, e.g. "interesting" as in
"weird and interesting" with "weird" as the key.
'''
def similarWords(key):
    rtn = []
    query = '%s and ' % (key)
    search = gs(query).top_results()
    for result in search:
        content = sentenceToWords(result['content'])
for item in tribute_text:
    item = remove_tags(str(item))
    item = item.replace('\n', ' ')
    cleaned_tribute_text.append(item)
# print(cleaned_tribute_text)

english_story_text = []
english_tribute_text = []
words = set(nltk.corpus.words.words())

for item in cleaned_story_text:
    # print(item)
    curr = " ".join(w for w in nltk.wordpunct_tokenize(item)
                    if w.lower() in words or not w.isalpha())
    english_story_text.append(curr)

print(cleaned_story_text[0:10])
print(english_story_text[0:10])

stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
def get_nnp_ngrams(self, original_text, highlight=5, minsize=0):
    keywords_by_postion = []
    minsize = minsize - 1
    if minsize < 0:
        minsize = 0

    tokens = nltk.wordpunct_tokenize(original_text)
    tagged = nltk.word_tokenize(original_text)
    i = 0
    for t in tagged:
        tagged[i] = str(t)
        i = i + 1
    tagged = nltk.pos_tag(tokens)

    doc_length = len(tokens)
    counter = 0
    counter2 = 0
    if highlight == 0:
        concated_test = doc_length  # This is set to doc_length but could be anything; recommend 3.
    else:
        concated_test = highlight

    list_of_NNPs = []
    while counter < (doc_length - 1):
        while counter2 < concated_test:
            counter2 = counter2 + 1
            counter3 = 0
            temp_array = []
            all_nnp = True
            while counter3 < counter2:
                if counter < (doc_length - counter3):
                    temp_array.append(tokens[counter + counter3])
                    if tagged[counter + counter3][1] != 'NNP':
                        all_nnp = False
                counter3 = counter3 + 1
            counter3 = 0
            if all_nnp == True:
                if (len(temp_array) > minsize):
                    list_of_NNPs.append(temp_array)
        counter2 = 0
        counter = counter + 1

    for l in list_of_NNPs:
        str1 = ' '.join(l)
        if len(str1) < 3 or (not str1.isalnum()):
            list_of_NNPs.remove(l)

    unique_NNPs = list(list_of_NNPs for list_of_NNPs, _ in itertools.groupby(list_of_NNPs))
    # discard punctuation
    unique_NNPs = self.discard_words_after_punct(unique_NNPs)
    unique_NNPs.sort()
    unique_NNPs_final = list(unique_NNPs for unique_NNPs, _ in itertools.groupby(unique_NNPs))
    unique_NNPs_final.sort()

    # filter the list to keep the maximum-length n-grams
    unique_NNPs_final = self.get_maxlength_ngram(unique_NNPs_final)
    unique_NNPs_final = self.remove_stopwords(unique_NNPs_final)
    unique_NNPs_final.sort()

    # remove empty ngrams
    unique_NNPs_final = list(unique_NNPs_final for unique_NNPs_final, _ in itertools.groupby(unique_NNPs_final))
    if not unique_NNPs_final[0]:
        del unique_NNPs_final[0:1]
    # print unique_NNPs_final
    print "Keywords:"
    print unique_NNPs_final

    if len(tokens) > 200:
        for kw in unique_NNPs_final:
            print "kw[0]::" + kw[0]
            indx_NNP = tokens.index(kw[0])
            # indx_NNP = indx[0]
            i = 1
            flag = 0
            for i in range(len(kw)):
                if tokens[indx_NNP + i] != kw[i]:
                    flag = 1
                    break
                i = i + 1
            if flag == 0:
                if indx_NNP > 0 and indx_NNP < 200:
                    keywords_by_postion.append(kw)
        print "filtered Keywords:"
        print keywords_by_postion
        unique_NNPs_final = keywords_by_postion

    for ngram in unique_NNPs_final:
        for i in ngram:
            if len(i) == 1:
                ngram.remove(i)
        if len(ngram) == 0:
            unique_NNPs_final.remove(ngram)

    return unique_NNPs_final
def tagged_sents(self):
    for sent in self.sents():
        yield nltk.pos_tag(nltk.wordpunct_tokenize(sent))