def build_text_collections():
    text_collections = {}
    sample_size = 4
    for category in ["news", "learned", "fiction"]:
        texts = []
        for fileid in nltk.corpus.brown.fileids(categories=category)[:sample_size]:
            texts.append(tokenize(nltk.corpus.brown.raw(fileid)))
        text_collections[category] = TextCollection(texts)
    text_collections["all"] = TextCollection(list(text_collections.values()))
    return text_collections

def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    colList = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
    for paraList in colList:
        weights = {}
        for term in paraList:
            print term, "has weight: ", collection.tf_idf(term, paraList)
            weights[term] = collection.tf_idf(term, paraList)
        d = sortDict(weights)
        print "AFTER SORTED <><><><><<><<>><><><><><><><>><>< ", type(d)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
        textFile.close()

def get_tf_idf_dict_nltk(self, column_type="review_body",
                         save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
    '''
    nltk version; it is very slow, so avoid using it
    '''
    reviews = self.raw_df[column_type].tolist()
    # clean the raw reviews
    reviews_list_cleaned = clean_tsv(reviews)
    # collect the vocabulary
    words = set()
    for review in reviews_list_cleaned:
        for word in review:
            words.add(word)
    words = list(words)
    corpus = TextCollection(reviews_list_cleaned)
    tf_idf = []
    for word in words:
        tf_idf.append(corpus.tf_idf(word, corpus))
    df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
    df.to_csv(save_path, encoding='utf-8')

def getEmmaChapter():
    from nltk.text import TextCollection
    import nltk

    gutenberg = TextCollection(nltk.corpus.gutenberg)
    # ----- IDF EXAMPLE -----
    # print(gutenberg.idf('Dick'))
    # ----- IDF EXAMPLE -----

    # sentences 2 to 166 make up chapter 1
    emma = nltk.corpus.gutenberg.sents('austen-emma.txt')
    chapterText = ''
    i = 2
    while i < 167:
        line = ''
        for w in emma[i]:
            line += w + ' '
        chapterText += line + '\n'
        i = i + 1
    print(chapterText)
    return

def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)
    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}

def main():
    # Get text from folder or file
    # TODO Change the folder corpus to the upper level!
    texts = load_text_data(config_path)
    if not texts:
        print "No texts found"
        return

    # Dictionary that will hold all the ngrams and their values, for each measure (dict of dicts)
    scored_ngrams = {}

    # Create a list of Document objects from the texts and pre-process them.
    list_documents = []
    for label, text in texts.iteritems():
        list_documents.append(Document(text, stem=config_stem, name=label))
    list_documents = TextCollection(list_documents)

    global config_ngram
    if config_ngram == 0:
        config_ngram = 1

    # ---------------- N-GRAM EXTRACTION ----------------
    for ng in range(2, config_ngram + 1):
        ngrams = get_any_ngrams(list_documents, ngram=ng, k=config_top_k,
                                min_tok_len=config_min_tok_len,
                                min_freq=config_min_tok_freq)
        scored_ngrams = update_dict_values(scored_ngrams, ngrams)

    scored_ngrams = update_dict_values(scored_ngrams,
                                       get_concordances(list_documents, scored_ngrams))
    make_tables(scored_ngrams, results_folder=config_output)
    return

def ranking(reuters, corpus, docids, palavras):
    '''Ranks the retrieved texts so that the first one is the most relevant.

    Args:
        reuters: corpus loaded from nltk
        corpus: dictionary mapping each index to its text
        docids: indices of the retrieved texts
        palavras: tokenized words of the query

    Returns:
        List with all the indices, already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)
    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])
    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()

def vectorize_t(corpus):
    # corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return {term: texts.tf_idf(term, corpus) for term in corpus}

def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key, value in pairedPara.items():  # key is simple and value is normal
        print "**********************************"
        print "PARAGRAPH"
        print "##################################"
        SPara = simpleParas[key]
        NPara = normalParas[value]
        # Given two paragraphs, return a list of all the sentences (each sentence is a
        # list of words), plus the simple sentence list and the normal sentence list.
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)
        weights = {}
        for sentence in colList:
            for term in sentence:
                weight = collection.tf_idf(term, sentence)
                print "TERM -> ", term, "is", weight
                # if the term is already in the dict, keep its existing weight
                if term not in weights:
                    weights[term] = weight
        # weights = sortDict(weights)
        print "================================================================"

def getTopic2(text):
    # clean input
    stop = open('stopwords.txt').read()
    l = []
    src = [
        w.strip(" .,?!") for w in nltk.word_tokenize(text.lower())
        if w not in stop
    ]
    candidates = nltk.FreqDist(w for w in src if len(w) > 3)
    candidates = candidates.keys()[:10]

    # initialize vectors
    brown = TextCollection(nltk.corpus.brown)
    for w in candidates:
        l.append((w, brown.tf_idf(w, candidates)))
    vectors = [array(l)]

    # initialize the clusterer
    clusterer = nltk.cluster.kmeans.KMeansClusterer(10, euclidean_distance)
    clusterer.cluster(vectors, True)

    # pick the one closest to the center of the largest cluster
    o = [m for m in clusterer.means()]
    # o = [(clusterer.classify(l.index(i)), l.index(i)) for i in range(len(l))]
    o.reverse()
    print o.pop().index(1)

def __vectorize(self, corpus):
    corpus = [list(self.__tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}

def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}

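# A minimal usage sketch for nltk_tfidf_vectorize above, not taken from the
# original project: the toy corpus and the whitespace tokenize() helper are
# assumptions made purely for illustration (the original tokenize is not shown).
def tokenize(doc):
    return doc.lower().split()

toy_corpus = [
    "the cat sat on the mat",
    "the dog barked at the cat",
]
for weights in nltk_tfidf_vectorize(toy_corpus):
    # terms that occur in every document get idf 0, so their tf-idf is 0
    print(weights)
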
def tf_idf(self):
    corpus = [
        list(self.cr.tokenize_strip_punct(desc)) for desc in self.cr.texts()
    ]
    texts = TextCollection(corpus)
    for desc in corpus:
        yield {term: texts.tf_idf(term, desc) for term in desc}

def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus_tokenized)
    for doc in corpus_tokenized:
        # yield one weight dict per document (a return here would stop after the first doc)
        yield {term: texts.tf_idf(term, doc) for term in doc}

def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    # corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}

def tfidf_extraction(self, subset=None):
    if subset is not None:
        data = self.data[subset]
    else:
        data = self.data
    get_idf = TextCollection(data.Tokenize.to_list())
    word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
    full_winfo = [[word, idf, tag[1]]
                  for word, idf, tag in zip(word_list,
                                            [get_idf.idf(i) for i in word_list],
                                            nltk.pos_tag(word_list))]
    self.keywords = pd.DataFrame(
        [i for i in full_winfo
         if i[2] in ["JJ", "NNP", "VBP", "VBG", "VBD", "VBN", "CD",
                     "NN", "NNPS", "RB", "IN"] and not is_number(i[0])],
        columns=["word", "idf", "tag"]).sort_values(
            by="idf", ascending=True).reset_index(drop=False)
    self.full_words = pd.DataFrame(
        full_winfo, columns=["word", "idf", "tag"]).sort_values(
            by="idf", ascending=True).reset_index(drop=False)
    self.enable_topk = True

def train(self, trainfile=None):
    print "training WeightedTweetClassifier"
    self.readTrainingData(trainfile or self.trainfile)
    for tweet in self.trainingTweets:
        # lowercase, remove punctuation
        nopunct = string.lower(
            tweet.tweet.translate(string.maketrans("", ""), string.punctuation))
        tweet.tweet = nopunct
    # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
    self.textCollection = TextCollection(
        [tweet.tweet for tweet in self.trainingTweets])

def __init__(self, pairs, mode='eng', stopwords_flag=True):
    self.pair_dict = {}
    self.ids = [pair[0] for pair in pairs]
    self.tfidfs = []
    self.mode = mode
    self.stopwords_flag = stopwords_flag
    docs = [pair[1] for pair in pairs]
    self.docs = [self.preprocess(doc) for doc in docs]
    for id, text in zip(self.ids, self.docs):
        self.pair_dict[id] = text
    self.corpus = TextCollection(self.docs)
    self.query = []

def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    tf_analy = TextCollection([text1, text2, text3, text4, text5])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text)
    print(tf_idf_val)

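# Note on run_main above: TextCollection.tf(term, text) is simply
# text.count(term) / len(text), so when raw strings are passed (as in run_main)
# the counts are substring/character based rather than word based. A word-level
# variant is sketched below; it is an illustration, not part of the original
# example, and tokenizes both the collection and the query text first.
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize

def run_main_tokenized():
    docs = ['I like the movie so much',
            'That is a good movie',
            'This is a great one',
            'That is a really bad movie',
            'This is a terrible movie']
    tc = TextCollection([word_tokenize(d) for d in docs])
    new_text = word_tokenize('That one is a good movie. This is so good!')
    print(tc.tf_idf('That', new_text))
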
def __init__(self, doc: str, vec_size: int, alpha=0.06):
    @has_vec_set(doc)
    def get_vec_set(doc_vec):
        res = {}
        cur_line = 0
        # tc = TextCollection(self.doc)
        while 1:
            try:
                cur_words = tf_idf_sort(doc_vec.doc, doc_vec.tc, cur_line)
                for w, v in cur_words:
                    if w in res:
                        res[w] = max(res[w], v)
                    else:
                        res[w] = v
            except IndexError:
                break
            cur_line += 1
            print("{} \r".format(cur_line), end='')
        return res

    self.doc = read_comments(doc)
    self.tc = TextCollection(self.doc)
    self.vec_set = get_vec_set(self)
    self.vec_set = [(w, self.vec_set[w]) for w in self.vec_set]
    self.vec_set = DataFrame(self.vec_set)
    Max = self.vec_set[1].max()
    Min = self.vec_set[1].min()
    self.vec_set[1] = self.vec_set[1].apply(lambda x: (x - Min) / (Max - Min))
    self.vec_set[1] = self.vec_set[1].apply(lambda x: x * (1 - alpha))
    self.vec_set = zip(self.vec_set[0], self.vec_set[1])
    self.vec_set = {w: v for w, v in self.vec_set}

    G = Graph(doc, True)
    tex_rank_key_word = DataFrame(key_word(G, 10, 5000))
    Min = tex_rank_key_word[1].min()
    Max = tex_rank_key_word[1].max()
    tex_rank_key_word[1] = tex_rank_key_word[1].apply(
        lambda x: alpha * (x - Min) / (Max - Min))
    tex_rank_key_word = list(zip(tex_rank_key_word[0], tex_rank_key_word[1]))
    self.vec_set = [(w, self.vec_set[w]) for w, v in tex_rank_key_word
                    if self.vec_set[w] >= alpha]
    # for w, v in tex_rank_key_word:
    #     if w in self.vec_set:
    #         self.vec_set[w] += v
    #     else:
    #         self.vec_set[w] = v
    # self.vec_set = sorted([(w, self.vec_set[w]) for w in self.vec_set],
    #                       key=lambda x: x[1], reverse=True)
    self.vec_set = sorted(self.vec_set, key=lambda x: x[1], reverse=True)
    print(len(self.vec_set))
    self.vec_size = vec_size

def Generate_keyword(obj, length):
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # tokenize the reviews
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token
                 if w == 'not' or w not in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)

    token_file = './Data/' + obj + '/tokens.pkl'
    f = open(token_file, 'wb')
    pickle.dump(tokens, f)
    f.close()

    corpus = TextCollection(tokens)
    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf[word] = corpus.tf(word, corpus)
            if word not in tf_idf:
                tf_idf[word] = corpus.tf_idf(word, corpus)
    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')

def computeTFIDF_text(texts, singletext):
    # texts is a list of sentence strings (the corpus); singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus
    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # word list of the single sentence
    tfidf_words = {}
    # compute tf-idf for each word
    for word in words:
        # idf
        idf = corpus.idf(word)
        # tf
        tf = corpus.tf(word, words)
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words

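# A hedged usage sketch for computeTFIDF_text above; the tiny corpus and query
# sentence are made up for illustration only.
sample_texts = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs and cats play together",
]
print(computeTFIDF_text(sample_texts, "the cat and the dog"))
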
def calculate_idf(words, corpus):
    """
    Calculate the idf of words using a corpus
    :param words: The words to calculate idf for
    :param corpus: The corpus to use in the calculation
    :return: dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs

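# A hedged usage sketch for calculate_idf above: the corpus argument can be any
# iterable of token lists accepted by TextCollection. Using a slice of the Brown
# corpus here is an assumption for illustration (it requires the Brown corpus data).
import nltk

sample_corpus = nltk.corpus.brown.sents(categories="news")[:100]
print(calculate_idf(["the", "government"], sample_corpus))
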
def getTextCollectionFromTxtFile(fn):
    '''
    Create a text collection from an external text file
    Input:  fn - name of the external text file
    Output: TextCollection containing all texts in the given file,
            plus the list of all tokens
    '''
    f = open(fn, 'rU')
    tc = []
    alltokens = []
    for line in f:
        text, tokens = getTextFromString(line)
        tc.append(text)
        alltokens.extend(tokens)
    return TextCollection(tc), alltokens

def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)
    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))
    print(idf(term='this', corpus=corpus))
    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())

def compute_features(blog):
    features = []
    text_collection = []
    for doc in blog.docs:
        text_collection.append(' '.join(doc))
    text_collection = TextCollection(text_collection)  # for convenient tf-idf computation
    pageranks = PageRank(blog)  # PageRank score of every sentence
    for i, doc in enumerate(blog.docs):
        for j, sent in enumerate(doc):
            cur_feat = []
            cur_feat.extend(surface(blog, i, j, sent))
            cur_feat.extend(content(blog, i, j, sent, text_collection))
            cur_feat.extend(rel(blog, i, j, sent))
            cur_feat.append(pageranks[i][j])
            features.append(cur_feat)
    features = normalize(features)
    return features

def text_classification():
    """
    Text classification demo
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # build the TextCollection object
    tc = TextCollection([text1, text2, text3, text4, text5])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text)
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))

def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1 - n_percentile) * 100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(f'Current query id: {search_query_id}, text: "{search_query_text}"')
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[document_id] = (
                    current_score +
                    search_texts_collection.tf_idf(term, document_text))
        rank = 1
        for document_id, document_score in sorted(document_scores.items(),
                                                  key=lambda kv: kv[1],
                                                  reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank, document_score))
            rank += 1
    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile * 100)}_percentile.trec')
    print('Done')

def calc_tf_idfs(count):
    """Loops through archived wordlists, loads each, calculates TF-IDF scores
    for the words contained, writes them to a dict, and saves it as a pickle.
    """
    corpus = TextCollection(nltk.corpus.webtext)
    filepath = '/home/jrwalk/python/empath/data/reddit/pickles/'
    files = glob.glob(filepath + 'wordcount*%s.pkl' % count)
    filecount = len(files)
    for i, picklefile in enumerate(files):
        print "%i/%i processing %s" % (i + 1, filecount, picklefile)
        with open(picklefile, 'r') as readfile:
            freqdist = pickle.load(readfile)[2]
        wordscores = tf_idf(freqdist, corpus)
        druglim = re.findall('[a-z]+_[0-9]+|all|antidepressant', picklefile)[0]
        writepath = filepath + 'tfidf_' + druglim + '.pkl'
        with open(writepath, 'w') as writefile:
            pickle.dump(wordscores, writefile)

def preprocess(self, text):
    # text = text.split(" ")
    text = word_tokenize(text)
    if self.display:
        print "After Tokenizing"
        print text
        print "\n\n"
    text = [
        w.strip().lower() for w in text
        if w.strip() not in ENGLISH_STOPWORDS and len(w.strip()) > 2
    ]
    tc = TextCollection([text])
    words = list(set(tc))
    word_tf = {word: tc.tf(word, text) * len(text) for word in words}
    return word_tf