def construct_term_doc_matrix(self, pca=False):
    '''
    Constructs a term-document matrix such that td_matrix[document][term]
    contains the weighting score for the term in the document.

    The corpus is built from the tokens of every document in
    self.document_dict, or from the output of self._filter_terms() when
    self.filter_terms is set. Nothing is returned; the result is stored
    on the instance:

        self.td_matrix  -- one row per document, one column per term
        self.attributes -- the column labels (the corpus terms)

    Args:
        pca: when true, the matrix is passed through
            construct_orange_table and orange_pca, and the attributes
            are replaced by plain column indices.

    Raises:
        ValueError: if a document holds a word that is missing from the
            corpus vocabulary.
    '''
    if not self.filter_terms:
        corpus = nltk.TextCollection(
            [document.tokens for document in self.document_dict.values()])
    else:
        corpus = nltk.TextCollection(self._filter_terms())

    terms = list(set(corpus))
    # Resolve term -> column once; a list.index() call per word would
    # rescan the whole vocabulary on every lookup.
    column_of = dict((term, column) for column, term in enumerate(terms))
    data_rows = numpy.zeros([len(self.document_dict), len(terms)])

    for i, document in enumerate(self.document_dict.values()):
        text = nltk.Text(document.tokens)
        for item in document.word_frequencies:
            column = column_of.get(item.word)
            if column is None:
                raise ValueError('%r is not in list' % (item.word,))
            data_rows[i][column] = corpus.tf_idf(item.word, text)

    self.attributes = terms
    self.td_matrix = data_rows

    # If PCA is requested we project the points on their principal
    # components for dimensionality reduction.
    if pca:
        t = construct_orange_table(self.attributes, self.td_matrix)
        self.td_matrix = orange_pca(t)
        # Attribute names have no meaning after dimensionality reduction.
        self.attributes = list(range(self.td_matrix.shape[1]))
def compute_tf_idf_document_matrix(articles_dict):
    """Build a tf-idf term-document matrix for a dictionary of articles.

    Every value of ``articles_dict`` must provide a 'content' string and a
    'review_id'; every key must convert with ``int()`` to a distinct
    position in ``range(len(articles_dict))``. As a side effect each
    article gains a 'tokenized' entry holding its lower-cased,
    whitespace-split content.

    Returns:
        A dict such that ``td_matrix[review_id][term]`` is the tf-idf
        score of ``term`` in that article.
    """
    # A real list is required here: a Python 3 range object does not
    # support item assignment.
    all_articles = [None] * len(articles_dict)
    for k, v in articles_dict.items():
        text = v['content'].lower().split()
        all_articles[int(k)] = text
        v['tokenized'] = text

    # Create a TextCollection corpus from all articles; this is what
    # lets us compute tf-idf.
    tc = nltk.TextCollection(all_articles)

    # Our target: all tf-idf values for every word of every document.
    td_matrix = {}
    for v in articles_dict.values():
        post = v['tokenized']
        # Iterating a FreqDist yields each distinct term once.
        td_matrix[v['review_id']] = {
            term: tc.tf_idf(term, post) for term in nltk.FreqDist(post)
        }
    return td_matrix
def main():
    """Rank the posts of a JSON export against QUERY_TERMS by tf-idf.

    Loads the JSON file at ``f_path``, scores every post with non-empty
    content by the summed tf-idf of the query terms, and prints title,
    url and score of each post with a positive score, best first.
    """
    f_path = '[change to your googleplus_posts.json location]'
    with open(f_path) as f:
        data = json.load(f)

    QUERY_TERMS = ['mobile']  # You can change the search terms here

    # Keep every post next to its tokens. Indexing `data` with a position
    # in the filtered token list would pick the wrong post as soon as one
    # post without content has been skipped.
    tokenized = []
    for activity in data:
        tokens = activity['object']['content'].lower().split()
        if tokens:  # also skips whitespace-only content (tf would divide by 0)
            tokenized.append((activity, tokens))

    # nltk TextCollection has tf-idf itself
    tc = nltk.TextCollection([tokens for _, tokens in tokenized])

    relevant_activities = []
    for activity, tokens in tokenized:
        score = 0
        for term in QUERY_TERMS:
            score += tc.tf_idf(term.lower(), tokens)
        if score > 0:
            relevant_activities.append({
                'score': score,
                'title': activity['title'],
                'url': activity['url']
            })

    relevant_activities.sort(key=lambda a: a['score'], reverse=True)
    for ra in relevant_activities:
        print('title: ', ra['title'])
        print('url: ', ra['url'])
        print('score: ', ra['score'])
def construct_term_doc_matrix(self, index, document):
    '''
    Overrides the parent method for constructing a td_matrix. The reason
    is because we want to construct the matrix based on a sliding window
    approach.

    The corpus is limited to the documents inside the current window and
    a single tf-idf vector is computed for `document`. Nothing is
    returned: self.attributes receives the window vocabulary and
    self.td_matrix the vector.

    Args:
        index: position of the current document; while it is smaller than
            self.window every known document is used.
        document: the document to vectorise (its `tokens` and
            `word_frequencies` are read).

    Raises:
        ValueError: if `document` holds a word that is missing from the
            window vocabulary.
    '''
    # dict.values() is not sliceable on Python 3, so materialise it first.
    all_documents = list(self.document_dict.values())
    if index < self.window:
        documents = all_documents
    else:
        # NOTE(review): this slice ends *before* `index`, so the window
        # holds self.window - 1 documents and excludes position `index`
        # itself - confirm that this is intended.
        documents = all_documents[index - self.window + 1:index]

    # Online clustering doesn't support term filtering yet.
    # The loop variable must not be called `document`: on Python 2 a list
    # comprehension leaks its variable and would silently replace the
    # `document` argument with the last document of the window.
    corpus = nltk.TextCollection([doc.tokens for doc in documents])

    terms = list(set(corpus))
    # Resolve term -> position once instead of scanning `terms` per word.
    column_of = dict((term, column) for column, term in enumerate(terms))
    term_vector = numpy.zeros(len(terms))

    text = nltk.Text(document.tokens)
    for item in document.word_frequencies:
        column = column_of.get(item.word)
        if column is None:
            raise ValueError('%r is not in list' % (item.word,))
        term_vector[column] = corpus.tf_idf(item.word, text)

    self.attributes = terms
    self.td_matrix = term_vector
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` into ``clustersNumber`` groups with GAAClusterer.

    Every text is turned into a term-frequency vector over the unique
    terms of the whole collection (via ``TF``), and the vectors are handed
    to ``GAAClusterer``. Progress and the vectors themselves are printed.

    ``distance`` is accepted but not used by this implementation.

    Returns:
        Whatever ``GAAClusterer.cluster(vectors, True)`` returns.
    """
    # Turn the list of texts into a single TextCollection.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of", len(corpus), "terms.")

    # The unique terms are the dimensions of the text vectors.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # TF measures how often each unique term occurs inside one text; it
    # does not look at how often the term occurs in the collection. Other
    # measures such as TF-IDF do take the collection into account.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(TF(text, vocabulary, corpus)))
    print("Vectors created.")
    print(vectors)

    # Build the clusterer and assign every vector to a cluster.
    return GAAClusterer(clustersNumber).cluster(vectors, True)
def render_wordcloud(form, **kwargs):
    """Render the word-cloud view for the results of a search form.

    Runs ``search.search`` with the form values, tokenizes the text of
    every result sentence by sentence, drops tokens whose lower-cased form
    is in ``stopwords_en``, and renders the template with the 1000 most
    common tokens and the collocations of the resulting corpus.

    Args:
        form: the search form; ``form.values()`` is expanded into keyword
            arguments for the search and the form is passed to the
            template.
        **kwargs: forwarded unchanged to ``render_template``.

    Returns:
        The rendered view.
    """
    session = Session()
    # try/finally so the session is released even when the search,
    # tokenization or template rendering raises.
    try:
        results = search.search(session, **form.values())

        # Create the corpus from the results
        tknzr = TweetTokenizer()
        texts = []
        for r in results:
            tokens = []
            for sent in sent_tokenize(r.text.strip()):
                tokens.extend(
                    w for w in tknzr.tokenize(sent.strip())
                    if w.lower() not in stopwords_en
                )
            texts.append(tokens)

        corpus = nltk.TextCollection(texts)
        # Presumably called to populate corpus._collocations, which is
        # read below - confirm against the installed NLTK version.
        corpus.collocations(100)

        # noinspection PyProtectedMember
        results = {
            'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
            'collocations': corpus._collocations,
        }
        return render_template('./templates/search/results_wordcloud.html',
                               form=form, results=results, **kwargs)
    finally:
        session.close()
def calculate_results(self):
    """Store the first 100 tf-icf entries of every city and save it.

    For each key of ``self.cities`` the first 100 items returned by
    ``self.tf_icf(city)`` are written to the city's "freqdist" entry and
    the city record is passed to ``self.db.save``.

    The overall vocabulary distribution that used to be computed here was
    never read, so it is no longer built.
    """
    for city in self.cities:
        self.cities[city]["freqdist"] = self.tf_icf(city)[0:100]
        self.db.save(self.cities[city])
def TF_IDF2(documents, dictionary):
    """Compute one tf-idf vector per document over ``dictionary``.

    Each document is first reduced to the tokens that are contained in
    ``dictionary``. The reduced documents form the corpus, and every
    document is then scored against every dictionary item (converted with
    ``str``). A running document index is printed in both passes.

    Args:
        documents: iterable of token sequences.
        dictionary: the terms that define the vector dimensions.

    Returns:
        A list with one vector (list of floats, in dictionary order) per
        document. Documents without any dictionary token get an all-zero
        vector.
    """
    print('tf-idf')

    # Rebuild the text set. The documents are kept as token lists: when
    # they are joined into strings, TextCollection treats every character
    # as a token and counts substrings instead of terms.
    texts = []
    for i, document in enumerate(documents):
        texts.append([token for token in document if token in dictionary])
        print(i)

    # Load the helper class that computes tf-idf
    tc = nltk.TextCollection(texts)

    vectors = []
    for i, text in enumerate(texts):
        if text:
            # Compute tf-idf
            vector = [tc.tf_idf(str(item), text) for item in dictionary]
        else:
            # tf divides by the text length, so an empty text is scored
            # as zero rather than raising ZeroDivisionError.
            vector = [0.0 for _ in dictionary]
        vectors.append(vector)
        print(i)
    return vectors
def __init__(self, token_list_list):
    '''
    Wrap the given token lists in an NLTK text collection.

    Args:
        token_list_list: The list of list of tokens.
    '''
    collection = nltk.TextCollection(token_list_list)
    self.__collection = collection
def cluster_texts(texts, clustersNumber, distanceFunction, clusterMode):
    """
    Cluster several texts using term-frequency vectors.

    Args:
        texts: collection of texts to cluster
        clustersNumber: number of clusters to be used
        distanceFunction: distance function handed to the agglomerative
            clusterer (the K-Means variants do not receive it)
        clusterMode: "AgglomerativeClustering", "KMeans" or
            "MiniBatchKMeans", all of them belonging to the scikit-learn
            library

    Returns:
        The cluster labels predicted for the texts, or None (after
        printing a message) when clusterMode is not one of the above.
    """
    corpus = nltk.TextCollection(texts)

    # The unique terms of the collection are the vector dimensions.
    vocabulary = list(set(corpus))

    # TF measures how often each unique term occurs inside one text; it
    # does not look at how often the term occurs in the collection. Other
    # measures such as TF-IDF do take the collection into account.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(TF(text, vocabulary, corpus)))

    if clusterMode == "AgglomerativeClustering":
        model = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distanceFunction)
        return model.fit_predict(vectors)

    if clusterMode == "KMeans":
        model = KMeans(n_clusters=clustersNumber, random_state=0)
        return model.fit(vectors).predict(vectors)

    if clusterMode == "MiniBatchKMeans":
        model = MiniBatchKMeans(n_clusters=clustersNumber, random_state=0)
        return model.fit(vectors).predict(vectors)

    print("Invalid cluster mode")
    return None
def get_most_frequent_terms(self, N=5):
    '''
    Returns the top N occuring terms in this cluster.

    When self.top_patterns has been set it is returned as is (N is not
    applied to it). Otherwise the tokens of every document in
    self.document_dict are counted and the N most common
    (term, count) pairs are returned, most frequent first.
    '''
    if self.top_patterns is not None:
        return self.top_patterns

    corpus = nltk.TextCollection(
        [document.tokens for document in self.document_dict.values()])
    # most_common() returns the pairs ordered by frequency; slicing
    # items() relies on an ordering and a list type that current
    # FreqDist objects no longer provide.
    return nltk.FreqDist(corpus).most_common(N)
def tfidf(doc, docs):
    """Return the TF-IDF of every corpus term for the target document.

    Args:
        doc: word list (morphological analysis output) of the target
            document.
        docs: word lists of all documents.

    Returns:
        A list with one ``{"word": ..., "tfidf": ...}`` dict per distinct
        word found in ``docs``.
    """
    collection = nltk.TextCollection(docs)
    # Flatten the documents and keep each distinct word once.
    vocabulary = set(list(chain.from_iterable(docs)))

    scores = []
    for word in vocabulary:
        scores.append({"word": word, "tfidf": collection.tf_idf(word, doc)})
    return scores
def tf_idf(docs):
    """Score every distinct token of ``docs`` by tf-idf.

    The term frequency is taken over the concatenation of all documents,
    the inverse document frequency over the individual documents.

    Returns:
        A dict mapping each distinct token to its tf-idf value.
    """
    # All documents flattened into one token list.
    all_tokens = [token for doc in docs for token in doc]
    collection = nltk.TextCollection(docs)
    return {
        token: collection.tf_idf(token, all_tokens)
        for token in set(all_tokens)
    }
def get_tf(docid, term, index):
    """Return the term frequency of ``term`` inside one indexed document.

    Returns:
        The frequency computed by ``TextCollection.tf`` on the tokenized
        document, the string "Not valid term, can not be term" when
        ``is_phrase_term(term)`` is true, or the string "Not Found" when
        ``docid`` is not in ``index._doc_contents``.
    """
    # Phrase terms are rejected before the document is even looked up.
    if is_phrase_term(term):
        return "Not valid term, can not be term"

    if docid not in index._doc_contents:
        return "Not Found"

    tokens = nltk.word_tokenize(index._doc_contents[docid])
    doc = nltk.Text(tokens)
    return nltk.TextCollection([doc]).tf(term, doc)
def _calculate_centroid(self):
    '''
    It calculates the centroid of this collection of documents.

    The centroid is the element-wise mean of the `fv` feature vectors of
    all documents in self.documents and is stored in self.centroid. With
    no documents the centroid is an empty array.
    '''
    # The matrix width equals the feature-vector length, so there is no
    # need to rebuild the whole corpus vocabulary just to size it.
    feature_vectors = [document.fv for document in self.documents.values()]
    if not feature_vectors:
        self.centroid = numpy.zeros(0)
        return
    self.centroid = numpy.mean(
        numpy.array(feature_vectors, dtype=float), axis=0)
def get_collocations(self, n=2, N=5):
    '''
    Returns the top collocations of the cluster corpus based on Jaccard
    index. The collocations correspond to n-grams and more specifically
    we limited the options to bigrams (n=2) and trigrams (n=3)
    ( n defaults to 2 ).

    Args:
        n: 3 selects trigrams; any other value selects bigrams.
        N: number of collocations to return.

    Returns:
        The N best n-grams (tuples of words) in which no word is an
        English stopword.
    '''
    corpus = nltk.TextCollection(
        [document.tokens for document in self.document_dict.values()])

    if n == 3:
        finder = nltk.TrigramCollocationFinder.from_words(corpus)
        scorer = nltk.metrics.TrigramAssocMeasures.jaccard
    else:
        finder = nltk.BigramCollocationFinder.from_words(corpus)
        scorer = nltk.metrics.BigramAssocMeasures.jaccard

    # Load the stopword list once; the filter runs for every word.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    finder.apply_word_filter(lambda w: w in stopwords)

    return finder.nbest(scorer, N)
def _attach_feature_vectors(self):
    '''
    Iterates over the summarizer documents and calculates a tf-idf
    weighted feature vector for each document. The feature vector is
    attached to the document as its `fv` attribute.

    Raises:
        ValueError: if a document holds a word that is missing from the
            corpus vocabulary.
    '''
    corpus = nltk.TextCollection(
        [document.tokens for document in self.documents.values()])
    terms = list(set(corpus))
    # Resolve term -> position once instead of scanning `terms` per word.
    column_of = dict((term, column) for column, term in enumerate(terms))

    for document in self.documents.values():
        text = nltk.Text(document.tokens)
        fv = numpy.zeros([len(terms)])
        for item in document.word_frequencies:
            column = column_of.get(item.word)
            if column is None:
                raise ValueError('%r is not in list' % (item.word,))
            fv[column] = corpus.tf_idf(item.word, text)
        document.fv = fv
def convertToTexts():
    """Turn the cleaned files of the working directory into NLTK texts.

    Every file in the current working directory whose name contains "c_"
    is read as UTF-8, lower-cased, stripped of everything that is neither
    a word character nor whitespace, tokenized, and passed through
    ``remove_stopwords``.

    Returns:
        A two-element list: the ``nltk.TextCollection`` built from all
        texts, and the list of individual ``nltk.Text`` objects.
    """
    print("Converting clean files to text collection...")
    textList = []
    for filename in os.listdir(os.getcwd()):
        if "c_" not in filename:
            continue
        # The with-block closes the file even if reading fails.
        with open(filename, 'r', encoding='utf-8') as handle:
            content = handle.read().lower()
        # Raw string: \w and \s must reach the regex engine unchanged.
        content = re.sub(r'[^\w\s]', ' ', content)
        tokens = remove_stopwords(nltk.word_tokenize(content))
        textList.append(nltk.Text(tokens))
    print("Finished converting clean files to Text collection")
    return [nltk.TextCollection(textList), textList]
def tfidf(word):
    """Return, per document, its (up to) 20 highest tf-idf terms.

    Args:
        word: list of documents, each a sequence of terms.

    Returns:
        One list per document holding the terms with a positive tf-idf
        score, best first, truncated to 20 entries.
    """
    collection = nltk.TextCollection(word)
    top_terms_per_doc = []
    for tokens in word:
        scored = []
        for term in set(tokens):
            weight = collection.tf_idf(term, tokens)
            if weight > 0:
                scored.append([term, weight])
        # Ascending sort followed by a reversal; this is kept (instead of
        # reverse=True) so that equally scored terms come out in the same
        # order as before.
        ranked = sorted(scored, key=lambda pair: pair[1])[::-1]
        top_terms_per_doc.append([pair[0] for pair in ranked[:20]])
    return top_terms_per_doc
def tf_idf(sentence, resources):
    """Score the morphemes of ``sentence`` by tf-idf against a corpus file.

    The corpus file named by ``resources["corpus"]`` is read; each of its
    lines is split on single spaces into one document. The sentence is
    analysed with MeCab (using the module-level ``dic_path``) and added to
    the corpus as an additional document.

    Returns:
        A list of ``(surface encoded as UTF-8, tf-idf)`` tuples, one per
        morpheme occurrence in the sentence, sorted by descending score.
    """
    # The with-block closes the corpus file even if reading fails.
    with open(resources["corpus"]) as corpus_file:
        data = corpus_file.read()
    print("Finished reading file....")
    lines = data.split("\n")

    # Morphological analysis of the given sentence
    mt = MeCab.Tagger(dic_path)
    mt.parse('')  # presumably the usual MeCab binding workaround - confirm
    node = mt.parseToNode(sentence)
    elements = []
    while node:
        elements.append(node.surface)
        node = node.next
    print("Finished morphological analysis....")
    # Drop the first and the last node (presumably the begin/end-of-
    # sentence markers - confirm).
    elements = elements[1:-1]

    docs = [elements]
    docs.extend(line.split(" ") for line in lines)
    print("Finished spliting word....")

    collection = nltk.TextCollection(docs)
    result = [
        (term.encode("utf-8"), collection.tf_idf(term, elements))
        for term in elements
    ]
    result.sort(reverse=True, key=lambda pair: float(pair[1]))
    return result
def TFIDF(document):
    """Return a list of tf-idf values derived from ``document``.

    ``document`` is a list of strings. The distinct space-separated words
    of all strings are collected, sorted alphabetically and handed to
    ``nltk.TextCollection``; every token of that collection is then
    scored with ``tf_idf`` against ``document``.

    NOTE(review): the collection is built from a flat list of words, so
    NLTK treats each word as a text and each character as a token, and
    the scores are computed against the list of raw strings. This looks
    unintended but is preserved here - confirm the desired output shape
    before changing it.
    """
    # Merge the words of every document into one vocabulary.
    vocabulary = set()
    for text in document:
        vocabulary |= set(text.split(' '))

    # Sort the collected words alphabetically.
    ordered_words = sorted(vocabulary)
    collection = nltk.TextCollection(ordered_words)

    return [collection.tf_idf(token, document) for token in list(collection)]
def question_match_tf_idf(data_question1, data_question2):
    """Calculate the match rate between two questions based on TF_IDF.

    Both arguments must offer ``tolist()``; their combined contents form
    the corpus from which one IDF weight per token is derived. The two
    inputs are then walked in parallel and each pair is scored with
    ``match_rate_tf_idf``.

    Returns:
        A list with one match rate per question pair.
    """
    # Calculate IDF over the questions of both columns.
    question_corpus = data_question1.tolist() + data_question2.tolist()
    text_collection = nltk.TextCollection(question_corpus)
    weights = {}
    for word in text_collection.tokens:
        weights[word] = text_collection.idf(word)

    # Calculate the match rate
    return [
        match_rate_tf_idf(first, second, weights)
        for first, second in zip(data_question1, data_question2)
    ]
def create_index(self, documentos):
    """Add the terms of ``documentos`` to ``self.contents`` and return it.

    Every document is stripped of HTML, tokenized and lower-cased. For
    each distinct token that is non-empty after
    ``self.remove_punctuation``, an entry (url, tf-idf, frequency) is
    appended to the ``urls`` of the ``Content`` holding that term; a new
    ``Content`` is created when the term is not indexed yet.

    Args:
        documentos: documents offering ``texto`` and ``url``.

    Returns:
        ``self.contents``.
    """
    listaTextos = [
        nltk.wordpunct_tokenize(nltk.clean_html(d.texto.encode('utf-8')))
        for d in documentos
    ]
    # The collection depends only on the complete document list, so it is
    # built once instead of once per term.
    tc = nltk.TextCollection(listaTextos)

    # term -> Content, first occurrence wins (same entry a front-to-back
    # scan of self.contents would find).
    by_termo = {}
    for c in self.contents:
        by_termo.setdefault(c.termo, c)

    for d in documentos:
        tokens = nltk.wordpunct_tokenize(nltk.clean_html(d.texto))
        tokens = [token.lower() for token in tokens]
        frequencency = nltk.FreqDist(tokens)
        for token, frequencia in frequencency.items():
            termo = self.remove_punctuation(token)
            if len(termo) == 0:
                continue

            # NOTE(review): the term frequency is taken against the raw
            # string d.texto rather than its token list - confirm.
            tf_idf = tc.tf_idf(termo, d.texto)

            existing = by_termo.get(termo)
            if existing is None:
                content = Content()
                content.termo = termo
                content.urls.append(url=d.url, tf_idf=tf_idf,
                                    frequencia=frequencia)
                self.contents.append(content)
                by_termo[termo] = content
            else:
                # Update the matching entry itself; the previous counter
                # was incremented before the comparison and therefore
                # pointed one element past the match.
                try:
                    existing.urls.append(url=d.url, tf_idf=tf_idf,
                                         frequencia=frequencia)
                except Exception:
                    print('Nao foi possivel adicionar um termo')
    return self.contents
def load_possible_terms(self, np_text_list):
    """
    Retrieve possible words/terms from numpy list of text.

    The unique words of every normalized text are appended to
    self.word_list, which is then de-duplicated with np.unique, and
    self.text_collection is rebuilt from the resulting word list.

    Args:
        np_text_list(np(list(string))): Numpy list containing text which
            term to be extracted
    """
    collected = np.array([])
    for raw_text in np_text_list:
        normalized = StringManipulator.normalize_text(raw_text)
        words = StringManipulator.retrieve_unique_words(normalized)
        collected = np.append(collected, words)

    self.word_list = np.unique(np.append(self.word_list, collected))
    self.text_collection = nltk.TextCollection(self.word_list)
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` with average-linkage agglomerative clustering.

    Every text becomes a term-frequency vector (via ``TF``) over the
    unique terms of the whole collection.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: passed to ``AgglomerativeClustering`` as ``affinity``.

    Returns:
        The labels returned by ``fit_predict``.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    # Here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # Use the `distance` argument: the name `distanceFunction` is not
    # defined in this function.
    # NOTE(review): newer scikit-learn releases rename `affinity` to
    # `metric` - confirm against the installed version.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distance)
    clusters = clusterer.fit_predict(vectors)
    return clusters
def getRelevantNews(self):
    """Return the news items that match QUERY_TERMS, most relevant first.

    ``self.news`` is refreshed from ``self.getNews()``. Every item is
    scored with the summed tf-idf of the lower-cased query terms; items
    with a positive score are kept, sorted by descending score, printed
    and returned.

    Returns:
        A list of ``{'score': ..., 'title': ...}`` dicts.
    """
    # Define your query here
    QUERY_TERMS = ['стол', 'кубка', 'регион']

    # Fetch the list of news items
    self.news = self.getNews()

    # TextCollection provides the tf, idf and tf_idf abstractions, so we
    # do not need to define our own versions.
    tc = nltk.TextCollection(self.news)

    query_terms = [t.lower() for t in QUERY_TERMS]
    relevant = []
    for item in self.news:
        # Start from zero: with a start value of 1 the `score > 0` filter
        # below could never reject an item.
        score = 0
        for term in query_terms:
            score += tc.tf_idf(term, item)
        if score > 0:
            relevant.append({'score': score, 'title': item})

    # Sort the results by relevance and print them
    relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
    for post in relevants:
        print('{0}'.format(post['title']))
    return relevants
def cluster_texts(texts, cluster_number, distance, verbose=True, measure=TF):
    """Cluster ``texts`` with average-linkage agglomerative clustering.

    Args:
        texts: the texts to cluster.
        cluster_number: number of clusters to build.
        distance: accepted but not used; the affinity is fixed to
            'cosine'. NOTE(review): confirm whether it should be passed
            through.
        verbose: when true, the collection and vocabulary sizes are
            printed.
        measure: callable ``measure(text, unique_terms, collection)``
            that produces the vector of one text.

    Returns:
        The labels returned by ``fit_predict``.
    """
    # Load the list of texts into a TextCollection object.
    corpus = nltk.TextCollection(texts)

    # The unique terms are the vector dimensions.
    vocabulary = list(set(corpus))

    if verbose:
        print("Creando collecion de %d terminos" % len(corpus))
        print("Terminos unicos encontrados: ", len(vocabulary))

    # Build one vector per text with the selected measure.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(measure(text, vocabulary, corpus)))

    model = AgglomerativeClustering(n_clusters=cluster_number,
                                    linkage="average",
                                    affinity='cosine')
    return model.fit_predict(vectors)
def getTDMatrix(textCorpus):
    """Compute a tf-idf term-document matrix for ``textCorpus``.

    Each article's 'text' is lower-cased and split on whitespace. A
    running 1-based counter is printed while the articles are processed.

    Returns:
        A dict such that ``td_matrix[key][term]`` is the tf-idf score of
        ``term`` in the article, where ``key`` is the article's 'author'
        value. NOTE(review): articles sharing an author overwrite each
        other - confirm that 'author' is the intended key.
    """
    all_articles = [article['text'].lower().split() for article in textCorpus]
    tc = nltk.TextCollection(all_articles)

    td_matrix = {}
    for idx, article in enumerate(all_articles):
        print(idx + 1)
        doc_title = textCorpus[idx]['author']
        # This is the slow part: one tf-idf computation per distinct term.
        # Iterating a FreqDist yields each distinct term once.
        td_matrix[doc_title] = {
            term: tc.tf_idf(term, article) for term in nltk.FreqDist(article)
        }
    return td_matrix
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` twice: on TF-IDF vectors and on IDF vectors.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: passed to ``AgglomerativeClustering`` as ``affinity``.

    Returns:
        A tuple ``(clusters_tfidf, clusters_idf)`` with the labels of
        both runs.
    """
    # Load the list of texts into a TextCollection object.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of {0}, terms.".format(len(corpus)))

    # The unique terms are the vector dimensions.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # Build both vector sets, TF-IDF first and IDF second.
    vectors_tf_idf = []
    for text in texts:
        vectors_tf_idf.append(numpy.array(TF_IDF(text, vocabulary, corpus)))
    vectors_idf = []
    for text in texts:
        vectors_idf.append(numpy.array(IDF(text, vocabulary, corpus)))
    print("Vectors created.")

    # The same clusterer instance is fitted on each vector set in turn.
    model = AgglomerativeClustering(n_clusters=clustersNumber,
                                    linkage="average",
                                    affinity=distance)
    labels_tfidf = model.fit_predict(vectors_tf_idf)
    labels_idf = model.fit_predict(vectors_idf)
    return (labels_tfidf, labels_idf)
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` on TF-IDF vectors with agglomerative clustering.

    Every text becomes a vector (via ``TF_IDF``) over the unique terms of
    the whole collection.

    Args:
        texts: the texts to cluster.
        clustersNumber: number of clusters to build.
        distance: passed to ``AgglomerativeClustering`` as ``affinity``.

    Returns:
        The labels returned by ``fit_predict``.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    # Here we actually call the function and create our array of vectors.
    vectors = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]
    print("Vectors created.")

    # Initialize the clusterer -> classify the texts into groups.
    # Use the `distance` argument: the name `distanceFunction` is not
    # defined in this function.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distance)
    clusters = clusterer.fit_predict(vectors)
    return clusters