def termdocumentmatrix(train_path, test_path, cnn=True):
    tdm = textmining.TermDocumentMatrix()
    tdm, train_labels = add_doc(tdm, train_path)
    tdm, test_labels = add_doc(tdm, test_path)
    tdm_rows = [x for x in tdm.rows()]
    words = tdm_rows[0]
    word_index_dict = {}
    for i in range(0, len(words)):
        word_index_dict[words[i]] = i
    train_len, tdm_rows = add_senti_score(train_path, tdm_rows, word_index_dict)
    test_len, tdm_rows = add_senti_score(test_path, tdm_rows, word_index_dict)
    train_tdm = np.asarray(tdm_rows[1:train_len + 1])
    test_tdm = np.asarray(tdm_rows[train_len + 1:train_len + test_len + 1])
    train_labels = np.asarray(train_labels)
    if cnn:
        train_tdm = reshapeX(train_tdm)
        test_tdm = reshapeX(test_tdm)
        train_labels = reshapeY(train_labels)
    test_labels = np.asarray(test_labels)
    return train_labels, train_tdm, test_labels, test_tdm, words
def term_document_matrix():
    num_lines = 0
    for line in fileinput:
        num_lines = num_lines + 1
    reading_file_info = [item.rstrip('\n') for item in fileinput]
    tdm = textmining.TermDocumentMatrix()
    for i in range(0, num_lines):
        tdm.add_doc(reading_file_info[i])
    tdm.write_csv('TermDocumentMatrix.csv', cutoff=1)
    temp = list(tdm.rows(cutoff=1))
    vocab = tuple(temp[0])
    x = np.array(temp[1:])
    mu = random((num_lines, 3))
    fcm = p.FuzzyCMeans(x, mu, 2)
    print fcm.mu
    model = lda.LDA(n_topics=15, n_iter=50, random_state=1)
    model.fit(x)
    topic_word = model.topic_word_
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        fileoutput.write('Topic {}: {}\n'.format(i, ' '.join(topic_words)))
    fileoutput.close()
def termdocumentmatrix_example():
    path = "/Users/franciscojavierarceo/MyPrograms/Python/"
    os.chdir(path)
    # Create some very short sample documents
    doc1 = 'John and Bob are brothers.'
    doc2 = 'John went to the store. The store was closed.'
    doc3 = 'Bob went to the store too.'
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix()
    # Add the documents
    tdm.add_doc(doc1)
    tdm.add_doc(doc2)
    tdm.add_doc(doc3)
    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('matrix.csv', cutoff=1)
    print tdm
    # Instead of writing out the matrix you can also access its rows directly.
    # Let's print them to the screen.
    for row in tdm.rows(cutoff=1):
        print row
def termdoc(dicto):
    docs = []
    cuisines = ['cuisine']
    # Add data to the TDM and remove unicode from cuisine names
    for entry in dicto:
        docs.append(dicto[entry])
        cuisines.append(entry.encode('ascii', 'ignore'))
    # Use of textmining library to obtain TDM
    tdm = textmining.TermDocumentMatrix()
    for doc in docs:
        tdm.add_doc(doc)
    matrix_file = 'matrix.csv'
    # Remove 'matrix.csv' if it already exists
    try:
        os.remove(matrix_file)
    except OSError:
        pass
    # Write frequencies of all ingredients in each cuisine
    for row, cuisine in zip(tdm.rows(cutoff=1), cuisines):
        with open(matrix_file, 'ab') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([cuisine] + row)
def __GetDataSet__(self):
    docTermMatrix = []
    vocab = []
    docIds = []
    tdm = textmining.TermDocumentMatrix()
    for i in range(len(self._docId_comma_documentsText_tuple_list)):
        print i, " extraction.. "
        docId, documentText = self._docId_comma_documentsText_tuple_list[i]
        tdm.add_doc(documentText.lower())
        # docIds.append(docId)
    i = -1
    for row in tdm.rows(cutoff=3):
        row = row[0:18000]
        i += 1
        # First row of the document-term matrix is the vocabulary
        if i == 0:
            for vocabWord in row:
                vocab.append(vocabWord)
            continue
        print i, "Loading from tdm matrix.."
        # Remaining rows are per-document term counts
        docTermMatrix.append(row)
    print "Converting doc-term matrix to numpy array..."
    return np.array(docTermMatrix), vocab, docIds
def cosine_similarity(list_of_file_paths):
    # Convert each file to plain text
    doc_list = [filehandler.convert_to_txt(file_path)
                for file_path in list_of_file_paths]
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix(
        tokenizer=simple_tokenize_remove_our_stopwords)
    for doc in doc_list:
        tdm.add_doc(doc)
    results = []
    is_first_row1 = True
    for row1 in tdm.rows(cutoff=1):
        if is_first_row1:
            is_first_row1 = False
            continue
        is_first_row2 = True
        cols = []
        for row2 in tdm.rows(cutoff=1):
            if is_first_row2:
                is_first_row2 = False
                continue
            cols.append(1 - spatial.distance.cosine(row1, row2))
        results.append(cols)
    return results
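# Hypothetical usage of cosine_similarity() above -- not from the original
# project. It assumes filehandler.convert_to_txt() and the custom
# simple_tokenize_remove_our_stopwords tokenizer are importable, and that the
# file paths below exist.
similarities = cosine_similarity(['report_a.txt', 'report_b.txt'])
# similarities[i][j] is the cosine similarity between document i and document j
print(similarities[0][1])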
def termdocumentmatrix_example():
    tdm = textmining.TermDocumentMatrix()
    for i in range(0, len(vec_dictionary)):
        tdm.add_doc(vec_dictionary[i][1])
    for row in tdm.rows(cutoff=50):
        D.append(row)
def termdocumentmatrix_example():
    # Build a term-document matrix from the .txt files in ./corpus
    tdm = textmining.TermDocumentMatrix()
    mypath = "./corpus"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    documents = open('documents.csv', 'w')
    writer = csv.writer(documents)
    writer.writerow(('document_name', 'content'))
    pattern = r"(cid)+"
    for f in onlyfiles:
        if f[-4:] == ".txt":
            doc = open(mypath + "/" + f, 'r')
            txt = doc.read().replace(',', '')
            txt = ' '.join(txt.split())
            # Strip "(cid)" PDF-extraction artifacts; the original dropped the
            # result of re.sub, so the substitution had no effect.
            txt = re.sub(pattern, "", txt)
            writer.writerow((f, txt))
            tdm.add_doc(txt)
            doc.close()
    documents.close()
    # Write out the matrix to a csv file. With the default cutoff=2, only
    # words which appear in 2 or more documents are included in the output.
    tdm.write_csv('matrix.csv', cutoff=2)
def term_document_matrix():
    # Count the number of lines in the text document
    num_lines = 0
    for line in fileinput:
        num_lines = num_lines + 1
    reading_file_info = [item.rstrip('\n') for item in fileinput]
    # Build the term-document matrix line by line; tokenization is handled
    # by the library itself
    tdm = textmining.TermDocumentMatrix()
    for i in range(0, num_lines):
        tdm.add_doc(reading_file_info[i])
    tdm.write_csv('TermDocumentMatrix.csv', cutoff=1)
    temp = list(tdm.rows(cutoff=1))  # all rows of the term-document matrix
    vocab = tuple(temp[0])           # first row holds the vocabulary
    x = np.array(temp[1:])           # remaining rows hold the term counts
    # Fuzzy c-means clustering: random initial memberships, one row per
    # document and one column per cluster (6 here; adjust as needed)
    mu = random((num_lines, 6))
    fcm = p.FuzzyCMeans(x, mu, 2)
    num_arra = fcm.mu
    # Normalize each row of the membership matrix so that it sums to 1
    summation = num_arra.sum(axis=1)
    summation_vertical = summation[:, None]
    rows = num_arra.shape[0]
    columns = num_arra.shape[1]
    num_arra = num_arra.astype(float)
    for rows_count in range(0, rows):
        divide_sum = summation_vertical.item(rows_count, 0)
        for i in range(0, columns):
            replace_division = num_arra.item(rows_count, i) / divide_sum
            num_arra[rows_count, i] = replace_division
    print num_arra               # cluster memberships; each row sums to 1
    print num_arra.sum(axis=1)
    # LDA implementation
    model = lda.LDA(n_topics=2, n_iter=10, random_state=2)
    model.fit(x)
    topic_word = model.topic_word_
    n_top_words = 11
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        fileoutput.write('Topic {}: {}\n'.format(i, ' '.join(topic_words)))
    fileoutput.close()
def summarize(text):
    # prepare text
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    tdm = textmining.TermDocumentMatrix()
    tdm.add_doc(newtext)
    for index, row in enumerate(tdm.rows(cutoff=1)):
        if index == 0:
            words = row
        if index == 1:
            count = row
    # filter stop words
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    freq = [(w, count[index]) for index, w in enumerate(words)
            if w not in stopwords]
    freq.sort(reverse=True)
    # Concordance
    most_freq_words = freq[:10]
    summary = []
    h = histogram(lines, most_freq_words)
    rowcount = threshold(h)
    summary = [(index, line) for index, line in enumerate(lines)
               if index < rowcount]
    summary.sort()
    ret = [line[1] for line in summary]
    print '.'.join(ret)
    return ret
def test_tdm_df(self):
    tdm = txm.TermDocumentMatrix()
    for doc in self.doclist:
        tdm.add_doc(doc)
    l = [r for r in tdm.rows(cutoff=1)]
    df = pd.DataFrame(np.array(l[1:]), columns=l[0])
    result = tdm_df(self.doclist, remove_punctuation=False)
    assert_frame_equal(result, df)
def createTermDocM(C, name):
    tdm = textmining.TermDocumentMatrix()
    [tdm.add_doc(C[x]) for x in C]
    tdm.write_csv('%s .csv' % name)
    names = [x.encode('utf-8') for x in C]
    f = open("authnames.txt", 'w')
    f.writelines([name + '\n' for name in names])
    f.close()
    return tdm
def main():
    # This file should only include rows of text. Be careful of mid-string
    # linebreaks!
    with open("train_plus_test_reviews.csv", "r") as f:
        tdm = textmining.TermDocumentMatrix()
        for line in f:
            tdm.add_doc(line)
    # Only include words which appear in 2+ documents
    tdm.write_csv('matrix.csv', cutoff=2)
def texts2matrix(texts, titles, fname='dtm.csv'):
    M = textmining.TermDocumentMatrix()
    for text in texts:
        M.add_doc(text)
    M.write_csv(fname, cutoff=3)
    tname = fname.split('.')[0] + '_filename.txt'
    f = open(tname, 'w')
    for i in titles:
        f.write("%s\n" % i)
    print "matrix saved as " + fname + " and filenames as " + tname
def termdocumentmatrix_example(inputPath, inputFile, outputPath, outputFile):
    with open(inputPath + inputFile, 'rb') as f:
        tdm = textmining.TermDocumentMatrix()
        count = 1
        for line in f:
            vals = line.split('^')
            try:
                tdm.add_doc(vals[0])
            except IndexError, e:
                print str(count) + "th row data format error"
            count = count + 1
def create_termdocument_matrix(self, tokenzied=False):
    """
    Creates a term-document matrix of the frequencies of each term in each
    document held in self.docs.
    :return:
    """
    self.tdm = tm.TermDocumentMatrix()
    for doc in self.docs:
        if not tokenzied:
            self.tdm.add_doc(doc)
        else:
            self.tdm.add_tokenized_doc(doc)
def create_frequency_matrix(documents, cutoff=2, path_to_save=None):
    # x = np.array([1, 1, 1, 2, 2, 2, 5, 25, 1, 1])
    # y = np.bincount(x)
    # ii = np.nonzero(y)[0]
    # return zip(ii, y[ii])
    tdm = textmining.TermDocumentMatrix()
    for doc in documents:
        tdm.add_doc(doc)
    if path_to_save is not None:
        tdm.write_csv(path_to_save, cutoff=cutoff)
    return tdm.rows(cutoff=cutoff)
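# A minimal, hypothetical call to create_frequency_matrix() above (not part of
# the original snippet). With cutoff=1 every word is kept; the first row
# returned is the vocabulary and the remaining rows are per-document counts.
rows = list(create_frequency_matrix(['the cat sat', 'the cat ran', 'a dog barked'],
                                    cutoff=1))
vocab, counts = rows[0], rows[1:]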
def prepare_data_2(clean_reviews=[]):
    '''Prepares reviews by creating the LDA corpus.'''
    tdm = textmining.TermDocumentMatrix()
    for doc1 in clean_reviews:
        tdm.add_doc(doc1)
    temp = list(tdm.rows(cutoff=2))
    vocab = tuple(temp[0])
    X = np.array(temp[1:])
    return X, vocab
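# Hypothetical follow-on for prepare_data_2() above (not from the original
# code), mirroring how other snippets in this collection feed X and vocab into
# the lda package; the review strings are made up for illustration.
X, vocab = prepare_data_2(clean_reviews=['great phone great battery',
                                         'battery dies fast',
                                         'great screen good battery'])
model = lda.LDA(n_topics=2, n_iter=100, random_state=1)
model.fit(X)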
def most_frequent_terms(*args):
    tdm = textmining.TermDocumentMatrix(simple_tokenize_remove_our_stopwords)
    for doc in args:
        tdm.add_doc(doc)
    freqs = []
    for d in tdm.sparse:
        f = [(freq, name) for (name, freq) in list(d.items())]
        f.sort(reverse=True)
        freqs.append(f)
    return freqs
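# Hypothetical usage of most_frequent_terms() above (not from the original
# code); it assumes the custom simple_tokenize_remove_our_stopwords tokenizer
# is defined. One list of (count, term) pairs, sorted by descending count, is
# returned per document passed in.
freqs = most_frequent_terms('apples and oranges and apples',
                            'bananas bananas and apples')
top_count, top_term = freqs[0][0]   # most frequent term in the first document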
def tdm_df(doclist, stopwords=[], remove_punctuation=True,
           remove_digits=True, sparse_df=False):
    '''
    Create a term-document matrix from a list of e-mails.

    Uses the TermDocumentMatrix function in the `textmining` module, but
    pre-processes the documents to remove digits and punctuation, and
    post-processes to remove stopwords, to match the functionality of R's
    `tm` package.

    NB: This is not particularly memory efficient and you can get memory
    errors with an especially long list of documents.

    Returns a (optionally sparse) DataFrame. Each column is a term, each
    row is a document.
    '''
    # Create the TDM from the list of documents.
    tdm = txtm.TermDocumentMatrix()
    for doc in doclist:
        if remove_punctuation == True:
            doc = doc.translate(None, string.punctuation.translate(None, '"'))
        if remove_digits == True:
            doc = doc.translate(None, string.digits)
        tdm.add_doc(doc)
    # Push the TDM data to a list of lists, then make that an ndarray,
    # which then becomes a DataFrame.
    tdm_rows = []
    for row in tdm.rows(cutoff=1):
        tdm_rows.append(row)
    tdm_array = np.array(tdm_rows[1:])
    tdm_terms = tdm_rows[0]
    df = DataFrame(tdm_array, columns=tdm_terms)
    # Remove stopwords from the dataset, manually.
    # TermDocumentMatrix does not do this for us.
    if len(stopwords) > 0:
        for col in df:
            if col in stopwords:
                del df[col]
    if sparse_df == True:
        # Assign the result; the original call discarded it, so sparse_df had
        # no effect.
        df = df.to_sparse(fill_value=0)
    return df
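# Hypothetical Python 2 usage of tdm_df() above (not from the original
# module); the e-mail strings and stopword list are made up for illustration.
emails = ['Meeting at 10, bring the Q3 report.',
          'Lunch? The usual place.']
df = tdm_df(emails, stopwords=['the', 'at'], remove_digits=True)
print df.shape   # (2 documents, number of distinct remaining terms)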
def __init__(self):
    self.docs = None
    self.X = None
    self.features = None
    self.tdm = textmining.TermDocumentMatrix()
    self.models_list = []
    self.k_list = None
    self.topics_n = None
    self.topic_labels = []
    self.models_matrix = None
    self.cos_X = None
    self.cos_list = []
    self.ftps = []
def count_terms(site_dict):
    terms_matrix = textmining.TermDocumentMatrix()
    for site, text in site_dict.items():
        terms_matrix.add_doc(text)
    terms_df = pd.DataFrame(terms_matrix.rows())
    terms_df.columns = terms_df.iloc[0]
    terms_df = terms_df[1:]
    terms_df.index = site_dict.keys()
    terms_df.index.name = 'site'
    terms_df = terms_df.T
    terms_df.index.name = 'term'
    terms_df = terms_df.reset_index()
    return terms_df
def tdm_df(doclist):
    tdm = textmining.TermDocumentMatrix()
    if len(doclist) > 0:
        for doc in doclist:
            tdm.add_doc(doc)
    tdm_rows, occurrence = [], []
    for rows in tdm.rows():
        tdm_rows.append(rows)
    tdm_array = np.array(tdm_rows[1:])
    tdm_terms = tdm_rows[0]
    df = pd.DataFrame(tdm_array, columns=tdm_terms)
    return df
def preprocess(inputFile, f_name):
    # Read the text file
    file = open(inputFile, 'r')
    text = file.read()
    text = text.replace('\n', ' ')
    # Number of words in the text
    words_count = len(word_tokenize(strip_punctuation(text)))
    # Split into sentences and store the sentences in a list
    sentences = tokenize.sent_tokenize(text)
    # Original sentences
    sentences_backup = list(sentences)
    filtered_sentences = []
    # Apply stop word removal to each sentence
    stop_words = set(stopwords.words('english'))
    for i in range(len(sentences_backup)):
        temp = []
        word_tokens = word_tokenize(strip_punctuation(sentences_backup[i]))
        for w in word_tokens:
            if w.lower() not in stop_words:
                temp.append(w.lower())
        filtered_sentences.append(temp)
    tdm = textmining.TermDocumentMatrix()
    for i in range(len(sentences)):
        sent = " ".join(filtered_sentences[i])
        tdm.add_doc(sent)
    temp = list(tdm.rows(cutoff=1))
    vocab = tuple(temp[0])
    X = np.array(temp[1:], dtype='float64')
    X1 = X.transpose()
    fileObj = ".\\Pre_Processed\\" + f_name.replace('.txt', '') + ".csv"
    np.savetxt(fileObj, X1, fmt='%1.5f', delimiter=",")
    vocab1 = tuple(zip(vocab))
def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())
    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('/Users/XW/Desktop/datascience.stackexchange.com/answer.csv',
                  cutoff=1)  # write out the result
def fitLDA(self, nTopics, nTopWords):
    # Fit LDA model
    topicsList = []
    tdm = textmining.TermDocumentMatrix(
        tokenizer=textmining.simple_tokenize_remove_stopwords)
    for index, row in self.typeData.iterrows():
        if isinstance(row["Title/Description"], basestring):
            tdm.add_doc(row["Title/Description"])
    temp = list(tdm.rows(cutoff=1))
    vocab = tuple(temp[0])
    X = np.array(temp[1:])
    self.model = lda.LDA(n_topics=nTopics, n_iter=500, random_state=1)
    self.model.fit_transform(X)
    topicWord = self.model.topic_word_  # model.components_ also works
    topWords = nTopWords
    for i, topic_dist in enumerate(topicWord):
        topicWords = np.array(vocab)[np.argsort(topic_dist)][:-topWords:-1]
        topicsList.append(topicWords)
    return topicsList
def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())
    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv',
                  cutoff=1)  # write out the result
    # Instead of writing out the matrix you can also access its rows directly.
    # Let's print them to the screen.
    for row in tdm.rows(cutoff=1):
        print row
def construct_doc_term_matrix(tok_folders, indices):
    '''
    Take a list of paths to folders where the tok.mda file is stored, and return:
    1. the numpy NumDoc x NumVocab document-term matrix, with documents
       ordered by folder order and then by CUSIP number
    2. the number of documents in the last folder
    Folders should not end with "/".
    Folders should be ordered by time.
    '''
    tm_tdm = textmining.TermDocumentMatrix()
    document_count = 0
    document_last_count = 0
    for i in range(len(tok_folders)):
        tok_folder = tok_folders[i]
        tokfile_list = os.listdir(tok_folder)
        tokfile_list.sort()
        document_last_count = 0
        index = indices[i]
        for j in range(len(tokfile_list)):
            if j not in index:
                continue
            tokfile_name = tokfile_list[j]
            with open(tok_folder + "/" + tokfile_name) as tokfile:
                line = tokfile.readline()
                # in the original data, # refers to numbers
                line = re.sub('[#]', 'number', line)
                tm_tdm.add_doc(line)
                document_count += 1
                document_last_count += 1
    np_tdm = 0
    row_index = -1
    vocab = []
    for row in tm_tdm.rows(cutoff=1):
        if row_index < 0:
            np_tdm = np.zeros(shape=(document_count, len(row)))
            vocab = row[:]
        else:
            np_tdm[row_index] = row
        row_index += 1
    return np_tdm, vocab, document_last_count
def tdm_df(doclist, stopwords=[], remove_punctuation=True,
           remove_digits=True, sparse_df=False):
    """
    Create a term-document matrix from a list of e-mails.

    Uses the TermDocumentMatrix function in the `textmining` module, but
    pre-processes the documents to remove digits and punctuation, and
    post-processes to remove stopwords, to match the functionality of R's
    `tm` package.
    """
    tdm = txtm.TermDocumentMatrix()
    for doc in doclist:
        if remove_punctuation == True:
            translator_pun = str.maketrans('', '', string.punctuation)
            doc = doc.translate(translator_pun)
        if remove_digits == True:
            translator_digt = str.maketrans('', '', string.digits)
            doc = doc.translate(translator_digt)
        tdm.add_doc(doc)
    # Push the TDM data to a list of lists, then make that an ndarray,
    # which then becomes a DataFrame.
    tdm_rows = []
    for row in tdm.rows(cutoff=1):
        tdm_rows.append(row)
    tdm_array = np.array(tdm_rows[1:])
    tdm_terms = tdm_rows[0]
    df = DataFrame(tdm_array, columns=tdm_terms)
    # Remove stopwords manually; TermDocumentMatrix does not do this for us.
    if len(stopwords) > 0:
        for col in df:
            if col in stopwords:
                del df[col]
    if sparse_df == True:
        # Assign the result; the original call discarded it, so sparse_df had
        # no effect.
        df = df.to_sparse(fill_value=0)
    return df
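# Hypothetical Python 3 usage of this tdm_df() variant (not from the original
# module); the documents and stopword list are made up for illustration.
docs = ['Report due Friday, call me.',
        'Call the office about the report.']
df = tdm_df(docs, stopwords=['the'], remove_punctuation=True, remove_digits=True)
print(df.columns.tolist())   # terms that survive punctuation/digit/stopword removal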
def create_keyword_table():
    with open(os.path.join(get_paths()['working_dir'], 'Paper.csv'),
              'r', encoding="utf-8") as paper_file:
        paper = csv.reader(paper_file)
        paper_column = paper.__next__()
        tdm = textmining.TermDocumentMatrix()
        papers = []
        for x in paper:
            papers.append(x)
            tdm.add_doc(re.sub(r"[^A-Za-z0-9 _]", " ",
                               ' '.join([str(x[1]), str(x[5])]),
                               flags=re.UNICODE))
    keywords = []
    cutoff = 30
    stopwords = textmining.stopwords
    stopwords.update(['key', 'words', 'keywords', 'keyword', 'word'])
    for (paper_i, tdm_i) in zip(papers, tdm.sparse):
        id_paper = paper_i[0]
        year = paper_i[2]
        id_conference = paper_i[3]
        id_journal = paper_i[4]
        paper_words = [[id_paper, year, id_conference, id_journal, word]
                       for word in tdm_i.keys()
                       if tdm.doc_count[word] >= cutoff and word not in stopwords]
        keywords.extend(paper_words)
    with open(os.path.join(get_paths()['working_dir'], 'keywords.csv'),
              'w', encoding="utf-8") as keyword_file:
        keyword = csv.writer(keyword_file)
        keyword.writerow(('paperid', 'year', 'conferenceid', 'journalid', 'keyword'))
        for x in keywords:
            keyword.writerow(x)