def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, max_features=n_features,
                                 min_df=2, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])
    lda = LDA(n_topics=n_topics, kappa=0.7, tau0=1024., n_jobs=4,
              random_state=0)
    feature_names = vectorizer.get_feature_names()
    start_time = time.clock()
    lda.fit(doc_word_count)
    end_time = time.clock()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print('run time = %.3f seconds' % (end_time - start_time))
def get_lda_feats(self, args, group_key, indiv_key):
    d = {}
    for ii in glob.iglob(args.input_big5_folder + '/*.csv'):
        print '\n--> Adding lda data from file: %s' % (ii)
        # Open gzipped files transparently. (The original compared
        # os.path.basename() to 'gz', which never matches a full filename.)
        if ii.endswith('.gz'):
            csvobj = csv.reader(gzip.open(ii), delimiter=',')
        else:
            csvobj = csv.reader(open(ii, 'rb'), delimiter=',')
        header = csvobj.next()
        ind_group = header.index(group_key)  # year
        ind_indiv = header.index(indiv_key)  # id
        for jj in csvobj:
            try:
                group = jj[ind_group].strip()
                if group not in d:
                    d[group] = []
                ldaa = LDA(args.feat_folder + '/' + 'lda-' + args.num_topics +
                           '.' + self.feat_name + '-' + group + '.post')
                indiv = jj[ind_indiv].strip()
                d[group].append(ldaa.posterior_feats(indiv))
                print indiv
            except Exception:
                continue
    return d
def configure(config):
    global lda, train_topics
    lda = LDA(config['lda_model'], config['lda_dict'])
    lda.load()
    logging.info('Loading training corpus topics')
    with open(config['train_topics']) as fp:
        train_topics = cPickle.load(fp)
    logging.info('Read topics for %d sentences', len(train_topics))
def main():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, valid_split=0.1, n_iter=10)
    perplexity = model.perplexity(corpus.docs)
    print perplexity
def extract_aspects_for_reviews_v1(topic_num=10):
    '''
    Load the model from file, then obtain topics for every review.
    Some terms may not be in model.id2term; this version ignores them.
    '''
    raw_review_filename = 'raw_reviews.ldapre'
    raw_texts = load_reviews(raw_review_filename)
    # train LDA
    #lda_model = LDA(K=K, doc_set=raw_texts)
    #lda_model.train()
    #lda_model.save(yelp_dir + 'review_t%s.lda' % K)
    # return the topic for every review
    version = 'v1'
    model_filename = yelp_dir + 'lda_%s/review_t%s.lda' % (version, topic_num)
    model = LDA(model_filename=model_filename, load_from_file=True)
    res_dir = yelp_dir + 'aspects/lda_%s/' % version
    res_filename = 'review_topic%s.res' % topic_num
    fw = open(res_dir + res_filename, 'w+')
    fw.write('#review_id\trate\ttopic_res\traw_text\n')
    model_topic_filename = 'topic%s.res' % topic_num
    tn = 0.0
    start = time.time()
    for ind, t in enumerate(raw_texts):
        rid = t[0]
        rate = float(t[1])
        terms = t[2:]
        topic_ids = model.get_document_topics(terms)
        tn += len(topic_ids)
        topic_str = '|'.join(['%s,%s' % (t, round(p, 4))
                              for t, p in topic_ids])
        line = '%s\t%s\t%s\t%s' % (unicode2str(rid), rate,
                                   unicode2str(topic_str),
                                   '\t'.join([unicode2str(t) for t in terms]))
        fw.write(line + '\n')
        if (ind + 1) % 100000 == 0:
            print 'cost %.1fmin in this round, processed %s review:\n%s\n' % (
                (time.time() - start) / 60.0, ind + 1, line)
            start = time.time()
    fw.close()
    topics_res = model.print_topics(topic_num)
    fw = open(res_dir + model_topic_filename, 'w+')
    fw.write('\n'.join(['%s,%s' % (t, unicode2str(r)) for t, r in topics_res]))
    fw.close()
    print 'finish extracting aspects for %s reviews (avg=%s), saved in %s, corpus topics in %s' % (
        len(raw_texts), tn / len(raw_texts), res_filename, model_topic_filename)
class TopicModelingLDA(object):
    # Wrapper around the LDA library.
    # Characterizes topics using several scores found in the literature.
    def __init__(self, corpus, metrics_criteria='simple'):
        super(TopicModelingLDA, self).__init__()
        self.corpus = corpus
        self.select_metric_criteria(metrics_criteria)
        self.model = None
        self.topic_words = None
        self.top_words = None
        self.all_words = []

    def fit(self, num_topic=5, n_iter=1500):
        count_vect = CountVectorizer()
        x_train_counts = count_vect.fit_transform(self.corpus)
        self.model = LDA(n_topics=num_topic, n_iter=n_iter, random_state=1)
        self.model.fit(x_train_counts)
        self.topic_words = self.model.topic_word_
        self.vocabulary = count_vect.get_feature_names()

    def select_metric_criteria(self, metrics_criteria):
        if metrics_criteria == 'term_score':
            self.metrics = TopicTermScore()
        else:
            self.metrics = TopicSimpleScore()

    def get_highest_scores(self, k_top=10):
        # topic_words is a (num_topics, num_words) matrix;
        # row k holds the word distribution of topic k.
        num_topics = len(self.topic_words)
        print("Number of topics", num_topics)
        self.top_words = {}
        for topic_k in range(num_topics):
            scores = []
            for v, word in enumerate(self.vocabulary):
                score = self.metrics.calculate(self.topic_words, topic_k, v)
                scores.append((word, score))
            scores.sort(key=lambda tup: tup[1])
            scores = scores[-k_top:]
            print("Topic %d" % topic_k)
            for word, score in scores:
                print("%s,%.4f" % (word, score))
            print("")
            self.top_words[topic_k] = [{'word': word, 'score': score}
                                       for word, score in scores]
            self.all_words += [word for word, score in scores]
        return self.top_words

    def get_all_words(self):
        return self.all_words
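# The 'term_score' branch above relies on a TopicTermScore class that is not
# shown. A minimal sketch, assuming it implements the term score of Blei and
# Lafferty (which down-weights words that are probable in every topic); the
# class shape and calculate() signature are inferred from the call site above,
# not taken from the original source.
import numpy as np

class TopicTermScore(object):
    """Term score: beta[k, v] * log(beta[k, v] / geometric mean over topics)."""

    def calculate(self, topic_words, topic_k, v):
        # Per-topic probabilities of word v, floored to avoid log(0).
        col = np.maximum(np.asarray(topic_words)[:, v], 1e-12)
        geo_mean = np.exp(np.log(col).mean())
        return col[topic_k] * np.log(col[topic_k] / geo_mean)

class TopicSimpleScore(object):
    """Simple score: the raw topic-word probability."""

    def calculate(self, topic_words, topic_k, v):
        return topic_words[topic_k][v]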
def DocIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    docIndex, docMaxCount = lda.GetDocIndex()
    return core.GenerateResponse(lda.params, {
        'docCount': len(docIndex),
        'docMaxCount': docMaxCount,
        'DocIndex': docIndex
    })
def TermIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    termIndex, termMaxCount = lda.GetTermIndex()
    return core.GenerateResponse(lda.params, {
        'termCount': len(termIndex),
        'termMaxCount': termMaxCount,
        'TermIndex': termIndex
    })
def TopicIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    topicIndex, topicMaxCount = lda.GetTopicIndex()
    return core.GenerateResponse(lda.params, {
        'topicCount': len(topicIndex),
        'topicMaxCount': topicMaxCount,
        'TopicIndex': topicIndex
    })
def generate_topics(self):
    file_to_tokens = self._get_normalized_corpus(self.files)
    np_matrix = self._get_document_term_matrix(file_to_tokens)
    model = LDA(n_topics=self.n_topics, n_iter=self.n_iter,
                random_state=self.random_state)
    model.fit(np_matrix)
    self._lda_model = model
def test_aggregate(self):
    data = [['123', 'some text'], ['123', 'some more text'],
            ['123', 'evn more text'], ['456', 'some stuff'],
            ['456', 'some more stuff'], ['456', 'even more stuff'],
            ['789', 'just a little thing.']]
    df = pd.DataFrame(data=data, columns=['asin', 'review_text'])
    bcr = LDA()
    result = bcr.aggregate_df(df, 'asin', 'review_text')
    # This should aggregate to only three rows.
    self.assertEqual(len(result), 3)
    print(result)
def test_there_is_data(self):
    bcr = LDA()
    categories, category_ids, asin_counts = bcr.get_categories('category.csv')
    for category, asin_count in zip(categories, asin_counts):
        dataset = bcr.get_dataset('category.csv', category)
        print("Dataset {} has {} records.".format(category, len(dataset)))
        self.assertEqual(len(dataset), asin_count)
        print("Displaying the first 10 elements in the dataset...")
        print(dataset.head(10))
def train(request):
    if request.is_ajax():
        if request.method == 'GET':
            print 'train func'
            db = StateModel(state_name='lock_model', status=1)
            db.save()
            file_train = (settings.STATICFILES_DIRS[0] +
                          'main_app/media/train_data.csv')
            lda = LDA(75, 3)
            lda.file_input(file_train)
            lda.training()
            lda.save_model(settings.STATICFILES_DIRS[0])
            db = StateModel(state_name='lock_model', status=0)
            db.save()
    return HttpResponse('OK')
def main(model, dic, corpus, output):
    logging.basicConfig(level=logging.INFO)
    lda = LDA(model, dic)
    lda.load()
    topics = []
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing topic vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()],
                         maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
            topics.append(lda.topic_vector(sentence.split()))
    logging.info('Saving topic information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(np.vstack(topics), fp, protocol=cPickle.HIGHEST_PROTOCOL)
def train_acc(data_path, algorithm_name):
    print(data_path)
    x, y, test_x, test_y = data.run(data_path)
    clf = None
    if algorithm_name == "gnb":
        clf = GNB()
        print("gnb instance.")
    elif algorithm_name == "lda":
        clf = LDA()
        print("lda instance.")
    elif algorithm_name == "qda":
        clf = QDA()
        print("qda instance.")
    else:
        print("NO Implement")
        return "NO Implement"
    num = 0
    clf.fit(x, y)
    train_result = clf.predict(x)
    for i in range(len(train_result)):
        if train_result[i] == y[i]:
            num += 1
    return num / len(y)
def plot_unsmoothed():
    corpus, T = generate_corpus()
    L = LDA(T)
    L.train(corpus, verbose=False)
    fig, axes = plt.subplots(1, 2)
    ax1 = sns.heatmap(L.beta, xticklabels=[], yticklabels=[], ax=axes[0])
    ax1.set_xlabel("Topics")
    ax1.set_ylabel("Words")
    ax1.set_title("Recovered topic-word distribution")
    ax2 = sns.heatmap(L.gamma, xticklabels=[], yticklabels=[], ax=axes[1])
    ax2.set_xlabel("Topics")
    ax2.set_ylabel("Documents")
    ax2.set_title("Recovered document-topic distribution")
    plt.savefig("img/plot_unsmoothed.png", dpi=300)
    plt.close("all")
def lda(fname, indF, nTopics=20, iterations=50, fmax=math.inf,
        ofhead='cancer_py_cust_gvLDA_'):
    cts = pd.read_csv(fname + '.csv', header=0, index_col=0, dtype={0: str})
    ind = pd.read_csv(indF + '.csv', header=None)
    patID = cts.index
    gvID = cts.columns
    rows = np.where(ind > 0)[0]
    splits = np.max(np.array(ind))
    patID = patID[rows]
    phi = cts.iloc[rows]
    ind = ind.iloc[rows, 0]
    for i in range(1, splits + 1):
        ofname = ofhead + str(nTopics) + '_' + str(i)
        # training set
        rowsT = np.where(ind != i)
        X = np.asarray(phi.iloc[rowsT])
        cols = X.sum(axis=0) < fmax
        X = X[:, cols]
        # validation set
        rowsV = np.where(ind == i)
        X_test = np.asarray(phi.iloc[rowsV])
        X_test = X_test[:, cols]
        lda = LDA(nTopics)
        patTop, gvTop = lda.train(X, iters=iterations)
        ofname = 'data/' + ofname
        gvTop = pd.DataFrame(gvTop)
        gvTop.columns = np.asarray(gvID)[cols]
        gvTop.to_csv(ofname + '_genes.csv')
        pd.DataFrame(lda.alpha).to_csv(ofname + '_alpha.csv')
        patTop = pd.DataFrame(patTop)
        patTop.index = patID[rowsT]
        patTop.to_csv(ofname + '_train.csv')
        patTop = lda.predict(X_test, iters=iterations)
        patTop = pd.DataFrame(patTop)
        patTop.index = patID[rowsV]
        patTop.to_csv(ofname + '_valid.csv')
def _getLDA(text, label, n_topic_words):
    vectorizer = CountVectorizer(min_df=100, max_df=5000)
    df = vectorizer.fit_transform(text)
    tfidf_word_name = vectorizer.get_feature_names()
    model = LDA(n_topics=20, n_iter=1000, random_state=1)
    model.fit(df)
    Dump(model, 'LDA_model', 'joblib')
    topic_word = model.topic_word_
    doc_topic = model.doc_topic_
    with open('topic_word.txt', 'w') as f:
        n_top_words = 300
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tfidf_word_name)[
                np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {}: {}'.format(i, ' '.join(topic_words)) + '\n')
    return topic_word, doc_topic
def __init__(self, docs, K, alpha, eta):
    LDA.__init__(self, docs, K, alpha, eta)
    ### Gibbs sampler related data structures ###
    # C_VK[w,k] := number of times word w is assigned to topic k
    self.C_VK = np.zeros((self.V, self.K), dtype=int)
    # C_DK[d,k] := number of times topic k is present in document d
    self.C_DK = np.zeros((self.D, self.K), dtype=int)
    # Cache these values as we go (equivalent to performing column sums
    # of the matrices above).
    # For each document, total number of topics assigned
    self.total_topics_per_doc = np.zeros(self.D)
    # For each topic, total number of words assigned to it
    self.total_words_per_topic = np.zeros(self.K)
    # Save results here
    self.log_prob = []
    self.samples = []
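# The counters above are standard collapsed-Gibbs bookkeeping. A minimal
# sketch of how they would stay in sync when a token's topic assignment
# changes; the _unassign/_assign helper names are hypothetical, not part of
# the original class.
def _unassign(self, d, w, k):
    # Remove word w of document d from topic k before resampling.
    self.C_VK[w, k] -= 1
    self.C_DK[d, k] -= 1
    self.total_topics_per_doc[d] -= 1
    self.total_words_per_topic[k] -= 1

def _assign(self, d, w, k):
    # Record the newly sampled topic k for word w of document d.
    self.C_VK[w, k] += 1
    self.C_DK[d, k] += 1
    self.total_topics_per_doc[d] += 1
    self.total_words_per_topic[k] += 1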
def learn_topics(textpath, topicnum):
    with open(textpath) as f:
        texts = f.readlines()
    # Get vocabulary and word counts. Use lowercase unigrams with at least
    # 3 alphabetical, non-numeric characters; punctuation is treated as a
    # separator.
    CVzer = CountVectorizer(token_pattern=r"(?u)\b[^\W\d]{3,}\b",
                            max_features=None, lowercase=True)
    doc_vcnts = CVzer.fit_transform(texts)
    vocabulary = CVzer.get_feature_names()
    # Learn topics. `refresh` controls the print frequency.
    lda_model = LDA(topicnum, n_iter=8000, refresh=2000)
    doc_topic = lda_model.fit_transform(doc_vcnts)
    topic_word = lda_model.topic_word_
    return doc_topic, topic_word, vocabulary
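# A short usage sketch for learn_topics(), assuming a plain-text corpus with
# one document per line; the file name and topic count are placeholders.
import numpy as np

doc_topic, topic_word, vocabulary = learn_topics('corpus.txt', 25)
n_top_words = 10
for k, dist in enumerate(topic_word):
    top = np.array(vocabulary)[np.argsort(dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(k, ' '.join(top)))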
def fit_reuters():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, n_iter=50)
    model.save_model(protocol=2)
def exampleLDAExecution():
    # document-term matrix
    X = data.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    # the vocab
    vocab = data.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))
    # titles for each story
    titles = data.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))
    doc_id = 0
    word_id = 3117
    print("doc id: {} word id: {}".format(doc_id, word_id))
    print("-- count: {}".format(X[doc_id, word_id]))
    print("-- word : {}".format(vocab[word_id]))
    print("-- doc  : {}".format(titles[doc_id]))
    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
def RunLDA(FileLocation, NumDocs, NumTopics):
    # To build a term-document matrix, we read in every file and collect
    # the body of each article.
    fin = open(FileLocation, 'r')
    # We will need to store the urls when we make the tdm
    UrlArray = []
    # Create the TDM object. It also removes stopwords.
    TDM = TermDocumentMatrix(simple_tokenize_remove_stopwords)
    # Add each article to the TDM object and record its url.
    # This is a massive corpus, so we only process the first NumDocs articles.
    for i in range(NumDocs):
        Article = fin.next()
        UrlArray.append(re.split(r'\t', Article)[0])
        TDM.add_doc(re.split(r'\t', Article)[1])
    # TDM.rows() is an iterable, which numpy cannot take as input directly.
    X = list(TDM.rows())
    # Oddly enough, the first row of the .rows() iterable returns a
    # list of all of the words used. Think of it as a header row.
    Vocab = X[0]
    # Create a 2d list containing the rows of the document matrix.
    Y = []
    for i in range(len(X) - 1):
        Y.append(X[i + 1])
    # Create the LDA model object. (The original hard-coded 20 topics;
    # here the NumTopics argument is actually used.)
    model = LDA(n_topics=NumTopics, n_iter=1500, random_state=1)
    # Make a numpy array to use as input.
    Yarray = np.asarray(Y)
    # Fit the model. This process is similar to scikit-learn's algorithms.
    model.fit(Yarray)
    TopicWords = []
    topic_word = model.topic_word_
    n_top_words = 50
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(Vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        TopicWords.append(topic_words)
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
def test_lda(model_file, dict_file, dbs_dir):
    """
    Run testing and display test results.

    Args:
        model_file(str): saved model file to continue training on
        dict_file(str): dict_file path to load dictionary from
        dbs_dir(str): dir path to load databases from
    """
    assert os.path.isdir(dbs_dir), "Invalid data directory path"
    lda = LDA()
    print('Loading existing dictionary...')
    lda.load_dict_from_disk(dict_file)
    test_results = list()
    # Iterate over all data and test the model
    for root, dirs, files in os.walk(dbs_dir):
        # Iterate over sub-dirs
        for d in files:
            db = Database()
            # Load database object from saved file
            db.load_from_disk(dbs_dir + '/' + d)
            # Add database to model
            lda.add_database(db)
            # Test model
            test_results.append(lda.test(model_file, db_name=db.get_name()))
            lda.remove_database(db.get_name())
            del db
            gc.collect()
    # Print test results
    for idx, i in enumerate(test_results):
        print('Test results for database {}'.format(idx))
        for j in i[0]:
            print('Topic: {} has probability: {}'.format(j[0], j[1]))
        for counter, k in enumerate(i[1]):
            print('Topic {} has topic-coherence score: {}'.format(counter, k[1]))
    print(lda.model.show_topics())
class TestLDA(unittest.TestCase):
    """
    Test the LDA class.
    """

    def setUp(self):
        self.description_csv = pd.read_csv("docs/description.csv")
        self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
        self.dp = DocsPreprocessor()
        self.description_1000 = self.dp.process(self.description_1000_csv)
        self.lda = LDA(self.description_1000)

    def test_1(self):
        k_values, coherence_values, topic_list = \
            self.lda.compute_coherence_values(5, 20, 5)
def appDescriptionsLDA():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()
    print X
    print vocab
    print titles
    # (The original evaluated X.shape and X.sum() without printing them.)
    print X.shape
    print X.sum()
    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
def main(dataset, compute_errors=True, plot_boundaries=True, save=False):
    """
    Fit the four models on the training set and, depending on the
    parameters, compute the accuracy and plot the decision boundary.

    args :: dataset : array(str)
    """
    filename = "data/" + dataset + ".train"
    x_train, y_train = read_file(filename)
    filename = "data/" + dataset + ".test"
    x_test, y_test = read_file(filename)
    models = [
        LDA(x_train, y_train),
        LinearRegression(x_train, y_train),
        LogisiticRegression(x_train, y_train),
        QDA(x_train, y_train)
    ]
    model_names = ["LDA", "LinearRegression", "LogisiticRegression", "QDA"]
    for i, model in enumerate(models):
        model_name = model_names[i]
        model.fit()
        if compute_errors:
            y_pred_train = [model.predict(x) for x in x_train]
            e = accuracy(y_train, y_pred_train)
            print("Accuracy with " + model_name)
            print("Training: ", e)
            y_pred_test = [model.predict(x) for x in x_test]
            e = accuracy(y_test, y_pred_test)
            print("Testing: ", e)
        if plot_boundaries:
            model.plot_boundary()
            plt.scatter(model.x[:, 0], model.x[:, 1], c=model.y, s=1)
            title = "Model: " + model_name + ", " + dataset + " (Train)"
            plt.title(title)
            if save:
                plt.savefig("figs/" + model_name + "_" + dataset[-1] + "Train.png")
            plt.show()
            model.plot_boundary()
            plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test, s=1)
            title = "Model: " + model_name + ", " + dataset + " (Test)"
            plt.title(title)
            if save:
                plt.savefig("figs/" + model_name + "_" + dataset[-1] + "Test.png")
            plt.show()
def output_reuters():
    model = LDA()
    model.load_model()
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    corpus.load_vocabulary(menu_path + 'reuters.tokens')
    corpus.load_context(menu_path + 'reuters.titles')
    topic_word = model.topic_word(n_top_word=10, corpus=corpus)
    print '\n'.join(map(str, topic_word))
    document_topic = model.document_topic(n_top_topic=1, corpus=corpus,
                                          limit=10)
    print '\n'.join(map(str, document_topic))
def __init__(self, block, num_blocks, n_classes, lda_args):
    super(ResNet, self).__init__()
    self.lda_args = lda_args
    if self.lda_args:
        # LDA head
        self.in_planes = 32
        self.out_planes = 16
    else:
        # Usual CNN with CE loss
        self.in_planes = 32
        self.out_planes = 16  # 64
    self.conv1 = nn.Conv2d(3, self.in_planes, kernel_size=3, stride=1,
                           padding=1, bias=False)
    self.bn1 = nn.BatchNorm2d(self.in_planes)
    self.layer1 = self._make_layer(block, self.out_planes * 1, num_blocks[0], stride=1)
    self.layer2 = self._make_layer(block, self.out_planes * 2, num_blocks[1], stride=2)
    self.layer3 = self._make_layer(block, self.out_planes * 4, num_blocks[2], stride=2)
    self.layer4 = self._make_layer(block, self.out_planes * 8, num_blocks[3], stride=2)
    if self.lda_args:
        self.lda = LDA(n_classes, lda_args['lamb'])
    else:
        self.linear = nn.Linear(self.out_planes * 8 * block.expansion, n_classes)
train_re_path = '../data/train/relevant.txt'
train_ir_path = '../data/train/irrelevant.txt'
test2_ir_path = '../data/test2/irrelevant.txt'
test2_re_path = '../data/test2/relevant.txt'
test1_ir_path = '../data/test1/irrelevant.txt'
test1_re_path = '../data/test1/relevant.txt'

words_dict, idx_dict = create_dict(full_path, stop_words)
train_X = load_data(train_path)
train_X = word_to_idx(train_X, words_dict)

lda = LDA(5)
lda.fit(train_X, words_dict.items())

test1_re_X = load_data(test1_re_path)
test1_re_X = word_to_idx(test1_re_X, words_dict)
test1_ir_X = load_data(test1_ir_path)
test1_ir_X = word_to_idx(test1_ir_X, words_dict)
test2_re_X = load_data(test2_re_path)
test2_re_X = word_to_idx(test2_re_X, words_dict)
test2_ir_X = load_data(test2_ir_path)
test2_ir_X = word_to_idx(test2_ir_X, words_dict)

target_X = load_data(target_path)
target_X = word_to_idx(target_X, words_dict)
class Classifier:
    def __init__(self, options):
        self.options = options
        self.file_dir = "./build/"
        self.labels = []
        self.corpus = []
        if not os.path.exists(self.file_dir):
            os.makedirs("build")
        self.stopwords = self.get_stopwords()

    def train_model(self, filename, model_name):
        self.create_label_corpus(filename)
        self.lda = LDA(self.options.K, self.options.alpha, self.options.beta)
        self.lda.set_corpus(self.labelset, self.corpus, self.labels)
        print "M=%d, V=%d, L=%d, K=%d" % (len(self.corpus), len(self.lda.vocas),
                                          len(self.labelset), self.options.K)
        for index in range(self.options.iteration):
            sys.stderr.write("-- %d : %.4f\n" % (index, self.lda.perplexity()))
        print "perplexity : %.4f" % self.lda.perplexity()
        phi = self.lda.phi()
        theta = self.lda.theta()
        for k, label in enumerate(self.labelset):
            print "\n-- label %d : %s" % (k, label)
            for w in numpy.argsort(-phi[k]):
                print "%s: %f" % (self.lda.vocas[w], phi[k, w])
        self.save_model(model_name)

    def lemmatize(self, string):
        return WordNetLemmatizer().lemmatize(string, pos='v')

    def create_label_corpus(self, filename):
        with open(os.path.join(self.file_dir, filename)) as model:
            for row in model:
                label_class_list = []
                selected_words = []
                split_row = row.lower().split("\"|\"")
                label_array = self.filter_split(split_row[0])
                # Create unicode label_type
                for label_type in self.filter_split(split_row[1]):
                    label_class_list.append(unicode(label_type, "utf-8"))
                for word in label_array:
                    lemmatized_word = self.lemmatize(word)
                    if (word not in self.stopwords and len(word) > 2
                            and not bool(re.search(r'\d', lemmatized_word))
                            and lemmatized_word not in self.stopwords):
                        selected_words.append(lemmatized_word)
                self.corpus.append(selected_words)
                self.labels.append(label_class_list)
        self.labelset = list(set(reduce(list.__add__, self.labels)))

    def filter_split(self, label):
        return re.sub(r'\W+', ' ', label).split()

    def classify(self, model_name, label):
        self.lda = self.load_model(model_name)
        self.stopwords = self.get_stopwords()
        result_vector = numpy.zeros(self.lda.K)
        phi = self.lda.phi()
        label_array = self.filter_split(label)
        for word in label_array:
            for r in range(self.lda.K):
                lemmatized_word = self.lemmatize(word)
                if (word not in self.stopwords and len(word) > 2
                        and not bool(re.search(r'\d', lemmatized_word))
                        and lemmatized_word not in self.stopwords
                        and lemmatized_word in self.lda.vocas_id):
                    result_vector[r] += phi[r, self.lda.vocas_id[lemmatized_word]]
        result = 0
        if result_vector.argmax() == 0:
            v = max(n for n in result_vector if n != max(result_vector))
            result = numpy.argwhere(result_vector == v)
        else:
            result = result_vector.argmax()
        print self.lda.labelmap.keys()[self.lda.labelmap.values().index(result)]
        return self.lda.labelmap.keys()[self.lda.labelmap.values().index(result)]

    def save_model(self, model_name):
        with open(os.path.join(self.file_dir, model_name + "_trained.p"), 'wb') as model_file:
            pickle.dump(self.lda, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self, model_name):
        if os.path.isfile(os.path.join(self.file_dir, model_name + "_trained.p")):
            with open(os.path.join(self.file_dir, model_name + "_trained.p"), 'rb') as model_file:
                return pickle.load(model_file)
        else:
            # The original referenced an undefined `file_dir` here.
            print "Trained model for %s is not found in \"%s\" directory" % (model_name, self.file_dir)
            print "Please train the model"

    def get_stopwords(self):
        return Stopword(self.file_dir).get_stopwords()
alpha = 1. / 5
lmda = 1. / 2

# Top-down LDA data: sample documents from the generative model.
X = sp.coo_matrix((M, V)).tolil()
beta = np.zeros((K, V))
for k in range(K):
    beta[k, :] = np.random.dirichlet(np.ones(V) * lmda)
for d in range(M):
    theta_d = np.random.dirichlet(np.ones(K) * alpha)
    zs = np.random.choice(np.arange(K), size=numwords, p=theta_d)
    for z in zs:
        w_n = np.random.choice(np.arange(V), p=beta[z, :])
        X[d, w_n] += 1

lda = LDA(alpha=alpha, lmda=lmda, nr_em_epochs=10)

print "No collapsing"
props, word_props, log_Xsno, perpno = lda.gibbs_sample(X)
# plt.plot(range(len(log_Xs)), log_Xs, '*-')
# plt.show()
#
# plt.plot(range(len(perp)), perp, 'o-')
# plt.show()
print "Perplexity:"
print perpno
print "logX:"
print log_Xsno

print "All collapsed"
def clustering_measure(self, n_cluster):
    km = KMeans(n_cluster)
    km.fit(self.doc_features)
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(self.doc_class, km.labels_))

def cross_validation(self):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        self.doc_features, self.doc_class, test_size=0.4, random_state=0)
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))

if __name__ == '__main__':
    # load dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')
    # train lda
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    #lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)
    # evaluate lda
    eval = Evaluator(dataset, lda)
    eval.clustering_measure(n_cluster=5)
    eval.cross_validation()
# bow = bow / bow.sum(axis=1)[:, None]
# Number of docs
n_docs = bow.shape[0]
# Number of unique words in the vocabulary
n_vocab = bow.shape[1]
# Number of dimensions in a single word vector
n_units = 256
# Number of topics
n_topics = 20
batchsize = 128
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        [4, 5, 8, 9, ],
    ]
    #~ l = [
    #~     [1, 2, 3, 4, 5, ],
    #~     [1, 2, 3, 4, 5, ],
    #~     [6, 7, 8, 9, 10, ],
    #~     [6, 7, 8, 9, 10, ],
    #~     [1, 2, 3, 4, 5, ],
    #~ ]
    for d in l:
        yield d

if __name__ == "__main__":
    # Format: (name of analysis, number of topics, alpha, beta, burn,
    #          length, dataset feature vector iterator)
    given = [
        #~ ("test", 2, 0.1, 0.1, 100, 10, test_data),
        ('state_of_the_union', 5, 0.1, 0.1, 499, 1, state_of_the_union),
    ]
    for settings in given:
        analysis = LDA(settings[1], settings[2], settings[3],
                       settings[4], settings[5])
        print(settings[0])
        analysis.run_analysis(settings[6]())
        analysis.print_topics(10)
        with io.open('results_%s.json' % (settings[0]), 'w',
                     encoding='utf-8', errors='ignore') as f:
            f.write(unicode(json.dumps(analysis.log_likelihoods)))
from lda import LDA, _doc_update, _slice_doc_update
import pickle
import numpy as np
np.seterr(divide="raise")
from data.datafile import AADataFile

dfile = pickle.load(open("data/datafile.pkl"))
dt = dfile.DT
te = dfile.TE
f = te.toarray().argmax(axis=1)

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)
perp, b, g, e = lda.fit(dt, f)
def __init__(self, n_topics, alpha=0.1, beta=0.01, random_state=0):
    # Forward the actual arguments; the original passed the literal
    # defaults, silently ignoring whatever the caller supplied.
    LDA.__init__(self, n_topics, alpha=alpha, beta=beta,
                 random_state=random_state)
from lda import LDA
from settings import demo_dataset_dir

model = LDA()
model.train(dataset_dir=demo_dataset_dir, output_final_result=True)
def run(self, mode, cntStatus=True, saveVid=False, showVid=True):
    lbp = lbp_feature()
    lda = LDA(75, 3)
    if mode == 'predict':
        lda.load_model(settings.STATICFILES_DIRS[0])
    self.video.set(cv2.cv.CV_CAP_PROP_POS_MSEC, 0)
    kernel = np.ones((10, 10), np.uint8)
    lanes = [[] for x in range(self.totalLane)]
    totalCars = [0] * self.totalLane
    num_car_detect = 0
    self.timer = threading.Timer(5.0, self.progress)
    self.timer.start()
    while self.video.isOpened():
        ret, frame = self.video.read()
        if not ret:
            break
        frameOrigin = deepcopy(frame)
        res = frame
        self.num_frame += 1
        for point in self.lanePoints:
            cv2.polylines(frame, [point], True, (0, 255, 0), 3)
        filteredFrame = cv2.GaussianBlur(frame, (5, 5), 0)
        if self.fgMask is None:
            self.fgMask = self.subtractor.apply(filteredFrame, -1)
        test = deepcopy(self.fgMask)
        self.fgMask = self.subtractor.apply(filteredFrame, self.fgMask, -1)
        self.fgMask = cv2.dilate(self.fgMask, kernel, iterations=1)
        self.fgMask = cv2.erode(self.fgMask, kernel, iterations=1)
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE,
                                       np.ones((30, 30), np.uint8))
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE,
                                       np.ones((30, 30), np.uint8))
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_OPEN,
                                       np.ones((5, 5), np.uint8))
        tempMask = deepcopy(self.fgMask)
        carImg = cv2.bitwise_and(frameOrigin, frameOrigin, mask=self.fgMask)

        # Section: tracking and detection
        contours, hrc = cv2.findContours(tempMask, cv2.RETR_EXTERNAL,
                                         cv2.CHAIN_APPROX_TC89_KCOS)
        isIn = [False] * self.totalLane
        laneObj = [[] for x in range(self.totalLane)]
        outLane = [[] for x in range(self.totalLane)]
        for obj in contours:
            moment = cv2.moments(obj)
            if moment['m00'] == 0:
                continue
            cx = int(moment['m10'] / moment['m00'])
            cy = int(moment['m01'] / moment['m00'])
            pX, pY, w, h = cv2.boundingRect(obj)
            isNotLane = True
            for numLane in range(len(self.laneContours)):
                if cv2.pointPolygonTest(self.laneContours[numLane][0],
                                        (cx, cy), False) == 1:
                    car_object = {"centroid": (cx, cy + h / 2),
                                  "origin": (pX, pY),
                                  "height": h, "width": w}
                    laneObj[numLane].append(car_object)
                    isNotLane = False
                    break
            if isNotLane:
                for numLane in range(len(self.laneContours)):
                    lanePoint = self.lanePoints[numLane]
                    if (cx >= lanePoint[3][0][0] and cx <= lanePoint[2][0][0]
                            and cy >= lanePoint[3][0][1]
                            and cy <= lanePoint[3][0][1] + 50):
                        car_object = {"centroid": (cx, cy + h / 2),
                                      "origin": (pX, pY),
                                      "height": h, "width": w}
                        outLane[numLane].append(car_object)
        for numLane in range(len(self.laneContours)):
            for i in outLane[numLane]:
                diffRange = 50
                foundedObj = None
                for j in lanes[numLane]:
                    diff = (math.fabs(j["point"][0][0] - i["centroid"][0]) +
                            math.fabs(j["point"][0][1] - i["centroid"][1]))
                    if diff < diffRange:
                        diffRange = diff
                        foundedObj = j
                if foundedObj is not None:
                    totalCars[numLane] += 1
                    originX = i["origin"][0]
                    originY = i["origin"][1]
                    crop_img = frameOrigin[originY:originY + i["height"],
                                           originX:originX + i["width"]]
                    normal_image = cv2.resize(crop_img, (64, 64))
                    num_car_detect += 1
                    if mode == 'train':
                        directory = (settings.STATICFILES_DIRS[0] +
                                     'main_app/media/train_image/')
                        if not os.path.exists(directory):
                            os.makedirs(directory)
                        cv2.imwrite(directory + 'car' + str(num_car_detect) + '.png',
                                    crop_img)
                    if mode == 'predict':
                        height, width, channels = crop_img.shape
                        size_data = [height / 100.0, width / 100.0,
                                     height * width / 10000.0]
                        lbp.read_image(normal_image)
                        feature = lbp.extract_feature(size_data[0], size_data[1],
                                                      size_data[2])
                        answer = int(lda.predict(feature))
                        save_type(self.video_name, answer, self.num_frame)
                        if answer == 2:
                            self.typeCar["small"] += 1
                        elif answer == 1:
                            self.typeCar["medium"] += 1
                        else:
                            self.typeCar["large"] += 1
                        print answer
                        file_name = self.video_name[:self.video_name.find('.avi')] + '.png'
                        path = (settings.STATICFILES_DIRS[0] +
                                'main_app/media/result_image/' +
                                str(num_car_detect) + '-' + str(answer) + '-' + file_name)
                        cv2.imwrite(path, crop_img)
                    lanes[numLane].remove(foundedObj)
            for i in lanes[numLane]:
                i["stat"] = False
            for i in laneObj[numLane]:
                diffRange = 50
                foundedObj = None
                for j in lanes[numLane]:
                    diff = (math.fabs(j["point"][0][0] - i["centroid"][0]) +
                            math.fabs(j["point"][0][1] - i["centroid"][1]))
                    if diff < diffRange:
                        diffRange = diff
                        foundedObj = j
                if foundedObj is not None:
                    foundedObj["point"].insert(0, i["centroid"])
                    foundedObj["stat"] = True
                else:
                    lanes[numLane].append({"point": [i["centroid"]],
                                           "stat": True})
            tempLane = []
            for i in lanes[numLane]:
                if i["stat"]:
                    tempLane.append(i)
                    cv2.polylines(res, np.int32([i["point"]]), False,
                                  (0, 255, 255), 3)
            lanes[numLane] = tempLane

        # Section: draw track lines
        for obj in contours:
            moment = cv2.moments(obj)
            if moment['m00'] == 0:
                continue
            pX, pY, w, h = cv2.boundingRect(obj)
            cx = int(moment['m10'] / moment['m00'])
            cy = int(moment['m01'] / moment['m00']) + h / 2
            cv2.circle(res, (cx, cy), 3, (0, 0, 255), 4)
            distance = []
            for numLane in range(len(self.laneContours)):
                distance.append(cv2.pointPolygonTest(
                    self.laneContours[numLane][0], (cx, cy), False))
            for numLane in range(len(self.laneContours)):
                if distance[numLane] == 1:
                    isIn[numLane] = True
                    cv2.rectangle(res, (pX, pY), (pX + w, pY + h),
                                  (0, 255, 255), 2)
                    if self.lanes[numLane]["is_empty"]:
                        self.lanes[numLane]["is_empty"] = False
                        self.lanes[numLane]["pts"].append((cx, cy))
                    else:
                        self.lanes[numLane]["pts"].insert(0, (cx, cy))
                    break
            else:
                cv2.rectangle(res, (pX, pY), (pX + w, pY + h),
                              (255, 255, 0), 2)
        for i in range(0, self.totalLane):
            if isIn[i]:
                if showVid:
                    pass
            else:
                # Reset the lane track. (The original indexed with the stale
                # loop variable numLane here instead of i.)
                self.lanes[i]["is_empty"] = True
                self.lanes[i]["pts"] = []
        if cntStatus:
            cv2.putText(res, 'lane1: ' + str(totalCars[0]), (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(res, 'lane2: ' + str(totalCars[1]), (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (125, 0, 255), 2)
            cv2.putText(res, 'truck/bus: ' + str(self.typeCar["large"]), (400, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(res, 'small car: ' + str(self.typeCar["medium"]), (400, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(res, 'motorcycle: ' + str(self.typeCar["small"]), (400, 110),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
        if showVid:
            resMask = cv2.bitwise_and(frame, frame, mask=~self.fgMask)
            cv2.imshow('frame', res)
            if cv2.waitKey(5) & 0xFF == ord('q'):
                cv2.imwrite('tesf.png', frameOrigin)
                cv2.imwrite('tesM.png', self.fgMask)
                break
    self.timer.cancel()
    update_progress(self.video_name, self.num_frame, self.total_frame)
    print totalCars
    self.video.release()
    cv2.destroyAllWindows()
    print self.typeCar
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    startTime = datetime.datetime.now()
    documentList = ["./texts/t11.txt", "./texts/t22.txt"]
    # documentList = ["./texts/test_shak1.txt"]
    # documentList = ["./texts/shak.txt"]
    totalDocs = len(documentList)
    # Add language check on init and load correct stopwords list
    stopList = stopwords.words('english')
    # Init weighting libraries
    TfIdf = TfIdf(documentList, stopList)
    LSI = LSI(documentList, stopList)
    LDA = LDA(documentList, stopList)
    # Loop reading queries from stdin. (The original checked for EOF after
    # running the query, so an empty line was processed once before exiting.)
    print "Ready "
    while 1:
        try:
            line = sys.stdin.readline()
            if not line:
                break
            print (TfIdf.runQuery(line))
            print (LSI.runQuery(line))
            print (LDA.runQuery(line))
        except KeyboardInterrupt:
            break
import pickle

from lda import LDA
from data.datafile import AADataFile

dfile = pickle.load(open("data/datafile.pkl"))
dt = dfile.DT
te = dfile.TE

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)
perp, b, g = lda.fit(dt)