def test_parameters(): man_articles = article.db_get_all_articles("NOT trainingcluster = 0") flat_thread = EvalThread(clustering.ClusterMaker.FLAT, man_articles, 1) # aggl_thread = EvalThread(clustering.ClusterMaker.AGGL,man_articles,2,(19,22),(30,70)) flat_thread.start() # aggl_thread.start() '''
def db_load_articles(self, top_article_id, db=None):
    """
    Loads all the ArticleModel instances relating to this cluster, and the
    cluster's own term model.

    Sets ``self.articles`` to the models loaded for every article whose
    ``clusterid`` equals ``self.id``, and ``self.model`` to an ArticleModel
    populated from this cluster's rows in ``clusterswithterms``.

    Parameters:
        top_article_id -- accepted for interface compatibility; it is not
                          used by this method.
        db             -- optional open database connection. When omitted
                          (or falsy) a connection is opened here and closed
                          again before returning; a connection supplied by
                          the caller is left open.
    """
    article_list = article.db_get_all_articles("`clusterid` = %d" % self.id)
    # Only the models are needed here; the inverted index is discarded.
    models, _ = articleunigram.db_load_models(article_list)
    self.articles = models.values()

    # Only close the connection if this method opened it.
    close_db = False
    if not db:
        close_db = True
        db = database.connect_to_database()

    try:
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        try:
            # %d forces self.id to be numeric before it reaches the SQL text.
            terms_query = "SELECT `term`,`tf`,`count` FROM clusterswithterms WHERE `clusterid` = %d" % (self.id)
            cur.execute(terms_query)
            db_rows = cur.fetchall()
            self.model = articleunigram.ArticleModel()
            self.model.from_db_values(db_rows)
        finally:
            # Release the cursor even when the query or model loading fails.
            cur.close()
    finally:
        if close_db:
            db.close()
import MySQLdb.cursors
from retrieval import article
import sys

__author__="Dan"
__date__ ="$16-Mar-2010 21:40:24$"

# Script entry point: loads recently published, already-clustered articles and
# their term models, then builds a ClusterModel for every cluster referenced by
# each approved merger proposal.
if __name__ == "__main__":
    # Date cut-offs relative to now, formatted as YYYY-MM-DD strings for SQL.
    todays_date = datetime.datetime.today()
    two_week_difference = datetime.timedelta(days=-14)
    week_difference = datetime.timedelta(days=-7)
    # NOTE(review): named "two_day" but the offset is three days - confirm which is intended.
    two_day_difference = datetime.timedelta(days=-3)
    week_ago_date_mysql = (todays_date + week_difference).strftime("%Y-%m-%d")
    days_ago_date_mysql = (todays_date + two_day_difference).strftime("%Y-%m-%d")
    # Articles that have a cluster, have stored terms, and were published
    # within the last week.
    article_list = article.db_get_all_articles("NOT isnull(clusterid) AND EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id) AND `datepublished` >= '%(weekago)s'" % {"weekago" : week_ago_date_mysql})
    models,inv_index = articleunigram.db_load_models(article_list)
    mergers_query = "SELECT * FROM proposedmergers WHERE `approved` = 1"
    db = database.connect_to_database()
    # DictCursor so that rows can be addressed by column name below.
    cur = db.cursor(cursorclass = MySQLdb.cursors.DictCursor)
    clusterer = ClusterMaker(inv_index = inv_index)
    cur.execute(mergers_query)
    result = cur.fetchall()
    # Maps merger id -> list of ClusterModel objects, one per cluster in that merger.
    proposed_mergers = dict()
    for row in result:
        # The cursor is reused here; this is safe because the merger rows
        # were already fetched in full into `result`.
        clusters_query = "SELECT `clusterid` FROM proposedmergers_clusters WHERE `mergerid` = %d" % (row['id'])
        cur.execute(clusters_query)
        clusters = cur.fetchall()
        proposed_mergers[row['id']] = [clustermodel.ClusterModel(inv_index = inv_index,id = cl['clusterid'],load_full_article = True) for cl in clusters]
print "Setting cluster type to " + str(self.cluster_type) def get_cluster_info(self,e = None): if e: self.selected_cluster = int(self.lb_clusters.GetSelection(int(e.GetSelection()))) print str(self.selected_cluster) def start_clustering(self,e = None): if e: clusterer = clustering.ClusterMaker(cluster_type = self.cluster_type) cluster_thread = clustering.ClustererThread(clusterer,self.models,self.inv_index,float(self.tc_threshold.GetValue()),self.cluster_method,on_change=self.update) cluster_thread.start() # clusterer.cluster_articles(self.models,self.inv_index,float(self.tc_threshold.GetValue()),self.cluster_method,on_change=self.update) # clusterer.run(self.models,self.inv_index,float(self.tc_threshold.GetValue()),self.cluster_method,on_change=self.update,name = "whatever") def update(self,clusters=None,new_cluster=None,remove_model=None): if clusters: self.tc_num_clusters.SetValue(str(len(clusters))) self.tc_num_clusters.Update() self.clusters = clusters if new_cluster: self.lb_clusters.AppendAndEnsureVisible(str(new_cluster)) self.lb_clusters.Update() theVar = 1 app = wx.App(False) man_articles = article.db_get_all_articles("NOT trainingcluster = 0") models,inv_index = articleunigram.db_load_models(man_articles) frame = ClusterMonitor(None,'Article Browser',models,inv_index) app.MainLoop()
print " FP: " + str(evaluator.totalfalsepos) print " FN: " + str(evaluator.totalfalseneg) print "F-measure: " + str(evaluator.f_measure(4)) print "Purity: " + str(evaluator.purity(clusters, models)) print "NMI: " + str(evaluator.nmi(clusters, models)) time_taken[i] = (t2 - t1) print "Time elapsed: " + str(time_taken[i]) print "Saving..." evaluator.db_save(thresh, t_weight, l_weight, clustering.ClusterMaker.cluster_types[self.cluster_type], clustering.ClusterMaker.cluster_methods[cl_method], time_taken[i]) i += 1 print "Thread " + str(self.threadnum) + " complete, " + str(i) + " tests" if __name__ == '__main__': # test_parameters() t1 = time.time() man_articles = article.db_get_all_articles() models, inv_index = db_load_models(man_articles) # models, inv_index = count_terms_and_store(man_articles, store=True, title_weight=19, print_steps=True) clusterer = clustering.ClusterMaker(cluster_type=clustering.ClusterMaker.FLAT) clusters, models = clusterer.cluster_articles(models, inv_index, threshold=0.40, cluster_method=clustering.ClusterMaker.SNG_LNK) for cluster in clusters.values(): if cluster.db_save(): print "Cluster " + cluster.get_description() + " saved sucessfully" else: print "Cluster " + cluster.get_description() + " NOT saved sucessfully" t2 = time.time() ''' evaluator = Evaluator() print "Rand Index: " + str(evaluator.rand_index(clusters, models)) print " TP: " + str(evaluator.totaltruepos) print " TN: " + str(evaluator.totaltrueneg)
from retrieval import article
from unigrammodel import termcounter
from auxfunctions import chunks
import time

__author__="Dan"
__date__ ="$16-Mar-2010 21:36:23$"


def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, leading_weight=1, stoplist_file="stop_words", num_threads=1):
    """
    Split article_list into chunks and build one TermCounter per chunk.

    The counters are only constructed here; the caller is responsible for
    calling start() on each of them.

    Parameters:
        article_list   -- sized collection of articles to be counted.
        store, title_weight, print_steps, leading_weight, stoplist_file
                       -- passed through unchanged to every TermCounter.
        num_threads    -- number of chunks (and hence counters) to aim for.

    Returns a list of TermCounter objects, one per chunk.
    Raises ZeroDivisionError if num_threads is 0.
    """
    # Ceiling division: rounding down produced more chunks than num_threads
    # whenever the article count was not an exact multiple, and a chunk size
    # of 0 when there were fewer articles than threads.
    chunk_sizes = max(1, int(-(-len(article_list) // num_threads)))
    # assumes chunks.chunks(seq, n) returns consecutive pieces of length n - TODO confirm
    article_lists = chunks.chunks(article_list, chunk_sizes)
    print("Counting " + str(len(article_lists)) + " article lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads")
    return [
        termcounter.TermCounter(a_list, store, title_weight, print_steps, leading_weight, stoplist_file)
        for a_list in article_lists
    ]


if __name__ == "__main__":
    # test_parameters()
    t1 = time.time()
    article_list = article.db_get_all_articles("`source` = 'express'")
    # models,inv_index = db_load_models(man_articles)
    # models, inv_index = articleunigram.count_terms_and_store(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words")
    term_counters = start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, stoplist_file="stop_words", num_threads=2)
    for counter in term_counters:
        counter.start()
    # Disabled timing report, kept as an inert string literal.
    """
    print "Counted " + str(len(models)) + " articles, took",
    t2 = time.time()
    print str(t2 - t1) + " seconds."
    """
from retrieval import article
from unigrammodel import termcounter
from auxfunctions import chunks
import time

__author__="Dan"
__date__ ="$16-Mar-2010 21:36:23$"


def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, leading_weight=1, stoplist_file="stop_words", num_threads=1):
    """
    Split article_list into chunks and build one TermCounter per chunk.

    The counters are only constructed here; the caller is responsible for
    calling start() on each of them.

    Parameters:
        article_list   -- sized collection of articles to be counted.
        store, title_weight, print_steps, leading_weight, stoplist_file
                       -- passed through unchanged to every TermCounter.
        num_threads    -- number of chunks (and hence counters) to aim for.

    Returns a list of TermCounter objects, one per chunk.
    Raises ZeroDivisionError if num_threads is 0.
    """
    # Ceiling division: rounding down produced more chunks than num_threads
    # whenever the article count was not an exact multiple, and a chunk size
    # of 0 when there were fewer articles than threads.
    chunk_sizes = max(1, int(-(-len(article_list) // num_threads)))
    # assumes chunks.chunks(seq, n) returns consecutive pieces of length n - TODO confirm
    article_lists = chunks.chunks(article_list, chunk_sizes)
    print("Counting " + str(len(article_lists)) + " article lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads")
    return [
        termcounter.TermCounter(a_list, store, title_weight, print_steps, leading_weight, stoplist_file)
        for a_list in article_lists
    ]


if __name__ == "__main__":
    # test_parameters()
    t1 = time.time()
    # Only articles that have no stored terms yet.
    article_list = article.db_get_all_articles("NOT EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id)")
    # models,inv_index = db_load_models(man_articles)
    # models, inv_index = articleunigram.count_terms_and_store(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words")
    term_counters = start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, stoplist_file="stop_words", num_threads=4)
    for counter in term_counters:
        counter.start()
    # Disabled timing report, kept as an inert string literal.
    """
    print "Counted " + str(len(models)) + " articles, took",
    t2 = time.time()
    print str(t2 - t1) + " seconds."
    """