示例#1
0
def test_parameters():
    """Run a flat-clustering parameter evaluation over the manually
    clustered training articles in a background thread."""
    training_articles = article.db_get_all_articles("NOT trainingcluster = 0")
    evaluation = EvalThread(clustering.ClusterMaker.FLAT, training_articles, 1)
    evaluation.start()
    '''
示例#2
0
    def db_load_articles(self, top_article_id, db=None):
        """Load the ArticleModel instances and the term model for this cluster.

        Populates ``self.articles`` with the models of every article whose
        clusterid matches ``self.id``, and rebuilds ``self.model`` from the
        ``clusterswithterms`` rows.  ``top_article_id`` is accepted but not
        used in this method.  If ``db`` is not supplied a connection is
        opened and closed locally; a caller-supplied connection is left open.
        """
        article_list = article.db_get_all_articles("`clusterid` = %d" % self.id)
        models, inv_index = articleunigram.db_load_models(article_list)
        self.articles = models.values()

        close_db = False
        if not db:
            close_db = True
            db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        try:
            # self.id is interpolated with %d (an int), so this is not an
            # injection vector, but any query/model error used to leak the
            # cursor and the locally opened connection.
            terms_query = "SELECT `term`,`tf`,`count` FROM clusterswithterms WHERE `clusterid` = %d" % (self.id)
            cur.execute(terms_query)
            db_rows = cur.fetchall()
            self.model = articleunigram.ArticleModel()
            self.model.from_db_values(db_rows)
        finally:
            # Close the cursor (and the connection, when we opened it) even
            # if the query or model construction raises.
            cur.close()
            if close_db:
                db.close()
import MySQLdb.cursors
from retrieval import article
import sys

__author__="Dan"
__date__ ="$16-Mar-2010 21:40:24$"

if __name__ == "__main__":
    # Build MySQL-formatted cutoff dates relative to today (negative deltas,
    # added below, move the date backwards).
    todays_date = datetime.datetime.today()
    two_week_difference = datetime.timedelta(days=-14)
    week_difference = datetime.timedelta(days=-7)
    # NOTE(review): named "two_day" but spans 3 days -- confirm which was intended.
    two_day_difference = datetime.timedelta(days=-3)
    week_ago_date_mysql = (todays_date + week_difference).strftime("%Y-%m-%d")
    days_ago_date_mysql = (todays_date + two_day_difference).strftime("%Y-%m-%d")
    
    # Load every already-clustered article that has term vectors and was
    # published within the last week, then build its unigram models.
    article_list = article.db_get_all_articles("NOT isnull(clusterid) AND EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id) AND `datepublished` >= '%(weekago)s'" % {"weekago" : week_ago_date_mysql})
    models,inv_index = articleunigram.db_load_models(article_list)

    # Fetch approved merger proposals and, for each, load the full cluster
    # models it references, keyed by merger id.
    mergers_query = "SELECT * FROM proposedmergers WHERE `approved` = 1"
    db = database.connect_to_database()
    cur = db.cursor(cursorclass = MySQLdb.cursors.DictCursor)
    clusterer = ClusterMaker(inv_index = inv_index)
    cur.execute(mergers_query)
    result = cur.fetchall()
    proposed_mergers = dict()
    for row in result:
        clusters_query = "SELECT `clusterid` FROM proposedmergers_clusters WHERE `mergerid` = %d" % (row['id'])
        cur.execute(clusters_query)
        clusters = cur.fetchall()
        proposed_mergers[row['id']] = [clustermodel.ClusterModel(inv_index = inv_index,id = cl['clusterid'],load_full_article = True) for cl in clusters]
示例#4
0
            print "Setting cluster type to " + str(self.cluster_type)
    
    def get_cluster_info(self,e = None):
        if e:
            self.selected_cluster = int(self.lb_clusters.GetSelection(int(e.GetSelection())))
            print str(self.selected_cluster)
    
    def start_clustering(self,e = None):
        """Kick off a background clustering run using the current UI settings."""
        if not e:
            return
        threshold = float(self.tc_threshold.GetValue())
        clusterer = clustering.ClusterMaker(cluster_type = self.cluster_type)
        worker = clustering.ClustererThread(clusterer, self.models, self.inv_index,
                                            threshold, self.cluster_method,
                                            on_change=self.update)
        worker.start()
            
    def update(self,clusters=None,new_cluster=None,remove_model=None):
        """Refresh the monitor widgets when the clustering thread reports
        progress: a new cluster count and/or a newly created cluster.
        (remove_model is accepted but not acted on here.)"""
        if clusters:
            count_text = str(len(clusters))
            self.tc_num_clusters.SetValue(count_text)
            self.tc_num_clusters.Update()
            self.clusters = clusters
        if new_cluster:
            label = str(new_cluster)
            self.lb_clusters.AppendAndEnsureVisible(label)
            self.lb_clusters.Update()

# --- Script body: launch the wx cluster-monitor GUI. ---
theVar = 1  # NOTE(review): appears unused in this view -- confirm before removing
# False disables wx's redirection of stdout/stderr to a window.
app = wx.App(False)
# Load every manually clustered training article and its term models.
man_articles = article.db_get_all_articles("NOT trainingcluster = 0")
models,inv_index = articleunigram.db_load_models(man_articles)
# NOTE(review): the frame is never Show()n here -- presumably done in
# ClusterMonitor.__init__; verify.
frame = ClusterMonitor(None,'Article Browser',models,inv_index)
app.MainLoop()
示例#5
0
                        print "        FP:   " + str(evaluator.totalfalsepos)
                        print "        FN:   " + str(evaluator.totalfalseneg)
                        print "F-measure:    " + str(evaluator.f_measure(4))
                        print "Purity:       " + str(evaluator.purity(clusters, models))
                        print "NMI:          " + str(evaluator.nmi(clusters, models))
                        time_taken[i] = (t2 - t1)
                        print "Time elapsed: " + str(time_taken[i])
                        print "Saving..."
                        evaluator.db_save(thresh, t_weight, l_weight, clustering.ClusterMaker.cluster_types[self.cluster_type], clustering.ClusterMaker.cluster_methods[cl_method], time_taken[i])
                        i += 1
        print "Thread " + str(self.threadnum) + " complete, " + str(i) + " tests"
                    
if __name__ == '__main__':
    # test_parameters()
    # End-to-end flat clustering run: load all articles, build term models,
    # cluster with single-link at threshold 0.40, and persist each cluster.
    t1 = time.time()
    man_articles = article.db_get_all_articles()
    models, inv_index = db_load_models(man_articles)
    # models, inv_index = count_terms_and_store(man_articles, store=True, title_weight=19, print_steps=True)
    clusterer = clustering.ClusterMaker(cluster_type=clustering.ClusterMaker.FLAT)
    clusters, models = clusterer.cluster_articles(models, inv_index, threshold=0.40, cluster_method=clustering.ClusterMaker.SNG_LNK)
    for cluster in clusters.values():
        if cluster.db_save():
            print "Cluster " + cluster.get_description() + " saved sucessfully"
        else:
            print "Cluster " + cluster.get_description() + " NOT saved sucessfully"
    t2 = time.time()
    '''
    evaluator = Evaluator()
    print "Rand Index: " + str(evaluator.rand_index(clusters, models))
    print "        TP: " + str(evaluator.totaltruepos)
    print "        TN: " + str(evaluator.totaltrueneg)
示例#6
0
from retrieval import article
from unigrammodel import termcounter
from auxfunctions import chunks
import time

__author__="Dan"
__date__ ="$16-Mar-2010 21:36:23$"

def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, leading_weight=1, stoplist_file="stop_words", num_threads=1):
    chunk_sizes = int(float(len(article_list)) / float(num_threads))
    article_lists = chunks.chunks(article_list,chunk_sizes)
    print "Counting " + str(len(article_lists)) + " article lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads"
    term_counters = list()
    for a_list in article_lists:
        term_counters.append(termcounter.TermCounter(a_list,store,title_weight,print_steps,leading_weight,stoplist_file))
    return term_counters

if __name__ == "__main__":
    # test_parameters()
    # Count terms for every Express article using two worker threads.
    t1 = time.time()
    article_list = article.db_get_all_articles("`source` = 'express'")
    # models,inv_index = db_load_models(man_articles)
    # models, inv_index = articleunigram.count_terms_and_store(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words")
    term_counters = start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words",num_threads=2)
    for counter in term_counters:
        counter.start()
    # The timing report below is deliberately disabled (string literal);
    # `models` is not defined in this path.
    """
    print "Counted " + str(len(models)) + " articles, took",
    t2 = time.time()
    print str(t2 - t1) + " seconds."
    """
from retrieval import article
from unigrammodel import termcounter
from auxfunctions import chunks
import time

__author__="Dan"
__date__ ="$16-Mar-2010 21:36:23$"

def start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True, leading_weight=1, stoplist_file="stop_words", num_threads=1):
    chunk_sizes = int(float(len(article_list)) / float(num_threads))
    article_lists = chunks.chunks(article_list,chunk_sizes)
    print "Counting " + str(len(article_lists)) + " article lists of size " + str(chunk_sizes) + " using " + str(num_threads) + " threads"
    term_counters = list()
    for a_list in article_lists:
        term_counters.append(termcounter.TermCounter(a_list,store,title_weight,print_steps,leading_weight,stoplist_file))
    return term_counters

if __name__ == "__main__":
    # test_parameters()
    # Count terms for every article that has no rows in articleswithterms
    # yet, using four worker threads.
    t1 = time.time()
    article_list = article.db_get_all_articles("NOT EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id)")
    # models,inv_index = db_load_models(man_articles)
    # models, inv_index = articleunigram.count_terms_and_store(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words")
    term_counters = start_counting_with_threads(article_list, store=True, title_weight=19, print_steps=True,stoplist_file="stop_words",num_threads=4)
    for counter in term_counters:
        counter.start()
    # The timing report below is deliberately disabled (string literal);
    # `models` is not defined in this path.
    """
    print "Counted " + str(len(models)) + " articles, took",
    t2 = time.time()
    print str(t2 - t1) + " seconds."
    """