def db_load_models(articles): models = dict() inv_index = InvertedIndex() num_articles = len(articles) db = database.connect_to_database() list_of_ids = ",".join([str(article.id) for article in articles]) query = "SELECT articleid,term,tf,count FROM articleswithterms WHERE articleid IN (%s)" % (list_of_ids) cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor) num_results = cur.execute(query) rows = cur.fetchall() model_values = dict() for row in rows: if model_values.has_key(row['articleid']): model_values[row['articleid']].append(row) else: model_values[row['articleid']] = [row] for article, index in zip(articles, xrange(num_articles)): if article.has_been_counted: print "Loading article " + str(index + 1) + "/" + str(num_articles) new_model = ArticleModel(article) new_model.from_db_values(model_values[article.id]) all_terms = new_model.terms.all_terms() inv_index.add_term_occurences(all_terms, article.id) models[article.id] = new_model cur.close() db.close() return models, inv_index
def get_all_training_clusters():
    """Return a TrainingCluster for every row of the training_clusters table."""
    db = connect_to_database()
    c = db.cursor()
    query = "SELECT * TRAINING" if False else "SELECT * FROM training_clusters"
    c.execute(query)
    result = c.fetchall()
    # FIX: cursor was previously leaked
    c.close()
    return [TrainingCluster(tc[0], tc[1]) for tc in result]
def db_save(self,thresh,t_weight,l_weight,cl_type,cl_method,timeelapsed): db = database.connect_to_database() cur = db.cursor() query = "INSERT INTO clusterresults (`threshold`,`titleweight`, \ `clustermethod`,`leadingweight`, \ `clustertype`,`truepos`, \ `trueneg`,`falsepos`, \ `falseneg`,`fmeasure`, \ `purity`,`nmi`,`timeelapsed`,\ `rand`) \ VALUES(%.3f,%d,\ '%s',%d,\ '%s',%d,\ %d,%d,\ %d,%.4f,\ %.4f,%.4f,%.4f,\ %.4f)" % \ (thresh,t_weight,cl_method,l_weight,cl_type,\ self.totaltruepos,self.totaltrueneg,\ self.totalfalsepos,self.totalfalseneg,\ self.fmeasure,self.purity_measure,self.nmi_val, \ timeelapsed,self.rand) print query cur.execute(query) db.commit() cur.close()
def db_load_article(self, id):
    """Fetch the `articles` row with the given primary key as a dict.

    Returns None if no such row exists.
    FIX: parameterised to remove the SQL injection via str(id) concatenation.
    """
    db = connect_to_database()
    cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    cur.execute("SELECT * FROM articles WHERE `id` = %s", (id,))
    result = cur.fetchone()
    cur.close()
    return result
def proportion_of_topics_covered(self, source, start_date=None, end_date=None, db=None):
    """Return the fraction of all known topics that *source* published on.

    db_load_articles is called for its side effect of populating
    source.article_list for the given date window (its return value was
    never used by the original either).
    FIX: guards against ZeroDivisionError when no clusters are known.
    """
    if not db:
        db = database.connect_to_database()
    total_topics = len(self.all_clusters)
    if total_topics == 0:
        # no topics at all -> nothing can be covered
        return 0.0
    source.db_load_articles(start_date, end_date)
    clusters_of_source = set(article.cluster_id for article in source.article_list)
    return float(len(clusters_of_source)) / float(total_topics)
def get_all_sources():
    """Build a NewsSource for every row of the sources table.

    The DB connection is intentionally left open: it is handed to each
    NewsSource.
    """
    db = database.connect_to_database()
    c = db.cursor()
    source_query = "SELECT id FROM sources"
    c.execute(source_query)
    results = c.fetchall()
    # FIX: cursor was previously leaked
    c.close()
    return [NewsSource(result[0], db) for result in results]
def db_set_classification(self,cluster_id): self.classification = cluster_id db = connect_to_database() c = db.cursor() query = "UPDATE articles SET trainingcluster = " + str(cluster_id) + " WHERE articles.id = " + str(self.id) print "executing " + query c.execute(query) if db.affected_rows() != 0: return True else: return False
def db_create_row(self, db=None):
    """Insert a new `clusters` row for this cluster and remember its id on self."""
    if not db:
        db = database.connect_to_database()
    top = self.top_article[0].article
    # db_escape already quotes the title; fall back to a literal 'None'
    desc = str(database.db_escape(top.title))
    if not desc or desc == "NULL":
        desc = "'None'"
    insert_sql = "INSERT INTO clusters (`description`,`toparticleid`) VALUES(%s,%d)" % (desc, top.id)
    cursor = db.cursor()
    cursor.execute(insert_sql)
    self.id = cursor.lastrowid
    db.commit()
    cursor.close()
def db_load_counts(self, article_id=None, db=None):
    # Load this model's per-term (tf, count) pairs from articleswithterms.
    # If no Article is attached yet, build a stub from article_id.
    if not self.article and article_id:
        self.article = Article(id=article_id)
    self.terms = TermList()
    if self.article.has_been_counted:
        if not db:
            db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
        # article.id is an int, so the interpolation cannot inject
        query = "SELECT term,tf,count FROM articleswithterms WHERE articleid = %d" % (self.article.id)
        cur.execute(query)
        rows = cur.fetchall()
        # term -> (tf, count, original-spellings placeholder)
        self.terms.set_terms([(row['term'], (float(row['tf']), int(row['count']), None)) for row in rows])
    # NOTE(review): the flattened source makes the original indentation of
    # this return ambiguous — assumed to run for uncounted articles too
    # (returning an empty term list). Also note the cursor is never closed.
    return self.terms.all_terms()
def get_all_clusters(start_date=None, end_date=None, sql_conds=None, inv_index=None, num_threads=1): """ Gets all the clusters given some conditions """ db = database.connect_to_database() query = "SELECT * FROM clusterswitharticles" cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor) linker = "WHERE" if start_date: query += " %s `earliest` >= '%s'" % (linker, str(start_date)) linker = "AND" if end_date: query += " %s `latest` <= '%s'" % (linker, str(start_date)) linker = "AND" if sql_conds: query += " %s %s" % (linker, sql_conds) cur.execute(query) results = cur.fetchall() cur.close() db.close() chunk_sizes = int(float(len(results)) / float(num_threads)) cluster_lists = chunks.chunks(results, chunk_sizes) print "Loading " + str(len(cluster_lists)) + " cluster lists of size " + str(chunk_sizes) + " using " + str( num_threads ) + " threads" cluster_loaders = list() for c_list in cluster_lists: cluster_loaders.append(clusterloader.ClusterLoader(c_list, inv_index)) for loader in cluster_loaders: loader.start() # print "Getting as far as starting the threads" while threading.activeCount() > 1: print "Waiting for " + str(threading.activeCount() - 1) + " threads to finish:\n\t", for loader in cluster_loaders: print loader.getName() + ": cluster " + loader.get_current_cluster(), print "." time.sleep(5) clusters = dict() for loader in cluster_loaders: clusters.update(loader.get_clusters()) loader = None cluster_loaders = None """ num_clusters = len(results) for row, index in zip(results, xrange(num_clusters)): print "Loading cluster " + str(index) + "/" + str(num_clusters) new_cl = ClusterModel() new_cl.from_db_values(row,inv_index=inv_index) clusters[new_cl.id] = new_cl """ return clusters
def __init__(self, id=None, description=None):
    """Create or look up a training cluster.

    - id and description given: trust the caller, no DB access.
    - only id: load the description from the training_clusters table.
    - only description: insert a new row and record its auto id.
    FIX: both queries are now parameterised (id/description were
    concatenated straight into the SQL).
    NOTE(review): the INSERT is not committed here — presumably autocommit;
    confirm against connect_to_database.
    """
    if id:
        if description:
            self.id = id
            self.description = description
        else:
            self.db = connect_to_database()
            # must be a cluster already on the database
            c = self.db.cursor()
            c.execute("SELECT * FROM training_clusters WHERE `id` = %s", (id,))
            result = c.fetchone()
            if result:
                self.id = id
                self.description = result[1]
    elif description:
        self.db = connect_to_database()
        # obviously doesn't exist yet, let's make it
        self.description = description
        c = self.db.cursor()
        c.execute("INSERT INTO training_clusters(`description`) VALUES (%s)", (description,))
        self.id = c.lastrowid
def db_get_all_articles(sql_conditions=None):
    """Fetch an Article for every row of `articles`, optionally filtered.

    sql_conditions must already be valid SQL — it is appended verbatim after
    WHERE. None of that easy-to-use stuff; it just complicates things.
    """
    db = connect_to_database()
    # a DictCursor is required so Article(db_values=...) can read columns by name
    cursor = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    sql = "SELECT * FROM articles"
    if sql_conditions:
        sql = sql + " WHERE " + sql_conditions
    cursor.execute(sql)
    rows = cursor.fetchall()
    articles = [Article(db_values=row) for row in rows]
    cursor.close()
    return articles
def db_load(self, id, db=None): close_db = False if not db: close_db = True db = database.connect_to_database() query = "SELECT * FROM clusters WHERE `id` = %d" % (id) print query cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor) cur.execute(query) result = cur.fetchone() print result self.from_db_values(result) cur.close() self.db_load_articles(db) if close_db: db.close()
def db_save(self, db=None):
    """Persist this article model's term counts.

    For each stemmed term: upsert its unigram_stems row (inserting or
    bumping totalcount), queue any unseen original spellings for a batched
    INSERT IGNORE into orig_terms, and queue the article/term link row for
    a batched upsert into article_terms. Finally marks the article counted.

    FIX: the orig-term branch previously re-executed the SELECT by accident
    (a leftover from a commented-out INSERT). Queries touching raw term
    text are now parameterised instead of string-interpolated.
    """
    if not db:
        db = database.connect_to_database()
    cur = db.cursor()
    orig_term_rows = list()     # (stemid, term) tuples for orig_terms
    article_term_rows = list()  # (stemid, articleid, count, tf) tuples
    for term in self.terms:
        tf = self.terms[term][0]
        count = int(self.terms[term][1])
        orig_terms = self.terms[term][2]
        # Check if the stem exists already
        cur.execute("SELECT id FROM unigram_stems WHERE `term` = %s", (term,))
        # if the term already exists we update, otherwise insert it
        if db.affected_rows() == 0:
            cur.execute("INSERT INTO unigram_stems (`term`,`totalcount`) VALUES(%s,%s)", (term, count))
            term_id = int(cur.lastrowid)
        else:
            row = cur.fetchone()
            term_id = int(row[0])
            cur.execute("UPDATE unigram_stems SET totalcount = totalcount + %s WHERE `id` = %s", (count, term_id))
        # Queue original (unstemmed) spellings that aren't stored yet
        for orig_term in orig_terms:
            cur.execute("SELECT * FROM orig_terms WHERE `term` = %s", (orig_term,))
            if db.affected_rows() == 0:
                orig_term_rows.append((term_id, orig_term))
        # Queue the article/term link row
        article_term_rows.append((term_id, self.article.id, count, tf))
    if orig_term_rows:
        cur.executemany("INSERT IGNORE INTO orig_terms (`stemid`,`term`) VALUES (%s,%s)", orig_term_rows)
    if article_term_rows:
        cur.executemany(
            "INSERT INTO article_terms (`stemid`,`articleid`,`count`,`tf`) VALUES (%s,%s,%s,%s) "
            "ON DUPLICATE KEY UPDATE `count` = VALUES(count), `tf` = VALUES(tf)",
            article_term_rows)
    cur.execute("UPDATE articles SET `counted` = 1 WHERE `id` = %s", (self.article.id,))
    cur.close()
    db.commit()
def db_load_top_article(self, toparticleid, inv_index=None):
    """Loads the top article of the cluster from the DB.

    Builds an ArticleModel from the article's term rows and stores
    (model, similarity-to-cluster) in self.top_article.
    FIX: the cursor and connection were previously leaked.
    """
    if not self.inv_index:
        self.inv_index = inv_index
    db = database.connect_to_database()
    cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    # both values are cast to int, so the interpolation cannot inject
    article_query = "SELECT * FROM articleswithterms WHERE `clusterid` = %d AND `articleid` = %d" % (int(self.id), int(toparticleid))
    cur.execute(article_query)
    db_values = cur.fetchall()
    cur.close()
    db.close()
    article_model = articleunigram.ArticleModel()
    article_model.from_db_values(db_values)
    comp = cosinecomparer.CosineComparer(self.inv_index)
    self.top_article = (article_model, self.similarity(comp, article_model, self.GRP_AVG))
def db_load_articles(self, top_article_id, db=None):
    """Loads all the ArticleModel instances relating to this cluster.

    Also rebuilds self.model from the cluster's aggregated term rows.
    NOTE: top_article_id is unused but kept for caller compatibility.
    """
    members = article.db_get_all_articles("`clusterid` = %d" % self.id)
    models, inv_index = articleunigram.db_load_models(members)
    self.articles = models.values()
    owns_db = not db
    if owns_db:
        db = database.connect_to_database()
    cursor = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    terms_query = "SELECT `term`,`tf`,`count` FROM clusterswithterms WHERE `clusterid` = %d" % (self.id)
    cursor.execute(terms_query)
    term_rows = cursor.fetchall()
    self.model = articleunigram.ArticleModel()
    self.model.from_db_values(term_rows)
    cursor.close()
    if owns_db:
        db.close()
def count_terms_and_store(articles, store=True, title_weight=19, print_steps=False, leading_weight=1, stoplist_file="../stop_words"): if store: db = database.connect_to_database() models = dict() inv_index = InvertedIndex() num_articles = len(articles) for art, index in zip(articles, xrange(len(articles))): if print_steps: print "Counting terms of article " + str(index + 1) + "/" + str(num_articles) model = ArticleModel(art, title_weight, leading_weight, stoplist_file=stoplist_file) for term in model.count_terms(): inv_index.add_term_ocurrence(term, model.article.id) if store: model.db_save(db) models[art.id] = model """ This isn't needed anymore total_counts = sum([model.total_term_counts for model in models.values()]) if store: cur = db.cursor() query = "UPDATE terms_global SET totaltermcounts = totaltermcounts + %d WHERE id = 1" % (total_counts) cur.execute(query) """ return models, inv_index
def db_save(self, db=None, suppress_save = False): """ Makes a number of queries. 1. First of all, sets the description and top article ID of the cluster. 2. Updates all the articles to have this cluster ID. 3. Update the cluster terms by looking for the terms in the articles and making an average of the counts and frequencies. Then, """ # First make sure that saving is required. If no articles have been added to this cluster, # then it does not need to be saved. if not self.edited: return True,"Not Edited" try: if not db: db = database.connect_to_database() cur = db.cursor() if self.id: cluster_up_query = "UPDATE clusters SET `description` = %s,`toparticleid` = %d WHERE `id` = %d" % (database.db_escape(self.top_article[0].article.title),self.top_article[0].article.id, self.id) else: self.db_create_row(db) # now we update the articles with their clusters # queries = [str(a_mod.get_cluster_update_query()) for a_mod in self.articles] # article_up_query = ";".join(queries) article_up_query = "UPDATE articles SET clusterid = %d WHERE id IN (%s)" % (self.id,",".join([str(a_mod.article.id) for a_mod in self.articles])) print "Running query " + article_up_query + " to update the articles in cluster " + str(self.id) success = True for term in self.model.terms: message = "Success!" 
# first get the stem id of the term query = "SELECT `id` FROM unigram_stems WHERE `term` = %s" % (database.db_escape(term)) cur.execute(query) result = cur.fetchone() term_id = result[0] tf = self.model.terms[term][0] count = int(self.model.terms[term][1]) # first check for the existence of the link #query = "SELECT `clusterid` FROM cluster_terms WHERE `stemid` = %d AND clusterid = %d" % (term_id,self.id) #print query #cur.execute(query) #result = cur.fetchone() #if not result: # Make the link table row query = "INSERT INTO cluster_terms (`stemid`,`clusterid`,`count`,`tf`) VALUES(%d,%d,%d,%.3f) ON DUPLICATE KEY UPDATE `count` = VALUES(`count`),`tf` = VALUES(`tf`)" % (term_id, self.id, count, tf) # print query #else: # query = "UPDATE cluster_terms SET `count` = %d, `tf` = %.3f WHERE `stemid` = %d AND %clusterid = %d" % (count,tf,term_id,self.id) cur.execute(query) cur.execute(article_up_query) cur.close() cur = db.cursor() if cluster_up_query: cur.execute(cluster_up_query) cur.close() except MySQLdb.Error, e: message = "Error %d: %s" % (e.args[0], e.args[1]) success = False
__author__ = "Dan"
__date__ = "$16-Mar-2010 21:40:24$"

if __name__ == "__main__":
    # Merge script: loads recent clustered articles, then the approved
    # cluster mergers, and prepares each merger's member clusters.
    todays_date = datetime.datetime.today()
    two_week_difference = datetime.timedelta(days=-14)
    week_difference = datetime.timedelta(days=-7)
    # NOTE(review): named "two_day" but is actually 3 days — confirm intent
    two_day_difference = datetime.timedelta(days=-3)
    week_ago_date_mysql = (todays_date + week_difference).strftime("%Y-%m-%d")
    days_ago_date_mysql = (todays_date + two_day_difference).strftime("%Y-%m-%d")
    # articles from the last week that are clustered and have counted terms
    article_list = article.db_get_all_articles("NOT isnull(clusterid) AND EXISTS (SELECT articleid FROM articleswithterms WHERE articleid = articles.id) AND `datepublished` >= '%(weekago)s'" % {"weekago" : week_ago_date_mysql})
    models, inv_index = articleunigram.db_load_models(article_list)
    mergers_query = "SELECT * FROM proposedmergers WHERE `approved` = 1"
    db = database.connect_to_database()
    cur = db.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    clusterer = ClusterMaker(inv_index=inv_index)
    cur.execute(mergers_query)
    result = cur.fetchall()
    proposed_mergers = dict()
    for row in result:
        # load the member clusters of each approved merger
        clusters_query = "SELECT `clusterid` FROM proposedmergers_clusters WHERE `mergerid` = %d" % (row['id'])
        cur.execute(clusters_query)
        clusters = cur.fetchall()
        proposed_mergers[row['id']] = [clustermodel.ClusterModel(inv_index=inv_index, id=cl['clusterid'], load_full_article=True) for cl in clusters]
    for p in proposed_mergers.items():
        print str(p[0]) + ": merging " + ",".join([str(cl) for cl in p[1]]) + "..."
        # merge everything into the first listed cluster
        new_cluster = p[1][0]
        # NOTE(review): the actual merge of clusters_to_merge into
        # new_cluster does not appear in this chunk — looks truncated.
        clusters_to_merge = p[1][1:]
def db_save_cluster(self):
    """Run this model's cluster UPDATE query against the DB.

    FIX: the cursor was previously leaked.
    NOTE(review): no commit here — presumably autocommit or a caller
    commits later; confirm against the other db_save methods.
    """
    db = database.connect_to_database()
    cur = db.cursor()
    cur.execute(self.get_cluster_update_query())
    cur.close()