예제 #1
0
 def db_load_counts(self, article_id=None, db=None):
     '''
     Load the stored term counts of this model's article from the database.

     If the model has no article yet and ``article_id`` is given, an
     ``Article`` with that id is attached and the term list is reset first.

     article_id -- id of the article to attach when none is set yet.
     db         -- open database connection; a new one is created through
                   ``database.connect_to_database()`` when omitted.

     Returns ``self.terms.all_terms()`` after loading, or ``None`` when
     there is no article or the article has not been counted yet.
     '''
     if not self.article and article_id:
         self.article = Article(id=article_id)
         self.terms = TermList()
     # Without an article there is nothing to look up; previously this case
     # crashed with an AttributeError on ``None.has_been_counted``.
     if not self.article or not self.article.has_been_counted:
         return None
     if not db:
         db = database.connect_to_database()
     cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
     try:
         # Let the driver bind the id instead of formatting it into the SQL.
         cur.execute(
             "SELECT term,tf,count FROM articleswithterms WHERE articleid = %s",
             (self.article.id,))
         rows = cur.fetchall()
     finally:
         # Release the server-side cursor even if the query fails.
         cur.close()
     # Each entry is (tf, count, original terms); the originals are not
     # part of this query, hence the trailing None.
     self.terms.set_terms(
         [(row['term'], (float(row['tf']), int(row['count']), None)) for row in rows])
     return self.terms.all_terms()
예제 #2
0
 def __init__(self, article=None, title_weight=19, leading_weight=1, normalizing_freq=True, stoplist_file=None):
     '''
     An articlemodel has a TermList which counts terms.

     article          -- the article to model; may be attached later.
     title_weight     -- weight applied to terms found in the title.
     leading_weight   -- weight applied to terms at the start of the text.
     normalizing_freq -- when true, counts are divided by the weighted
                         total number of terms instead of by 1.
     stoplist_file    -- file handed to ``Stoplist``; no stoplist is
                         created when omitted.
     '''
     self.article = article if article else None
     # The weights and the stoplist are set unconditionally: a model can be
     # created empty and receive its article later, and must not be left
     # without these attributes in that case.
     self.stoplist = Stoplist(stoplist_file) if stoplist_file else None
     self.title_weight = title_weight
     self.leading_weight = leading_weight
     self.terms = TermList()
     self.total_term_counts = 0
     self.cluster_id = 0
     self.normalizing_freq = normalizing_freq
예제 #3
0
class ArticleModel(object):
    '''
    Term model of a single article, represented by a vector of terms.

    The model holds the article, a TermList whose entries are
    ``(tf, count, original terms)`` tuples keyed by term, and the id of
    the cluster the article is assigned to. It can count the terms of the
    article text and load or store the result through a MySQLdb connection.
    '''

    # A word is a run of letters that may contain one inner apostrophe.
    _WORD_PATTERN = re.compile(r"[a-z]+'?[a-z]+", re.IGNORECASE)
    # Fraction of the body terms, counted from the start, that receive
    # ``leading_weight`` instead of weight 1.
    _LEADING_FRACTION = 0.02

    def __init__(self, article=None, title_weight=19, leading_weight=1, normalizing_freq=True, stoplist_file=None):
        '''
        An articlemodel has a TermList which counts terms.

        article          -- the article to model; may be attached later.
        title_weight     -- weight applied to terms found in the title.
        leading_weight   -- weight applied to terms at the start of the text.
        normalizing_freq -- when true, counts are divided by the weighted
                            total number of terms instead of by 1.
        stoplist_file    -- file handed to ``Stoplist``; no stoplist is
                            created when omitted.
        '''
        self.article = article if article else None
        # Set unconditionally so that a model which receives its article
        # later (db_load_counts, from_db_row) can still count terms.
        self.stoplist = Stoplist(stoplist_file) if stoplist_file else None
        self.title_weight = title_weight
        self.leading_weight = leading_weight
        self.terms = TermList()
        self.total_term_counts = 0
        self.cluster_id = 0
        self.normalizing_freq = normalizing_freq

    def from_db_values(self, db_values):
        '''
        Feed every row of ``db_values`` to ``from_db_row``.
        '''
        for row in db_values:
            self.from_db_row(row)

    def from_db_row(self, db_row, load_article=True):
        '''
        Update the model from one database row.

        db_row       -- mapping that may contain "articleid" and/or "term"
                        (together with "tf" and "count").
        load_article -- when false the row is ignored entirely.
                        NOTE(review): this flag also gates the term entry,
                        not only the article; confirm that this is intended.
        '''
        if not self.article and "articleid" in db_row and load_article:
            self.article = Article(id=db_row['articleid'])
        if "term" in db_row and load_article:
            # The original terms are not part of the row, hence None.
            self.terms[db_row['term']] = (float(db_row['tf']), int(db_row['count']), None)

    def db_load_counts(self, article_id=None, db=None):
        '''
        Load the stored term counts of this model's article from the database.

        If the model has no article yet and ``article_id`` is given, an
        ``Article`` with that id is attached and the term list is reset first.

        article_id -- id of the article to attach when none is set yet.
        db         -- open database connection; a new one is created through
                      ``database.connect_to_database()`` when omitted.

        Returns ``self.terms.all_terms()`` after loading, or ``None`` when
        there is no article or the article has not been counted yet.
        '''
        if not self.article and article_id:
            self.article = Article(id=article_id)
            self.terms = TermList()
        # Without an article there is nothing to look up; previously this
        # case crashed with an AttributeError on ``None.has_been_counted``.
        if not self.article or not self.article.has_been_counted:
            return None
        if not db:
            db = database.connect_to_database()
        cur = db.cursor(cursorclass=MySQLdb.cursors.SSDictCursor)
        try:
            cur.execute(
                "SELECT term,tf,count FROM articleswithterms WHERE articleid = %s",
                (self.article.id,))
            rows = cur.fetchall()
        finally:
            # Release the server-side cursor even if the query fails.
            cur.close()
        self.terms.set_terms(
            [(row['term'], (float(row['tf']), int(row['count']), None)) for row in rows])
        return self.terms.all_terms()

    def set_cluster(self, cluster_id, save=False):
        '''
        Remember the cluster this article belongs to.

        NOTE: ``save`` is accepted for interface compatibility but is
        ignored; call ``db_save_cluster`` to persist the assignment.
        '''
        self.cluster_id = cluster_id

    def get_cluster_update_query(self):
        '''
        Return the SQL statement that stores ``cluster_id`` for the article.
        '''
        return "UPDATE articles SET `clusterid` = %d WHERE `id` = %d" % (self.cluster_id, self.article.id)

    def db_save_cluster(self):
        '''
        Write the cluster assignment of the article to the database.

        Opens its own connection, and commits and closes the cursor the same
        way ``db_save`` does, so the update is not lost on connections
        without autocommit.
        '''
        db = database.connect_to_database()
        cur = db.cursor()
        try:
            cur.execute(self.get_cluster_update_query())
        finally:
            cur.close()
        db.commit()

    def get_terms(self):
        '''
        Return the term list of the model.
        '''
        return self.terms

    def set_terms(self, term_list):
        '''
        Replace the term list of the model.
        '''
        self.terms = term_list

    def count_terms(self, normalizing=True):
        '''
        Count the terms of the article body and title.

        This is a generator: nothing is counted until it is iterated. It
        yields the result of ``self.terms.count_term`` for every body term
        and then for every title term. Title terms are weighted with
        ``title_weight``, the first 2% of the body terms with
        ``leading_weight`` and the remaining body terms with 1.

        NOTE: ``normalizing`` is not used; normalisation is controlled by
        the ``normalizing_freq`` value passed to the constructor.
        '''
        article_text = termproc.replace_html_chars(self.article.article_text)

        terms = self._WORD_PATTERN.findall(article_text)
        title_terms = self._WORD_PATTERN.findall(self.article.title)

        # A model built without a stoplist file simply keeps every term.
        stoplist = getattr(self, 'stoplist', None)
        if stoplist is not None:
            terms = stoplist.apply(terms)
            title_terms = stoplist.apply(title_terms)

        self.total_term_counts = len(terms) + (len(title_terms) * self.title_weight)
        if self.normalizing_freq:
            self.denominator = self.total_term_counts
        else:
            self.denominator = 1

        leading_cutoff = int(len(terms) * self._LEADING_FRACTION)
        for position, term in enumerate(terms):
            weight = self.leading_weight if position <= leading_cutoff else 1
            yield self.terms.count_term(term, self.denominator, weight)
        for term in title_terms:
            yield self.terms.count_term(term, self.denominator, self.title_weight)

    def copy(self):
        '''
        Creates a new model with the same terms, without the article.

        NOTE(review): the new model references the same term list object as
        this one rather than a copy of it; confirm whether callers rely on
        that sharing before changing it.
        '''
        exact_copy = ArticleModel()
        exact_copy.set_terms(self.terms)
        return exact_copy

    def db_save(self, db=None):
        '''
        Store the counted terms of the article in the database.

        For every term the stem row in ``unigram_stems`` is created or its
        total count is increased, unknown original terms are queued for
        ``orig_terms``, and a link row is queued for ``article_terms``. The
        queued rows are written with one statement per table, the article
        is flagged as counted and the transaction is committed.

        db -- open database connection; a new one is created through
              ``database.connect_to_database()`` when omitted.

        All values are bound by the driver: terms may contain an
        apostrophe, which used to break the hand-built statements.
        '''
        if not db:
            db = database.connect_to_database()
        cur = db.cursor()
        try:
            orig_term_rows = []
            article_term_rows = []
            for term in self.terms:
                entry = self.terms[term]
                tf = entry[0]
                count = int(entry[1])
                # Entries loaded from the database carry None here.
                orig_terms = entry[2] or ()

                # Create the stem, or add this article's count to it.
                cur.execute("SELECT id FROM unigram_stems WHERE `term` = %s", (term,))
                row = cur.fetchone()
                if row is None:
                    cur.execute(
                        "INSERT INTO unigram_stems (`term`,`totalcount`) VALUES(%s,%s)",
                        (term, count))
                    term_id = int(cur.lastrowid)
                else:
                    term_id = int(row[0])
                    cur.execute(
                        "UPDATE unigram_stems SET totalcount = totalcount + %s WHERE `id` = %s",
                        (count, term_id))

                # Queue the original terms that are not stored yet.
                for orig_term in orig_terms:
                    cur.execute("SELECT 1 FROM orig_terms WHERE `term` = %s LIMIT 1", (orig_term,))
                    if cur.fetchone() is None:
                        orig_term_rows.append((term_id, orig_term))

                # Queue the link row; tf keeps three decimals as before.
                article_term_rows.append((term_id, self.article.id, count, round(float(tf), 3)))

            if orig_term_rows:
                placeholders = ",".join(["(%s,%s)"] * len(orig_term_rows))
                values = tuple(value for row in orig_term_rows for value in row)
                cur.execute(
                    "INSERT IGNORE INTO orig_terms (`stemid`,`term`) VALUES " + placeholders,
                    values)
            if article_term_rows:
                placeholders = ",".join(["(%s,%s,%s,%s)"] * len(article_term_rows))
                values = tuple(value for row in article_term_rows for value in row)
                cur.execute(
                    "INSERT INTO article_terms (`stemid`,`articleid`,`count`,`tf`) VALUES "
                    + placeholders
                    + " ON DUPLICATE KEY UPDATE `count` = VALUES(count), `tf` = VALUES(tf)",
                    values)
            cur.execute("UPDATE articles SET `counted` = 1 WHERE `id` = %s", (self.article.id,))
        finally:
            # Do not leak the cursor when a statement fails.
            cur.close()
        db.commit()

    def print_terms(self):
        '''
        Print the terms through the term list's own ``print_terms``.
        '''
        self.terms.print_terms()

    def print_info(self):
        '''
        Print the article title, the number of unique terms and the total
        term count.
        '''
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Article title: " + self.article.title)
        print("     UNIQUE TERMS: " + str(len(self.terms)))
        print(" TOTAL TERM COUNT: " + str(self.total_term_counts))