Exemplo n.º 1
0
def update_classifier(topic_id):
    """Retrain and save the binary classifier for *topic_id*.

    Fetches up to 100 of the most recently found training documents linked
    to the topic. If both positive (strength 1) and negative (strength 0)
    samples are present, trains a fresh BinaryClassifier, saves it, and
    NULLs the cached strengths of all non-training docs for the topic so
    they are reclassified lazily on next display.

    Returns a human-readable status message (training output, or an
    explanation of why training was skipped).
    """
    from classifier import BinaryClassifier, doc2text
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    # Parameterized query: topic_id must never be interpolated into the
    # SQL string (SQL injection); the driver binds %s placeholders safely.
    query = '''
         SELECT D.*, M.strength
         FROM docs D, docs2topics M
         WHERE M.doc_id = D.doc_id AND M.topic_id = %s AND M.is_training = 1
         ORDER BY D.found_date DESC
         LIMIT 100
    '''
    app.logger.debug(query)
    cur.execute(query, (topic_id,))
    rows = cur.fetchall()
    docs = [doc2text(row) for row in rows]
    classes = [row['strength'] for row in rows]
    msg = ''
    if 0 in classes and 1 in classes:
        # Capture the classifier's stdout chatter so it can be returned
        # to the caller instead of leaking to the console.
        with Capturing() as output:
            clf = BinaryClassifier(topic_id)
            clf.train(docs, classes)
            clf.save()
        msg += '\n'.join(output)
        # We could reclassify all documents now, but we postpone this step
        # until the documents are actually displayed (which may be never
        # for sufficiently old ones). So we simply undefine the topic
        # strengths to mark that no classification has yet been made.
        query = "UPDATE docs2topics SET strength = NULL WHERE topic_id = %s AND is_training < 1"
        app.logger.debug(query)
        cur.execute(query, (topic_id,))
        db.commit()
    else:
        msg = "classifier not yet ready because only positive or negative training samples"
    return msg
Exemplo n.º 2
0
def classify(rows, topic, topic_id):
    """Score *rows* with the saved classifier for *topic_id* and cache results.

    Converts each row dict to text, runs the topic's trained
    BinaryClassifier over the batch, and upserts each document's
    positive-class probability into docs2topics.strength.

    Returns the list of positive-class (p_ham) probabilities, one per row,
    in the same order as *rows*. *topic* is accepted for interface
    compatibility but unused here.
    """
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
    # Parameterized upsert (was string-formatted: SQL injection risk);
    # hoisted out of the loop since the statement text never changes.
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength = %s
    '''
    for i, (p_spam, p_ham) in enumerate(probs):
        doc_id = rows[i]['doc_id']
        app.logger.debug("doc {} classified for topic {}: {}".format(
            doc_id, topic_id, p_ham))
        cur.execute(query, (doc_id, topic_id, p_ham, p_ham))
    # One commit for the whole batch instead of one per row.
    db.commit()
    return [p[1] for p in probs]
Exemplo n.º 3
0
        except Exception, e:
            app.logger.error('cannot parse {}: {}'.format(post['url'], e))
            continue
        if default_author:
            # overwrite whatever blogpostparser identified as the
            # author -- should probably make an exception for guest
            # posts:
            post['authors'] = default_author
        posts.append(post)
        
    if not posts:
        app.logger.warn('no posts to save')
        return 'OK'

    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(post) for post in posts]
    clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham
    clf.load()
    probs = clf.classify(docs)
    for i, (p_no, p_yes) in enumerate(probs):
        post = posts[i]
        app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes))
        if p_yes > app.config['MAX_SPAM'] * 3/2:
            app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2))
            continue
        post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0
        post['spamminess'] = p_yes
        post['meta_confidence'] = 0.75
        query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format(
            ', '.join(post.keys()), '%s, '*len(post.keys()))
        app.logger.debug(query + ', '.join(map(unicode, post.values())))