示例#1
0
def classify(rows, topic, topic_id):
    """Classify documents for one topic and persist the resulting strengths.

    Each row is converted to text with ``doc2text``, scored by the
    ``BinaryClassifier`` loaded for ``topic_id``, and the second value of
    every probability pair is upserted into ``docs2topics`` as ``strength``
    for that row's ``doc_id``.

    Args:
        rows: sequence of mappings; each must provide a ``'doc_id'`` key and
            be acceptable to ``doc2text``.
        topic: not used by this function; kept so existing callers keep
            working.
        topic_id: identifier handed to ``BinaryClassifier`` and stored in the
            ``topic_id`` column.

    Returns:
        A list holding the second element of each probability pair returned
        by the classifier, in classifier order.
    """
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    # Whatever Capturing collects while the classifier runs is forwarded to
    # the debug log afterwards.
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
    # Values are bound by the DB driver instead of being formatted into the
    # SQL text: this avoids SQL injection and invalid statements for unusual
    # values. The strength is bound twice (insert value + update value).
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength=%s
    '''
    for row, (p_spam, p_ham) in zip(rows, probs):
        doc_id = row['doc_id']
        app.logger.debug("doc {} classified for topic {}: {}".format(
            doc_id, topic_id, p_ham))
        # NOTE(review): presumably the classifier yields (numpy) floats;
        # converting to a plain float gives the driver a type it can
        # certainly bind -- confirm against BinaryClassifier.classify.
        strength = float(p_ham)
        params = (doc_id, topic_id, strength, strength)
        app.logger.debug("%s params=%r", query, params)
        cur.execute(query, params)
        # Commit per row, as before, so rows stored before a later failure
        # stay persisted.
        db.commit()
    return [p[1] for p in probs]
示例#2
0
            continue
        if default_author:
            # overwrite whatever blogpostparser identified as the
            # author -- should probably make an exception for guest
            # posts:
            post['authors'] = default_author
        posts.append(post)
        
    if not posts:
        app.logger.warn('no posts to save')
        return 'OK'

    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(post) for post in posts]
    clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham
    clf.load()
    probs = clf.classify(docs)
    for i, (p_no, p_yes) in enumerate(probs):
        post = posts[i]
        app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes))
        if p_yes > app.config['MAX_SPAM'] * 3/2:
            app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2))
            continue
        post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0
        post['spamminess'] = p_yes
        post['meta_confidence'] = 0.75
        query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format(
            ', '.join(post.keys()), '%s, '*len(post.keys()))
        app.logger.debug(query + ', '.join(map(unicode, post.values())))
        try:
            cur.execute(query, post.values())