def update_classifier(topic_id):
    """(Re)train the binary classifier for *topic_id* from its training docs.

    Pulls up to the 100 most recent training documents for the topic,
    trains and saves a BinaryClassifier when both positive (1) and
    negative (0) samples exist, then NULLs out non-training strengths so
    documents get reclassified lazily on next display.

    Returns a status message string (training output, or an explanation
    of why training was skipped).
    """
    from classifier import BinaryClassifier, doc2text
    db = get_db()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    # Parameterized query: never interpolate ids into SQL with str.format
    # (SQL injection / quoting hazard). MySQLdb uses %s placeholders.
    query = '''
        SELECT D.*, M.strength
        FROM docs D, docs2topics M
        WHERE M.doc_id = D.doc_id AND M.topic_id = %s AND M.is_training = 1
        ORDER BY D.found_date DESC LIMIT 100
    '''
    app.logger.debug(query)
    cur.execute(query, (topic_id,))
    rows = cur.fetchall()
    docs = [doc2text(row) for row in rows]
    classes = [row['strength'] for row in rows]
    msg = ''
    if 0 in classes and 1 in classes:
        # Capture the trainer's stdout so it can be returned to the caller.
        with Capturing() as output:
            clf = BinaryClassifier(topic_id)
            clf.train(docs, classes)
            clf.save()
        msg += '\n'.join(output)
        # We could reclassify all documents now, but we postpone this step
        # until the documents are actually displayed (which may be never
        # for sufficiently old ones). So we simply undefine the topic
        # strengths to mark that no classification has yet been made.
        query = "UPDATE docs2topics SET strength = NULL WHERE topic_id = %s AND is_training < 1"
        app.logger.debug(query)
        cur.execute(query, (topic_id,))
        db.commit()
    else:
        msg = "classifier not yet ready because only positive or negative training samples"
    return msg
def classify(rows, topic, topic_id):
    """Classify *rows* for *topic_id* and persist the strengths.

    Loads the saved BinaryClassifier for the topic, computes class
    probabilities for each row, upserts the "ham" probability into
    docs2topics, and returns the list of ham probabilities.

    NOTE(review): *topic* is currently unused; kept for caller
    compatibility.
    """
    from classifier import BinaryClassifier, doc2text
    docs = [doc2text(row) for row in rows]
    # Capture classifier chatter so it goes to the debug log, not stdout.
    with Capturing() as output:
        clf = BinaryClassifier(topic_id)
        clf.load()
        probs = clf.classify(docs)
    app.logger.debug('\n'.join(output))
    db = get_db()
    cur = db.cursor()
    # Parameterized upsert (hoisted out of the loop): interpolating values
    # with str.format is an SQL injection / quoting hazard.
    query = '''
        INSERT INTO docs2topics (doc_id, topic_id, strength)
        VALUES (%s, %s, %s)
        ON DUPLICATE KEY UPDATE strength = %s
    '''
    for i, (p_spam, p_ham) in enumerate(probs):
        app.logger.debug("doc {} classified for topic {}: {}".format(
            rows[i]['doc_id'], topic_id, p_ham))
        cur.execute(query, (rows[i]['doc_id'], topic_id, p_ham, p_ham))
    db.commit()
    # probs entries are (p_spam, p_ham) pairs; return the ham side.
    return [p[1] for p in probs]
app.logger.error('cannot parse {}: {}'.format(post['url'], e)) continue if default_author: # overwrite whatever blogpostparser identified as the # author -- should probably make an exception for guest # posts: post['authors'] = default_author posts.append(post) if not posts: app.logger.warn('no posts to save') return 'OK' from classifier import BinaryClassifier, doc2text docs = [doc2text(post) for post in posts] clf = BinaryClassifier(0) # classifier 0 is for blogspam; note that 1=>blogspam, 0=>blogham clf.load() probs = clf.classify(docs) for i, (p_no, p_yes) in enumerate(probs): post = posts[i] app.logger.debug(u"post {} has blogspam probability {}".format(post['title'], p_yes)) if p_yes > app.config['MAX_SPAM'] * 3/2: app.logger.debug("> max {}".format(app.config['MAX_SPAM'] * 3/2)) continue post['status'] = 1 if p_yes < app.config['MAX_SPAM'] * 3/4 else 0 post['spamminess'] = p_yes post['meta_confidence'] = 0.75 query = "INSERT INTO docs ({}, found_date) VALUES ({} NOW())".format( ', '.join(post.keys()), '%s, '*len(post.keys())) app.logger.debug(query + ', '.join(map(unicode, post.values()))) try:
def main():
    """Train and evaluate a binary audio-event classifier (TF v1 graph API).

    Builds a feed-forward BinaryClassifier over AUDIO_FEATURE_SIZE-dim
    audio features, trains it on batches from the bal_train TFRecords to
    detect one sound-event class, then prints the F1 score on the eval set.
    """
    x = tf.placeholder(tf.float32, [None, AUDIO_FEATURE_SIZE])
    y = tf.placeholder(tf.float32, [None, 2])
    n_units = [100, 100, 50]
    n_batches = 10000
    batch_size = 10
    sound_event = 137  # Testing with sound event class of music.

    classifier = BinaryClassifier(x, y, n_units)

    path = "./trainingFeatures/bal_train/"
    filenames = [path + f for f in listdir(path)]

    eval_path = "./trainingFeatures/eval/"
    # BUG FIX: this previously listed the training directory (`path`), so
    # "evaluation" silently ran on training data instead of the held-out set.
    eval_filenames = [eval_path + f for f in listdir(eval_path)]

    batch = tf.train.batch(extract_example(filenames), batch_size,
                           dynamic_pad=True)
    eval_batch = tf.train.batch(extract_example(eval_filenames), EVAL_SET_SIZE,
                                dynamic_pad=True)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        # Queue runners feed the tf.train.batch pipelines.
        coordinator = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coordinator)

        # Train the model: label each example positive iff the target
        # sound-event class appears in its label set.
        for _ in range(n_batches):
            labels, audio_features = sess.run(batch)
            targets = [POSITIVE if sound_event in labels[j] else NEGATIVE
                       for j in range(batch_size)]
            sess.run(classifier.train,
                     feed_dict={x: audio_features, y: targets})

        # Evaluate the model on the held-out set.
        labels, audio_features = sess.run(eval_batch)
        targets = [POSITIVE if sound_event in labels[i] else NEGATIVE
                   for i in range(EVAL_SET_SIZE)]
        print(sess.run(classifier.f1_score,
                       feed_dict={x: audio_features, y: targets}))

        coordinator.request_stop()
        coordinator.join()