Пример #1
0
def tf_idf():
    """Run the tf-idf calculation for every file in the word-count directory.

    Makes sure ``settings.TFIDF_FILE_PATH`` exists as a directory, then calls
    ``Calculator.tf_idf(file_name, None, None)`` once for each name returned
    by ``util.get_file_list(settings.WORD_COUNT_FILE_PATH)``.

    Returns:
        None. Whatever ``Calculator.tf_idf`` returns is discarded here.

    Raises:
        OSError: if the output directory cannot be created (for example the
            path already exists but is a regular file).
    """
    util.print_message('Start counting tf-idf...', debug=True)

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no
    # window in which another process can create the directory between the
    # check and the creation, and missing parent directories are created too.
    os.makedirs(settings.TFIDF_FILE_PATH, exist_ok=True)

    calculator = Calculator()
    for file_name in util.get_file_list(settings.WORD_COUNT_FILE_PATH):
        # The template and its argument are handed over separately;
        # print_message is given the value through its `arg` keyword.
        util.print_message('Processing tf-idf on {0}', arg=file_name)
        # NOTE(review): the two None arguments sit where the streaming caller
        # passes a url and a body; presumably None makes the calculator work
        # from the stored word-count file instead - confirm in Calculator.
        calculator.tf_idf(file_name, None, None)
Пример #2
0

def push_to_mongo(db, message):
    """Store *message* in ``db.arch`` unless its url is already recorded.

    Args:
        db: object exposing an ``arch`` collection that supports
            ``find_one`` and ``insert_one`` (presumably a pymongo database
            handle - confirm against ``connect_mongo``).
        message: object with a ``url`` attribute; its ``__dict__`` is what
            gets inserted as the document.

    Returns:
        None. Failures raised by the lookup or the insert are reported
        through ``util.print_message`` and are not re-raised, so a single
        bad record does not abort the caller.
    """
    try:
        if db.arch.find_one({'url': message.url}) is None:
            # NOTE(review): the live attribute dict is passed, not a copy.
            # If `db` is a pymongo database, insert_one adds an `_id` key to
            # it, so `message` would gain an `_id` attribute - confirm.
            db.arch.insert_one(message.__dict__)
        else:
            util.print_message(
                '{0} already exists in mongo'.format(message.url))
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the process, and log the
        # exception's repr (class and message) instead of only its class.
        util.print_message('Exception happened {0}'.format(repr(exc)))


if __name__ == '__main__':
    # One-time setup: the tf-idf calculator, the Kafka subscription and the
    # Mongo handle are created once and reused for every consumed record.
    calculator = Calculator()
    consumer = KafkaConsumer(
        settings.KAFKA_TOPIC,
        bootstrap_servers=settings.KAFKA_SERVERS,
    )
    db = connect_mongo()

    # Iterating the consumer yields one record per loop pass; each record is
    # decoded, scored, tagged and handed to push_to_mongo.
    for record in consumer:
        page = util.json2obj(record.value)
        doc_name = util.get_md5_hash(page.url)
        util.print_message(page.url)

        # The calculator is addressed by a '.txt' name derived from the url
        # hash, and also receives the url and the body themselves.
        tfidf_result = calculator.tf_idf(
            doc_name + '.txt', page.url, page.body)
        tags = pick_tags(tfidf_result)

        # 'keywords' and 'excerpt' are optional on the decoded object and
        # fall back to an empty string when absent.
        post = Post(
            page.url,
            page.title,
            page.body,
            getattr(page, 'keywords', ''),
            getattr(page, 'excerpt', ''),
            tags,
        )
        push_to_mongo(db, post)