def tf_idf():
    """Run the tf-idf calculation over every file in the word-count folder.

    Ensures the directory ``settings.TFIDF_FILE_PATH`` exists, then creates
    one ``Calculator`` and calls its ``tf_idf`` method once for each name
    returned by ``util.get_file_list(settings.WORD_COUNT_FILE_PATH)``.

    Raises:
        OSError: if the output directory is missing and cannot be created.
    """
    util.print_message('Start counting tf-idf...', debug=True)

    # Try to create the directory and tolerate "already exists" instead of
    # checking first: a separate exists() check followed by mkdir() can
    # crash when another process creates the directory in between.
    try:
        os.mkdir(settings.TFIDF_FILE_PATH)
    except OSError:
        # Anything other than "the path is already there" is a real error.
        if not os.path.exists(settings.TFIDF_FILE_PATH):
            raise

    c = Calculator()
    for file_name in util.get_file_list(settings.WORD_COUNT_FILE_PATH):
        util.print_message('Processing tf-idf on {0}', arg=file_name)
        # The consumer loop in this file passes a URL and a body in the
        # second and third positions; here both are None.
        # NOTE(review): presumably None makes the calculator use the
        # already-stored word counts - confirm against Calculator.tf_idf.
        c.tf_idf(file_name, None, None)
def push_to_mongo(db, message):
    """Insert *message* into ``db.arch`` unless its URL is already stored.

    Looks up a document whose ``url`` equals ``message.url``. When none is
    found, the message's attribute dictionary is inserted; otherwise a
    notice is printed and nothing is written.

    Ordinary errors raised by the lookup or the insert are reported through
    ``util.print_message`` and not re-raised, so one failing message does
    not stop the caller. ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately NOT caught, so the process can still be stopped.

    Args:
        db: object exposing an ``arch`` collection with ``find_one`` and
            ``insert_one``.
        message: object with a ``url`` attribute; its attributes form the
            stored document.
    """
    try:
        record = db.arch.find_one({'url': message.url})
        if record is None:
            # vars(message) is message.__dict__ itself, not a copy.
            # NOTE(review): if db is a pymongo database, insert_one adds an
            # '_id' key to the dict it receives, i.e. to the message too.
            db.arch.insert_one(vars(message))
        else:
            util.print_message(
                '{0} already exists in mongo'.format(message.url))
    except Exception as e:
        # repr() keeps both the exception class and its message; the class
        # alone is not enough to diagnose a failure.
        util.print_message('Exception happened {0}'.format(repr(e)))


if __name__ == '__main__':
    # Script entry point: read posts from Kafka, tag them using tf-idf and
    # store them in Mongo.
    calculator = Calculator()
    consumer = KafkaConsumer(settings.KAFKA_TOPIC,
                             bootstrap_servers=settings.KAFKA_SERVERS)
    db = connect_mongo()

    for m in consumer:
        p = util.json2obj(m.value)
        # The per-post file name is derived from the URL's hash.
        name = util.get_md5_hash(p.url)
        util.print_message(p.url)

        d = calculator.tf_idf(name + '.txt', p.url, p.body)
        tags = pick_tags(d)

        # keywords and excerpt are optional on the incoming object and
        # default to an empty string when absent.
        keywords = getattr(p, 'keywords', '')
        excerpt = getattr(p, 'excerpt', '')

        post = Post(p.url, p.title, p.body, keywords, excerpt, tags)
        push_to_mongo(db, post)