def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for ``user_id``.

    Uses a redis-cached per-user digest list when available; otherwise
    builds a personalized list by taking a per-class quota of news
    derived from the user's preference model, then caches the digests.
    """
    page_num = int(page_num)
    # Guard against non-positive pages: the original slice arithmetic
    # silently produced wrong slices for page_num <= 0 (sibling
    # get_news_summaries_for_user already guards this).
    if page_num <= 0:
        return []
    begin_index = (page_num - 1) * news_list_batch_size
    end_index = page_num * news_list_batch_size

    # The final list of news to be returned.
    sliced_news = []

    # Personalizing: decide how many items each class contributes.
    preferences = news_recommendation_service_client.getPreferenceForUser(user_id)
    news_numbers = []
    if preferences is not None and len(preferences) > 0:
        news_numbers = [int(round(preference * news_limit)) for preference in preferences]
    print(news_numbers)

    # Read the cache once instead of twice (avoids an expiry race
    # between the existence check and the loads).
    cached = redis_client.get(user_id)
    if cached is not None:
        news_digests = pickle.loads(cached)
        # If begin_index is out of range this returns an empty list; if
        # only end_index is out of range it returns the remaining ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[news_table_name].find({'digest': {'$in': sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        print('taking from db')
        # Take each class's quota of news and merge them, newest first.
        selected_news = []
        for i in range(0, len(news_numbers)):
            selected_news.extend(list(db[news_table_name].find(
                {'class': news_classes_v2.class_map[str(i + 1)]}).limit(news_numbers[i])))
        selected_news = sorted(selected_news, key=lambda k: k['publishedAt'], reverse=True)
        # BUGFIX: wrap in list() so the digests are picklable -- on
        # Python 3, map() returns a lazy iterator pickle cannot dump.
        selected_news_digests = list(map(lambda x: x['digest'], selected_news))
        redis_client.set(user_id, pickle.dumps(selected_news_digests))
        redis_client.expire(user_id, user_news_time_out_in_seconds)
        sliced_news = selected_news[begin_index:end_index]

    # Strip heavy fields and tag today's news before returning.
    for news in sliced_news:
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
    return json.loads(dumps(sliced_news))
def handle_message(msg):
    """De-duplicate an incoming news task and upsert it into MongoDB.

    A task is dropped when its text is too similar (TF-IDF cosine
    similarity) to any news published on the same calendar day.
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # BUGFIX: the original did str(task['text']) first, so the None
    # check below could never trigger (str() never returns None) and a
    # missing 'text' key raised KeyError. Check the raw value first.
    text = task.get('text')
    if text is None:
        return
    text = str(text)

    # Get all recent news published on the same calendar day.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find(
        {'publishedAt': {'$gte': published_at_day_begin, '$lt': published_at_day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [str(news['text']) for news in same_day_news_list]
        documents.insert(0, text)

        # Pairwise cosine similarity between the new text (row 0) and
        # every same-day article.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim.A)

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return

    # Store publishedAt as a datetime so Mongo range queries work.
    task['publishedAt'] = parser.parse(task['publishedAt'])
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def getPreferenceForUser(self, user_id):
    """Return the full list of preference scores for ``user_id``.

    A fresh, uniformly-initialized preference model is created for
    unseen users, because the backend needs it immediately to size
    per-class news batches -- it can no longer wait for the click-log
    processor to create it.
    """
    logging.info('news_recommendation_server: getting preference model for %s' % user_id)
    db = mongodb_client.get_db()
    model = db[preference_model_table_name].find_one({'userId': user_id})
    if model is None:
        print('Creating preference model for new user: %s' % user_id)
        logging.info('news_recommendation_server: creating preference model for %s' % user_id)
        # Uniform prior over all known classes.
        model = {
            'userId': user_id,
            'preference': {klass: float(initial_p) for klass in news_classes_v2.classes},
        }
        db[preference_model_table_name].replace_one({'userId': user_id}, model, upsert=True)
    # The whole preference list is meaningful now, not only the top one.
    pairs = model['preference'].items()
    class_names = [pair[0] for pair in pairs]
    class_values = [pair[1] for pair in pairs]
    print(class_names)
    print(class_values)
    return class_values
def test_basic():
    """Smoke test: insert one doc into a scratch collection, verify
    counts, and clean up."""
    demo = client.get_db('test').demo
    demo.drop()
    assert demo.count() == 0
    demo.insert({'test': 123})
    assert demo.count() == 1
    demo.drop()
    assert demo.count() == 0
    print('test_basic passed!')
def handle_message(msg):
    """Apply one click-log entry to the user's preference model.

    A rate of '1' raises the clicked class, '-1' depresses it; the
    updated model is then persisted back to MongoDB.
    """
    if msg is None or not isinstance(msg, dict):
        print('broken msg')
        logging.error('news_recommendation_server: broken message in click log queue')
        return
    if ('userId' not in msg or 'newsId' not in msg or 'rate' not in msg or 'timestamp' not in msg):
        print('wrong msg')
        logging.error('news_recommendation_server: wrong message in click log queue')
        return

    userId = msg['userId']
    newsId = msg['newsId']
    rate = msg['rate']

    db = mongodb_client.get_db()
    model = db[preference_model_table_name].find_one({'userId': userId})
    if model is None:
        # BUGFIX: the original fell through with model == None and
        # crashed inside raise_class/depress_class. Skip this click;
        # getPreferenceForUser creates the model on the next fetch.
        print('user preference model not found')
        logging.error('news_recommendation_server: user preference model for %s not found' % userId)
        return

    news = db[news_table_name].find_one({'digest': newsId})
    if news is None or 'class' not in news or news['class'] not in news_classes_v2.classes:
        # BUGFIX: probe the dict only when it exists -- the original
        # evaluated "'class' not in news" on None (TypeError).
        print(news is None)
        if news is not None:
            print('class' not in news)
            if 'class' in news:
                print(news['class'] not in news_classes_v2.classes)
        print('Skipping processing...')
        logging.error('news_recommendation_server: class for news: %s in click_task not defined' % newsId)
        return

    click_class = news['class']
    print(click_class)
    print(rate)

    # Positive or negative time-decay update depending on the rate.
    if rate == '1':
        print('raising')
        logging.info('news_recommendation_server: raising %s news for %s' % (click_class, userId))
        raise_class(click_class, model)
    elif rate == '-1':
        print('depressing')
        logging.info('news_recommendation_server: depressing %s news for %s' % (click_class, userId))
        depress_class(click_class, model)

    db[preference_model_table_name].replace_one({'userId': userId}, model, upsert=True)
    print('processing finished')
    logging.info('news_recommendation_server: processing finished')
    print(model['preference'])
def test_basic():
    """Insert two docs into a scratch collection and verify the count."""
    testData = client.get_db('test').testData
    testData.drop()
    assert testData.count() == 0
    testData.insert({'test': 123})
    testData.insert({'test': 223})
    assert testData.count() == 2
    print('test_basic passed!')
def test_basic():
    """Round-trip a single document through the 'test' collection."""
    coll = client.get_db("test").test
    coll.drop()
    assert coll.count() == 0
    coll.insert({"test": "test"})
    assert coll.count() == 1
    coll.drop()
    assert coll.count() == 0
    print("test basic passed.")
def handle_message(msg):
    """Update a user's preference model from one click event.

    Applies exponential time decay: the clicked class gains ALPHA
    weight, every other class decays by (1 - ALPHA).
    """
    if msg is None or not isinstance(msg, dict):
        return
    if ('userId' not in msg or 'newsId' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
    print(model)

    # If no model exists yet, start from a uniform distribution.
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            # Every class starts at INITIAL_P (uniform prior).
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print('Updating preference model for new user: %s' % userId)

    # Update model using the time-decay method.
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news or news['class'] not in news_classes.classes):
        print(news is None)
        print('Skipping processing...')
        return

    click_class = news['class']

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes. CONSISTENCY FIX: items() instead of
    # the Python-2-only iteritems(), matching the sibling processors.
    for i, prob in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId}, model, upsert=True)
def test_basic():
    """Verify insert and drop round-trip on the demo collection."""
    demo = client.get_db('test').demo
    demo.drop()
    assert demo.count() == 0
    demo.insert({'test': 123})
    assert demo.count() == 1
    demo.drop()
    assert demo.count() == 0
    print('test_basic passed!')
def handle_msg(msg):
    """Update the preference model for one click-log message."""
    if msg is None or not isinstance(msg, dict):
        logging.error('Invalid click log message')
        return
    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        logging.error('Click log message does not contain necessary info')
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model does not exist, create a new one
    if model is None:
        logging.info('New user... Creating preference model for user: %s' % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in NEWS_CLASSES:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    logging.info('Updating preference model for user: %s' % userId)

    # Update the model using time decay method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if news is None or 'class' not in news or news['class'] not in NEWS_CLASSES:
        # BUGFIX: the original passed bare booleans as log messages with
        # exc_info=True (no active exception) and evaluated
        # "'class' not in news" even when news was None (TypeError).
        logging.error('Invalid or unclassified news for digest %s; skipping' % newsId)
        return

    click_class = news['class']

    # Update the clicked one
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked ones (items() is Python 2/3 compatible,
    # unlike the original iteritems()).
    for i, prob in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId}, model, upsert=True)
def test_basic():
    """Exercise drop/insert/count on a throwaway collection."""
    coll = client.get_db('test').test
    coll.drop()
    assert coll.count() == 0
    coll.insert({'test': 1})
    assert coll.count() == 1
    coll.drop()
    assert coll.count() == 0
    print('test_basic passed.')
def handle_message(msg):
    """Fold a single click event into the user's preference model."""
    if not isinstance(msg, dict):
        return
    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    user_id = msg['userId']
    news_id = msg['newsId']

    # Fetch (or lazily initialize) the user's preference model.
    db = mongodb_client.get_db()
    model = db[PREFERENCE_TABLE_NAME].find_one({'userId': user_id})
    if model is None:
        LOGGER.info('Creating preference model for new user: %s', user_id)
        model = {
            'userId': user_id,
            'preference': {klass: float(INIT_P) for klass in news_classes.classes},
        }

    LOGGER.info('Updating preference model for user %s', user_id)

    # The clicked news must exist and carry a known class.
    news = db[NEWS_TABLE_NAME].find_one({'digest': news_id})
    if news is None:
        LOGGER.info('there is no news with digest: ' + news_id)
        return
    if 'class' not in news:
        LOGGER.info('there is no class for news.')
        return
    if news['class'] not in news_classes.classes:
        LOGGER.info('do not have a valid class: ' + news['class'])
        return

    clicked = news['class']
    preference = model['preference']

    # Time-decay update: boost the clicked class, decay the rest.
    preference[clicked] = float((1 - ALPHA) * preference[clicked] + ALPHA)
    for klass in preference:
        if klass != clicked:
            preference[klass] = float((1 - ALPHA) * preference[klass])

    db[PREFERENCE_TABLE_NAME].replace_one({'userId': user_id}, model, upsert=True)
def get_news_summaries_for_user(user_id, page_num):
    """Return one page of news summaries (JSON-safe dicts) for a user."""
    page_num = int(page_num)
    if page_num <= 0:
        return []

    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    cached = redis_client.get(user_id)
    if cached is not None:
        # Cache hit: fetch only this page's digests from Mongo.
        digests_for_page = pickle.loads(cached)[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find({'digest': {'$in': digests_for_page}}))
    else:
        # Cache miss: load the newest NEWS_LIMIT items, cache digests.
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        redis_client.set(user_id, pickle.dumps([x['digest'] for x in total_news]))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    # Get preference list for the user.
    # TODO: use preference to customize returned news list.
    preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    topPreference = preference[0] if preference else None

    for news in sliced_news:
        # Remove text field to save bandwidth.
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
        if news['class'] == topPreference:
            news['reason'] = 'Recommend'

    return json.loads(dumps(sliced_news))
def handle_message(msg):
    """Update a user's preference model from one click-log message."""
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        print('bad message')
        print(msg)
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference model.
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model not exists, create a new one.
    if model is None:
        print("Creating preference model for new user: %s" % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    # Update model using time decay method.
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news or news['class'] not in news_classes.classes):
        # BUGFIX: the original called news.keys() unconditionally and
        # raised AttributeError whenever news was None.
        if news is not None:
            print(news.keys())
            if 'class' in news:
                print(news['class'])
        print('Skipping processing...')
        return

    click_class = news['class']

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes.
    for i, prob in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    print(PREFERENCE_MODEL_TABLE_NAME)
    print(userId)
    print(model['preference'])
    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId}, model, upsert=True)
def handle_message(msg):
    """De-duplicate, classify, and store one incoming news task."""
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # Use .get() so a task without a 'text' key is dropped instead of
    # raising KeyError.
    text = task.get('text')
    if text is None:
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': published_at_day_begin, '$lt': published_at_day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim)

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print("Duplicated news. Ignore.")
                return

    # Mongo range queries need a datetime, not the raw string.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news: fall back to the description when the title is
    # missing; skip classification entirely when both are absent
    # (BUGFIX: the original could call classify(None)).
    title = task['title']
    if title is None:
        title = task['description']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    """Apply one click event to the user's preference model (time decay)."""
    if msg is None or not isinstance(msg, dict):
        return
    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model not exists, create a new one
    if model is None:
        print('Creating preference model for new user: %s' % userId)
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print('Updating preference model for new user: %s' % userId)

    # Update model using time decaying method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    print(news)
    if (news is None or 'class' not in news or news['class'] not in news_classes.classes):
        # BUGFIX: only probe the dict when news is not None -- the
        # original ran "'class' not in news" on None (TypeError).
        print(news is None)
        if news is not None:
            print('class' not in news)
            if 'class' in news:
                print(news['class'] not in news_classes.classes)
        print('Skipping processing...')
        return

    click_class = news['class']

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes (items() works on Python 2 and 3,
    # unlike the original iteritems()).
    for i, prob in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId}, model, upsert=True)
def handle_message(msg):
    """De-duplicate one news task, classify it, and upsert into MongoDB."""
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # Drop tasks without a 'text' key instead of raising KeyError.
    text = task.get('text')
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': published_at_day_begin, '$lt': published_at_day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim)

        rows, _ = pairwise_sim.shape
        # Skip row 0 (the new article itself); if any other row in
        # column 0 exceeds the threshold, the article is a duplicate.
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print("Duplicated news. Ignore.")
                return

    # MongoDB time-range queries require a datetime, so convert the
    # string field before storing.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news by its title when one is present
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: overwrite an existing doc with the same digest,
    # otherwise insert a new one.
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def test_basic():
    """Basic MongoDB client round-trip: insert, count, drop."""
    coll = client.get_db('test').test
    coll.drop()
    assert coll.count() == 0
    coll.insert({'test': 1})
    assert coll.count() == 1
    coll.drop()
    assert coll.count() == 0
    print('test_basic passed!')
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for ``user_id``."""
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    cached = redis_client.get(user_id)
    if cached is not None:
        news_digests = pickle.loads(cached)
        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range),
        # this will return all remaining news ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        print(sliced_news_digests)
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find({'digest': {'$in': sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        # BUGFIX: use a list comprehension instead of map() -- on
        # Python 3 map() returns a lazy iterator that pickle cannot
        # serialize, which would break the redis cache.
        total_news_digests = [x['digest'] for x in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    for news in sliced_news:
        # Remove text field to save bandwidth.
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'

    return json.loads(dumps(sliced_news))
def test_basic():
    """End-to-end check: handle_message builds a complete model."""
    db = mongodb_client.get_db()
    db[PREFERENCE_MODEL_TABLE_NAME].delete_many({"userId": "test_user"})

    click_log_processor.handle_message({
        "userId": "test_user",
        "newsId": "test_news",
        "timestamp": str(datetime.utcnow()),
    })

    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': 'test_user'})
    assert model is not None
    assert len(model['preference']) == NUM_OF_CLASSES
    print('test_basic passed!')
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news summaries for ``user_id``."""
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    cached = redis_client.get(user_id)
    if cached is not None:
        news_digest = pickle.loads(cached)
        # If begin_index is out of range this returns an empty list; if
        # only end_index is out of range it returns all remaining ids.
        sliced_news_digests = news_digest[begin_index:end_index]
        print(sliced_news_digests)
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find({'digest': {'$in': sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        # BUGFIX: list comprehension instead of map() so the digests
        # are picklable on Python 3 (map() is a lazy iterator there).
        total_news_digests = [x['digest'] for x in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    for news in sliced_news:
        # Remove text field to save bandwidth.
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'

    return json.loads(dumps(sliced_news))
def logNewsClickForUser(user_id, news_id):
    """Persist a click event and forward it to the ML click-log queue.

    NOTE(review): the AMQP URL embeds credentials in source -- move it
    to configuration/environment. Building a new CloudAMQPClient per
    call is also expensive; consider a module-level client.
    """
    LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/evvloemh"
    LOG_CLICKS_TASK_QUEUE_NAME = "LOG_CLICKS_TASK_QUEUE"
    cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL, LOG_CLICKS_TASK_QUEUE_NAME)

    # Mongo stores a real datetime so time-range queries work.
    message = {'userId': user_id, 'newsId': news_id, 'timestamp': datetime.utcnow()}
    db = mongodb_client.get_db()
    db[CLICK_LOGS_TABLE_NAME].insert(message)

    # Send log task to machine learning service for prediction; the
    # queue payload must be serializable, so the timestamp is a string.
    message = {'userId': user_id, 'newsId': news_id, 'timestamp': str(datetime.utcnow())}
    cloudAMQP_client.sendMessage(message)
def test_basic():
    """Verify the db starts clean, accepts one doc, and cleans up."""
    coll = client.get_db('test').test
    coll.drop()
    assert coll.count() == 0
    coll.insert({'test': 1})
    assert coll.count() == 1
    coll.drop()
    assert coll.count() == 0
    print('test_basic passed!')
def getPreferenceForUser(self, user_id):
    """Return class names sorted by descending preference for a user.

    Returns [] when no model exists or when the distribution is still
    flat (max ~= min), in which case ranking carries no signal.

    NOTE(review): this queries {'user_id': ...} while sibling modules
    key the model on 'userId' -- confirm which field this table uses.
    """
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'user_id': user_id})
    if not model:
        # No such user in the database.
        return []

    ranked = sorted(model['preference'].items(), key=operator.itemgetter(1), reverse=True)
    class_names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]

    # A flat distribution makes no sense to rank.
    if isclose(float(scores[0]), float(scores[-1])):
        return []
    return class_names
def test_basic():
    """handle_message should build a complete preference model."""
    db = mongodb_client.get_db()
    db[PREFERENCE_MODEL_TABLE_NAME].delete_many({"userId": "test_user"})

    click_log_processor.handle_message({
        "userId": "test_user",
        "newsId": "test_news",
        "timestamp": str(datetime.utcnow()),
    })

    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': 'test_user'})
    assert model is not None
    assert len(model['preference']) == NUM_OF_CLASSES
    print('test_basic passed!')
def handle_message(msg):
    """De-duplicate an incoming article, classify it, and upsert it."""
    if not isinstance(msg, dict):
        logger.warning('message is broken')
        return
    text = msg['text']
    if text is None:
        return

    # Collect every article published on the same calendar day.
    published_at = parser.parse(msg['publishedAt'])
    day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    day_end = day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': day_begin, '$lt': day_end}}))

    if same_day_news:
        # Row 0 is the new article; the rest are same-day articles.
        docs = [text] + [news['text'] for news in same_day_news]

        tfidf = TfidfVectorizer().fit_transform(docs)
        pairwise_sim = tfidf * tfidf.T
        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        num_rows = pairwise_sim.shape[0]
        for row in range(1, num_rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicate news. Ignore!")
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify on the description, falling back to the title.
    description = msg['description']
    if description is None:
        description = msg['title']
    msg['class'] = news_topic_modeling_service_client.classify(description)

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
def handle_message(msg):
    """De-dupe one news task against same-day articles and store it."""
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    day_end = day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': day_begin, '$lt': day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim)

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print("Duplicated news. Ignore.")
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        # Uncomment these lines to call the Machine Learning Server
        # for a real topic instead of the hard-coded stub:
        # topic = news_topic_modeling_service_client.classify(title)
        # task['class'] = topic
        task['class'] = "Politics"

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
    LOGGING_NEWS_DEDUPER.info('[x] Insert %s into MongoDB' % (task['title']))
def getNewsSummariesForUser(user_id, page_num):
    """Return one page of news for a user, with recommendation tags."""
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    slice_news = []

    # Redis caches user_id -> pickled list of news digests.
    cached = redis_client.get(user_id)
    if cached:
        # pickle.loads(): bytes -> python object
        total_news_digest = pickle.loads(cached)
        slice_news_digest = total_news_digest[begin_index:end_index]
        db = mongodb_client.get_db()
        # BUGFIX: materialize the cursor with list() -- the original
        # kept a pymongo Cursor, which the tagging loop below consumed
        # and bson dumps() cannot serialize.
        slice_news = list(db[NEWS_TABLE_NAME].find({'digest': {'$in': slice_news_digest}}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        total_news_digest = [news['digest'] for news in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digest))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDES)
        slice_news = total_news[begin_index:end_index]

    # Tag the user's top-preference class and today's news.
    preferences = news_recommandation_client.getPreferenceForUser(user_id)
    topPreference = None
    if preferences and len(preferences) > 0:
        topPreference = preferences[0]

    today_date = str(datetime.now().date())
    for news in slice_news:
        if 'class' in news and news['class'] == topPreference:
            news['reason'] = 'Recommand'
        if today_date in str(news['publishedAt']):
            news['time'] = 'today'

    # bson dumps() -> JSON text -> plain dicts for the caller.
    return json.loads(dumps(slice_news))
def test_basic():
    """Insert/drop round-trip against the tap-news database."""
    print('tst0 pass')
    coll = client.get_db('tap-news').test
    coll.drop()
    assert coll.count() == 0
    print('tst0 pass')
    coll.insert({'test': 1})
    assert coll.count() == 1
    print('tst1 pass')
    coll.drop()
    assert coll.count() == 0
    print('tst2 pass')
def test_basic():
    """Insert one doc and verify counts; re-raise on assertion failure."""
    try:
        coll = client.get_db('test').test
        coll.drop()
        assert coll.count() == 0
        coll.insert({"test": 1})
        assert coll.count() == 1
        coll.drop()
        assert coll.count() == 0
        print('test_basic passed.')
    except AssertionError as err:
        print("test failed!\n")
        raise err
def test_handle_message_basic():
    """Exercise the deduper: None msg, exact duplicate, same-day
    near-duplicate, and a genuinely new article."""
    db = mongodb_client.get_db()

    test_msg_1 = {
        "source": "Test 1",
        "title": "Test 1",
        "publishedAt": "2018-03-17T18:42:00Z",
        "digest": "test1",
        "text": "this is a test.",
    }
    test_msg_2 = {
        "source": "Test 2",
        "title": "Test 2",
        "publishedAt": "2018-03-17T23:18:00Z",
        "digest": "test2",
        "text": "is this a test?",
    }
    test_msg_3 = {
        "source": "Test 3",
        "title": "Test 3",
        "publishedAt": "2018-03-17T23:18:00Z",
        "digest": "test3",
        "text": "this is a new test!",
    }

    db[NEWS_TABLE_NAME].insert(test_msg_1)
    baseline = db[NEWS_TABLE_NAME].count()

    # A None message must be ignored.
    news_deduper.handle_message(None)
    assert db[NEWS_TABLE_NAME].count() == baseline
    print("null check passed")

    # Re-sending the same article must not add a row.
    news_deduper.handle_message(test_msg_1)
    assert db[NEWS_TABLE_NAME].count() == baseline
    print("duplicate check passed")

    # A same-day near-duplicate is also dropped.
    news_deduper.handle_message(test_msg_2)
    assert db[NEWS_TABLE_NAME].count() == baseline

    # A sufficiently different article is stored.
    news_deduper.handle_message(test_msg_3)
    assert db[NEWS_TABLE_NAME].count() == baseline + 1
    print("handle_message test passed")

    # Clean up the fixtures.
    db[NEWS_TABLE_NAME].remove({"title": "Test 1"})
    db[NEWS_TABLE_NAME].remove({"title": "Test 2"})
    db[NEWS_TABLE_NAME].remove({"title": "Test 3"})
def handle_message(msg):
    """Deduplicate one news task and store it with a predicted category.

    Raises NotContainPublishTimeError when the task lacks publishedAt.
    """
    if msg is None or not isinstance(msg, dict):
        print('News Deduper: message is broken')
        return
    task = msg
    if 'text' not in task or not task['text']:
        print('News Deduper publishedAt, not containing text')
        return
    if 'publishedAt' not in task or not task['publishedAt']:
        raise NotContainPublishTimeError

    # Same-calendar-day window for duplicate detection.
    published_at = parser.parse(task['publishedAt'])
    day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    day_end = day_begin + datetime.timedelta(days=1)

    news_collection = mongodb_client.get_db(DB_NAME).get_collection(COLLECTION_NAME)
    news_on_the_day = news_collection.find({'publishedAt': {'$gte': day_begin, '$lt': day_end}})

    # Row/column 0 is the candidate; the rest are same-day articles.
    documents = [task['text']]
    documents.extend(news['text'] for news in news_on_the_day)

    tf_idf = TfidfVectorizer().fit_transform(documents)
    similarity_matrix = tf_idf * tf_idf.T

    num_rows = similarity_matrix.shape[0]
    if any(similarity_matrix[0, i] > NEWS_SIMILARITY_THRESHOLD for i in range(1, num_rows)):
        print('News Deduper: similar document, throw it away')
        return

    # Reformat the published date as a datetime for Mongo queries.
    task['publishedAt'] = published_at

    # Best-effort classification: a classifier outage must not block
    # ingestion. TODO: feature extraction should match the backfill
    # procedure; consider a dedicated classification queue.
    if 'title' in task:
        try:
            task['category'] = classifier_client.classify(task['title'])
        except Exception as e:
            print("News Deduper: failed to classify using the classifier client", e)

    news_collection.replace_one({'digest': task['digest']}, task, upsert=True)
def basic_test():
    """Feed three messages through the deduper; only the first sticks."""
    database = client.get_db()
    database.news.drop()
    assert database.news.count() == 0

    news_deduper.handle_message(MESSAGE_1)
    assert database.news.count() == 1
    # Near-duplicates of MESSAGE_1 must be rejected.
    news_deduper.handle_message(MESSAGE_2)
    assert database.news.count() == 1
    news_deduper.handle_message(MESSAGE_3)
    assert database.news.count() == 1

    database.news.drop()
    assert database.news.count() == 0
    print('Basic test for deduper passed.')
def logNewsClickForUser(user_id, news_id, user_agent, news_category): print '[logNewsClickForUser]\n' print 'user_id:', user_id print 'news_id:', news_id print 'user_agent:', user_agent print 'news_category:', news_category # signup # user_agent update_user_agent(user_agent) # news category update_news_category(news_category) message = { 'userId': user_id, 'newsId': news_id, 'timestamp': datetime.utcnow() } db = mongodb_client.get_db() # save all log db[CLICK_LOGS_TABLE_NAME].insert(message) # save daily log day_click_logs_table_name = CLICK_LOGS_TABLE_NAME + datetime.today( ).strftime('_%Y-%m-%d') print 'table: ' + day_click_logs_table_name db[day_click_logs_table_name].insert(message) # count clickinng number evey hour update_hour_clicking_number() # update_daily_active_users update_daily_active_users(user_id) # update user freq update_daily_active_users_freq(user_id) # update item freq update_daily_active_news_freq(news_id) # Send log task to machine learning service for prediction message = { 'userId': user_id, 'newsId': news_id, 'timestamp': str(datetime.utcnow()) } cloudAMQP_client.send_message(message)
def test_basic():
    """Round-trip a single document through a scratch database."""
    # client.get_db()['test'] is equivalent to client.get_db('test').
    database = client.get_db()['test']
    database.test.drop()
    assert database.test.count() == 0
    database.test.insert_one({'test': 1})
    assert database.test.count() == 1
    database.test.drop()
    assert database.test.count() == 0
    print('test basic pass')
def getChatHistory(event_id):
    """Return the chat history for an event, newest first.

    Results are cached in redis for 60 seconds. Returns None (after
    printing "None") when event_id is missing.
    """
    if event_id is None:
        print("None")
        return

    cached = redis_client.get(event_id)
    if cached is not None:
        chat_list = pickle.loads(cached)
    else:
        db = mongodb_client.get_db()
        chat_list = list(db['chat'].find({"event_id": event_id}).sort([('time', -1)]))
        redis_client.set(event_id, pickle.dumps(chat_list))
        redis_client.expire(event_id, 60)  # cache for 60 seconds

    return json.loads(dumps(chat_list))
def test_basic():
    """Start from a clean collection, insert one doc, verify, clean up."""
    # clear test documents
    coll = client.get_db('test').test
    coll.drop()
    assert coll.count() == 0
    coll.insert({'test': 1})
    assert coll.count() == 1
    coll.drop()
    assert coll.count() == 0
    print('test_basic passed.')
import os
import sys

# Backfill script: assign a topic class to every news document in
# MongoDB that does not have one yet.

# Import common package in parent directory.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    db = mongodb_client.get_db()
    count = 0
    for news in db['news'].find({}):
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating classes...')
            # Prefer the description; fall back to the title.
            description = news['description']
            if description is None:
                description = news['title']
            news['class'] = news_topic_modeling_service_client.classify(description)
            db['news-test'].replace_one({'digest': news['digest']}, news, upsert=True)