Example No. 1
def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * news_list_batch_size
    end_index = page_num * news_list_batch_size

    # The final list of news to be returned.
    sliced_news = []

    # Personalization: decide how many news items to take from each class.
    preferences = news_recommendation_service_client.getPreferenceForUser(user_id)
    news_numbers = []
    if preferences is not None and len(preferences) > 0:
        news_numbers = [int(round(preference * news_limit)) for preference in preferences]
    print news_numbers

    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))
        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range), this
        # will return all remaining news ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[news_table_name].find({
            'digest': {
                '$in': sliced_news_digests
            }
        }))
    else:
        db = mongodb_client.get_db()
        print 'taking from db'
        # Take the allotted number of news items from each class, then merge and sort them by publish time.
        selected_news = []
        for i in range(0, len(news_numbers)):
            selected_news.extend(list(db[news_table_name].find({
                'class': news_classes_v2.class_map[str(i+1)]
            }).limit(news_numbers[i])))
        selected_news = sorted(selected_news, key=lambda k: k['publishedAt'], reverse=True)[:]
        
        # caching digests and paging
        selected_news_digests = map(lambda x:x['digest'], selected_news)
        redis_client.set(user_id, pickle.dumps(selected_news_digests))
        redis_client.expire(user_id, user_news_time_out_in_seconds)
        sliced_news = selected_news[begin_index:end_index]

    # Add display tags and return.
    for news in sliced_news:
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
    return json.loads(dumps(sliced_news))
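
The per-class allocation above turns each preference score into a number of articles to fetch. A minimal illustration of that arithmetic; the news_limit value and the preference scores are made up for the example, not the project's configuration:

news_limit = 100
preferences = [0.5, 0.3, 0.2]  # hypothetical per-class preference scores
news_numbers = [int(round(p * news_limit)) for p in preferences]
print(news_numbers)  # -> [50, 30, 20]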
Example No. 2
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # Convert to str only after the None check; str(None) would silently pass the check below.
    text = task['text']
    if text is None:
        return
    text = str(text)

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': published_at_day_begin, '$lt': published_at_day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [str(news['text']) for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim.A

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
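
The dedup check above relies on TfidfVectorizer L2-normalizing its rows, so tfidf * tfidf.T is the matrix of pairwise cosine similarities and column 0 compares every stored article against the incoming one. A self-contained sketch of the same check; scikit-learn is assumed, and the threshold value is illustrative rather than the project's setting:

from sklearn.feature_extraction.text import TfidfVectorizer

SAME_NEWS_SIMILARITY_THRESHOLD = 0.9  # illustrative value

def is_duplicate(candidate_text, existing_texts):
    """Return True if candidate_text is too similar to any existing text."""
    if not existing_texts:
        return False
    documents = [candidate_text] + list(existing_texts)
    # Rows are L2-normalized by default, so the sparse product with the transpose
    # yields pairwise cosine similarities.
    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_sim = tfidf * tfidf.T
    rows, _ = pairwise_sim.shape
    return any(pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD
               for row in range(1, rows))

print(is_duplicate('this is a test.', ['this is a test.', 'unrelated news text']))  # True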
Example No. 3
    def getPreferenceForUser(self, user_id):
        logging.info('news_recommendation_server: getting preference model for %s' % user_id)
        db = mongodb_client.get_db()
        model = db[preference_model_table_name].find_one({'userId':user_id})
        # Note: creating a preference model for new users was moved here because the backend
        # needs this model to decide how many news items from each class to send to the user;
        # we can no longer wait for the click log processor to do that for us.
        if model is None:
            print 'Creating preference model for new user: %s' % user_id
            logging.info('news_recommendation_server: creating preference model for %s' % user_id)
            new_model = {'userId' : user_id}
            preference = {}
            for i in news_classes_v2.classes:
                preference[i] = float(initial_p)
            new_model['preference'] = preference
            model = new_model
            db[preference_model_table_name].replace_one({'userId': user_id}, model, upsert=True)
        
        # Another change: the whole preference list is now used, not just the top element.
        preference_tuples = model['preference'].items()
        preference_list = [x[0] for x in preference_tuples]
        preference_value_list = [x[1] for x in preference_tuples]

        print preference_list
        print preference_value_list
        return preference_value_list
Example No. 4
def test_basic():
    db = client.get_db('test')
    db.demo.drop()
    assert db.demo.count() == 0
    db.demo.insert({'test':123})
    assert db.demo.count() == 1
    db.demo.drop()
    assert db.demo.count() == 0
    print 'test_basic passed!'
Example No. 5
def handle_message(msg):
    if msg is None or not isinstance(msg, dict) :
        print 'broken msg'
        logging.error('news_recommendation_server: broken message in click log queue')
        return

    if ('userId' not in msg
        or 'newsId' not in msg
        or 'rate' not in msg
        or 'timestamp' not in msg):
        print 'wrong msg'
        logging.error('news_recommendation_server: wrong message in click log queue')
        return

    userId = msg['userId']
    newsId = msg['newsId']
    rate = msg['rate']

    db = mongodb_client.get_db()
    model = db[preference_model_table_name].find_one({'userId': userId})
    if model is None:
        print 'user preference model not found'
        logging.error('news_recommendation_server: user preference model for %s not found' % userId)
        return

    print 'Updating preference model for user: %s' % userId
    logging.info('news_recommendation_server: Updating preference model for user: %s' % userId)

    news = db[news_table_name].find_one({'digest': newsId})
    if (news is None
        or 'class' not in news
        or news['class'] not in news_classes_v2.classes):
        print news is None
        if news is not None:
            print 'class' not in news
            print news.get('class') not in news_classes_v2.classes
        print 'Skipping processing...'
        logging.error('news_recommendation_server: class for news: %s in click_task not defined' % newsId)
        return

    click_class = news['class']
    print click_class
    print rate

    # apply positive or negative time decay to the model, depending on the user's rate:
    if(rate == '1'):
        print 'raising'
        logging.info('news_recommendation_server: raising %s news for %s' %(click_class, userId))
        raise_class(click_class, model)
    elif(rate == '-1'):
        print 'depressing'
        logging.info('news_recommendation_server: depressing %s news for %s' %(click_class, userId))
        depress_class(click_class, model)

    db[preference_model_table_name].replace_one({'userId': userId}, model, upsert=True)
    print 'processing finished'
    logging.info('news_recommendation_server: processing finished')
    print model['preference']
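
raise_class and depress_class are not defined in this snippet. A plausible sketch, assuming the same time-decay update that later examples apply inline; ALPHA and both function bodies below are assumptions, not the snippet's actual implementation:

ALPHA = 0.2  # assumed learning rate

def raise_class(click_class, model):
    # Move the clicked class toward 1 and decay every other class toward 0.
    for c, p in model['preference'].items():
        model['preference'][c] = float((1 - ALPHA) * p + (ALPHA if c == click_class else 0.0))

def depress_class(click_class, model):
    # Decay the disliked class toward 0, leaving the other classes untouched.
    model['preference'][click_class] = float((1 - ALPHA) * model['preference'][click_class])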
def test_basic():
    db = client.get_db('test')
    db.testData.drop()
    assert db.testData.count() == 0
    db.testData.insert({'test': 123})
    db.testData.insert({'test': 223})
    assert db.testData.count() == 2
    '''
    db.demo.drop()
    assert db.demo.count() == 0
    '''
    print 'test_basic passed!'
Example No. 7
def test_basic():
    db = client.get_db("test")
    db.test.drop()
    assert db.test.count() == 0

    db.test.insert({"test": "test"})
    assert db.test.count() == 1

    db.test.drop()
    assert db.test.count() == 0

    print("test basic passed.")
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    if ('userId' not in msg or 'newsId' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
    print model
    # If the model does not exist, create a new one
    if model is None:
        print 'Creating preference model for new user: %s' % userId
        new_model = {'userId': userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)  # initialize every class's preference to INITIAL_P (e.g. 0.17)
        new_model['preference'] = preference
        model = new_model

    print 'Updating preference model for user: %s' % userId

    # Update model using time decaying method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news
            or news['class'] not in news_classes.classes):
        print news is None
        # print 'class' not in news
        # print news['class'] not in news_classes.classes
        print 'Skipping processing...'
        return

    click_class = news['class']  # get the class of clicked news

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes.
    for i, prob in model['preference'].iteritems():
        if not i == click_class:
            model['preference'][i] = float(
                (1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId},
                                                model,
                                                upsert=True)
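
A side effect of the update above: if the preference values start out as a probability distribution (INITIAL_P = 1 / number of classes), they keep summing to 1, because (1 - ALPHA) * 1 + ALPHA = 1. A quick sanity check; the class names and constant values here are assumed for the illustration:

ALPHA = 0.2          # assumed learning rate
INITIAL_P = 1.0 / 6  # assumed: six news classes

prefs = {c: INITIAL_P for c in ['Politics', 'Sports', 'Technology', 'World', 'Business', 'Science']}
clicked = 'Technology'
for c in prefs:
    prefs[c] = (1 - ALPHA) * prefs[c] + (ALPHA if c == clicked else 0.0)
print(round(sum(prefs.values()), 10))  # -> 1.0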
Example No. 9
def test_basic():
    db = client.get_db('test')
    db.demo.drop()
    assert db.demo.count() == 0

    db.demo.insert({'test': 123})
    assert db.demo.count() == 1

    db.demo.drop()
    assert db.demo.count() == 0

    print 'test_basic passed!'
Example No. 10
def handle_msg(msg):
    if msg is None or not isinstance(msg, dict):
        logging.error('Invalid click log message')
        return

    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        logging.error('Click log message does not contain necessary info')
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If model does not exist, create a new one
    if model is None:
        logging.info('New user... Creating preference model for user: %s' %
                     userId)
        new_model = {'userId': userId}
        preference = {}
        for i in NEWS_CLASSES:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    logging.info('Updating preference model for user: %s' % userId)

    # Update the model using time decay method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    if (news is None or 'class' not in news
            or news['class'] not in NEWS_CLASSES):
        logging.error('news is None: %s' % (news is None))
        if news is not None:
            logging.error('class missing: %s' % ('class' not in news))
            logging.error('class unknown: %s' % (news.get('class') not in NEWS_CLASSES))
        return

    click_class = news['class']

    # Update the clicked one
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked ones
    for i, prob in model['preference'].iteritems():
        if not i == click_class:
            model['preference'][i] = float(
                (1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId},
                                                model,
                                                upsert=True)
def test_basic():
    db = client.get_db('test')
    db.test.drop()
    assert db.test.count() == 0

    db.test.insert({'test': 1})
    assert db.test.count() == 1

    db.test.drop()
    assert db.test.count() == 0

    print('test_basic passed.')
def handle_message(msg):
    if not isinstance(msg, dict):
        return

    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    user_id = msg['userId']
    news_id = msg['newsId']

    # update user preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_TABLE_NAME].find_one({'userId': user_id})

    if model is None:
        LOGGER.info('Creating preference model for new user: %s', user_id)
        new_model = {'userId': user_id}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INIT_P)
        new_model['preference'] = preference
        model = new_model

    LOGGER.info('Updating preference model for user %s', user_id)

    # update model using the time-decay method
    news = db[NEWS_TABLE_NAME].find_one({'digest': news_id})

    # news should have 'class' field
    # and the class should be in the news_class list
    if news is None:
        LOGGER.info('there is no news with digest: ' + news_id)
        return
    if 'class' not in news:
        LOGGER.info('there is no class for news.')
        return
    if news['class'] not in news_classes.classes:
        LOGGER.info('do not have a valid class: ' + news['class'])
        return

    click_class = news['class']
    # update the clicked one
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)
    # update not-clicked classes
    for i, _ in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float(
                (1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_TABLE_NAME].replace_one({'userId': user_id},
                                          model,
                                          upsert=True)
Example No. 13
def get_news_summaries_for_user(user_id, page_num):
    page_num = int(page_num)
    if page_num <= 0:
        return []

    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    sliced_news = []

    if redis_client.get(user_id) is not None:
        total_news_digests = pickle.loads(redis_client.get(user_id))
        sliced_news_digests = total_news_digests[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find({'digest':{'$in':sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        total_news_digests = [x['digest'] for x in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]

    # Get preference list for the user.
    # TODO: use preference to customize returned news list.
    preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    topPreference = None
    
    if preference is not None and len(preference) > 0:
        topPreference = preference[0]
    
    for news in sliced_news:
        # Remove text field to save bandwidth.
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
        if news.get('class') == topPreference:
            news['reason'] = 'Recommend'
        
    return json.loads(dumps(sliced_news))
Example No. 14
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return

    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        print ('bad message')
        print (msg)
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference model.
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId':userId})

    # If the model does not exist, create a new one.
    if model is None:
        print("Creating preference model for new user: %s" % userId)
        new_model = {'userId':userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    # Update model using time decay method.
    news = db[NEWS_TABLE_NAME].find_one({'digest':newsId})

    if (news is None or 'class' not in news or news['class'] not in news_classes.classes):
        if news is not None:
            print(news.keys())
            if 'class' in news:
                print(news['class'])
        print('Skipping processing...')
        return

    click_class = news['class']

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes.
    for i, prob in model['preference'].items():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    print (PREFERENCE_MODEL_TABLE_NAME)
    print (userId)
    print (model['preference'])
    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId':userId}, model, upsert=True)
Example No. 15
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news.
    title = task['title']
    if title is None:
        title = task['description']
    topic = news_topic_modeling_service_client.classify(title)
    task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
Example No. 16
def handle_message(msg):
    if msg is None or not isinstance(msg, dict) :
        return

    if ('userId' not in msg
        or 'newsId' not in msg
        or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})

    # If the model does not exist, create a new one
    if model is None:
        print 'Creating preference model for new user: %s' % userId
        new_model = {'userId' : userId}
        preference = {}
        for i in news_classes.classes:
            preference[i] = float(INITIAL_P)
        new_model['preference'] = preference
        model = new_model

    print 'Updating preference model for user: %s' % userId

    # Update model using time decaying method
    news = db[NEWS_TABLE_NAME].find_one({'digest': newsId})
    print(news)
    if (news is None
        or 'class' not in news
        or news['class'] not in news_classes.classes):
        print news is None
        if news is not None:
            print 'class' not in news
            print news.get('class') not in news_classes.classes
        print 'Skipping processing...'
        return

    click_class = news['class']

    # Update the clicked one.
    old_p = model['preference'][click_class]
    model['preference'][click_class] = float((1 - ALPHA) * old_p + ALPHA)

    # Update not clicked classes.
    for i, prob in model['preference'].iteritems():
        if not i == click_class:
            model['preference'][i] = float((1 - ALPHA) * model['preference'][i])

    db[PREFERENCE_MODEL_TABLE_NAME].replace_one({'userId': userId}, model, upsert=True)
Example No. 17
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        # Skip the first row; if any other entry in the first column exceeds the
        # threshold, the incoming article is a duplicate.
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    # MongoDB range queries on time need a datetime, so convert the publishedAt string to datetime
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: replace the document if it already exists, otherwise insert it
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
Example No. 18
def test_basic():
    """Test MongoDB client basically"""
    database = client.get_db('test')
    database.test.drop()
    assert database.test.count() == 0

    database.test.insert({'test': 1})
    assert database.test.count() == 1

    database.test.drop()
    assert database.test.count() == 0

    print('test_basic passed!')
def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range), this
        # will return all remaining news ids.
        sliced_news_digests = news_digests[begin_index:end_index]
        print sliced_news_digests
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {
                '$in': sliced_news_digests
            }}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([
            ('publishedAt', -1)
        ]).limit(NEWS_LIMIT))
        total_news_digests = map(lambda x: x['digest'], total_news)

        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    for news in sliced_news:
        # Remove text field to save bandwidth.
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
    return json.loads(dumps(sliced_news))
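
The slicing behaviour described in the comments above can be checked directly: with a batch size of 10 and 25 cached digests, page 3 returns the last five items and page 4 returns an empty list. The numbers below are made up for the illustration:

NEWS_LIST_BATCH_SIZE = 10                 # assumed batch size
digests = ['d%d' % i for i in range(25)]  # 25 cached digests

def page_slice(page_num):
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE
    return digests[begin_index:end_index]

print(len(page_slice(1)), len(page_slice(3)), len(page_slice(4)))  # -> 10 5 0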
Example No. 20
def test_basic():
    db = mongodb_client.get_db()
    db[PREFERENCE_MODEL_TABLE_NAME].delete_many({"userId": "test_user"})
    msg = {
        "userId": "test_user",
        "newsId": "test_news",
        "timestamp": str(datetime.utcnow())
    }
    click_log_processor.handle_message(msg)
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': 'test_user'})
    assert model is not None
    assert len(model['preference']) == NUM_OF_CLASSES
    print('test_basic passed!')
Example No. 21
def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    if redis_client.get(user_id) is not None:
        news_digest = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return an empty list;
        # if end_index is out of range (begin_index is within range), this will
        # return all remaining news ids.
        sliced_news_digests = news_digest[begin_index:end_index]
        print sliced_news_digests
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find({'digest': {'$in': sliced_news_digests}}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([('publishedAt', -1)]).limit(NEWS_LIMIT))
        total_news_digests = map(lambda x: x['digest'], total_news)

        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)

        sliced_news = total_news[begin_index:end_index]

    # Get preference for the user
    # preference = news_recommendation_service_client.getPreferenceForUser(user_id)
    # topPreference = None

    # if preference is not None and len(preference) > 0:
    #     topPreference = preference[0]
    for news in sliced_news:
        del news['text']
        if news['publishedAt'].date() == datetime.today().date():
            news['time'] = 'today'
    return json.loads(dumps(sliced_news))
Example No. 22
def logNewsClickForUser(user_id, news_id):

    LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/evvloemh"
    LOG_CLICKS_TASK_QUEUE_NAME = "LOG_CLICKS_TASK_QUEUE"
    cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL, LOG_CLICKS_TASK_QUEUE_NAME)
    message = {'userId': user_id, 'newsId': news_id, 'timestamp': datetime.utcnow()}

    db = mongodb_client.get_db()
    db[CLICK_LOGS_TABLE_NAME].insert(message)
    # Send log task to machine learning service for prediction
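    # Unlike the MongoDB insert above, the timestamp here is rebuilt as a string,
    # presumably because the queue message gets JSON-serialized and JSON cannot encode datetime objects.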
    message = {'userId': user_id, 'newsId': news_id, 'timestamp': str(datetime.utcnow())}

    cloudAMQP_client.sendMessage(message)
Example No. 23
def test_basic():
    db = client.get_db('test')
    # drop the test table to guarantee the database starts clean
    db.test.drop()
    assert db.test.count() == 0

    db.test.insert({'test': 1})
    assert db.test.count() == 1

    db.test.drop()
    assert db.test.count() == 0

    print('test_basic passed!')
Example No. 24
    def getPreferenceForUser(self, user_id):
        db = mongodb_client.get_db()
        model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'user_id': user_id})
        # no such user in the database
        if not model:
            return []
        sorted_tuples = sorted(model['preference'].items(), key=operator.itemgetter(1), reverse=True)
        sorted_list = [item[0] for item in sorted_tuples]
        sorted_value_list = [item[1] for item in sorted_tuples]
        # if the highest and lowest preferences are (nearly) equal, the ranking carries no signal
        if isclose(float(sorted_value_list[0]), float(sorted_value_list[-1])):
            return []
        return sorted_list
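
isclose is not defined in this snippet; it is presumably math.isclose (Python 3.5+) or a small helper along the following lines. This is only a sketch under that assumption, not the project's actual helper:

def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    # Relative/absolute tolerance comparison, mirroring the semantics of math.isclose.
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)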
Example No. 25
def test_basic():
    db = mongodb_client.get_db()
    db[PREFERENCE_MODEL_TABLE_NAME].delete_many({"userId": "test_user"})

    msg = {"userId": "test_user", "newsId": "test_news", "timestamp": str(datetime.utcnow())}

    click_log_processor.handle_message(msg)

    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': 'test_user'})
    assert model is not None
    assert len(model['preference']) == NUM_OF_CLASSES

    print 'test_basic passed!'
def handle_message(msg):
    if not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(msg['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicate news. Ignore!")
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    description = msg['description']
    if description is None:
        description = msg['title']

    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                    msg,
                                    upsert=True)
Example No. 27
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        # Uncomment these lines to call the machine learning server for the topic:
        # topic = news_topic_modeling_service_client.classify(title)
        # task['class'] = topic
        task['class'] = "Politics"

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
    LOGGING_NEWS_DEDUPER.info('[x] Insert %s into MongoDB' % (task['title']))
Example No. 28
def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE
    slice_news = []
    # Redis stores user_id -> list of news digests
    # bson's dumps() is used at the end to serialize the documents to JSON
    if redis_client.get(user_id):
        # pickle.loads() deserializes the cached string back into a Python object
        total_news_digest = pickle.loads(redis_client.get(user_id))
        slice_news_digest = total_news_digest[begin_index:end_index]
        db = mongodb_client.get_db()
        # wrap the cursor in a list so the tagging loop below does not exhaust it
        slice_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {
                '$in': slice_news_digest
            }}))
    else:
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME].find().sort([
            ('publishedAt', -1)
        ]).limit(NEWS_LIMIT))
        total_news_digest = [news['digest'] for news in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digest))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDES)
        slice_news = total_news[begin_index:end_index]
        # bson's dumps() converts the BSON documents to JSON in the return below
    preferences = news_recommandation_client.getPreferenceForUser(user_id)
    #add the topPreference & today news
    topPreference = None
    if preferences and len(preferences) > 0:
        topPreference = preferences[0]
    today_date = str(datetime.now().date())
    for news in slice_news:
        if 'class' in news and news['class'] == topPreference:
            news['reason'] = 'Recommend'
        if today_date in str(news['publishedAt']):
            news['time'] = 'today'
    return json.loads(dumps(slice_news))
Example No. 29
def test_basic():
    print('tst0 pass')
    db = client.get_db('tap-news')
    db.test.drop()
    assert db.test.count() == 0
    print('tst0 pass')
    db.test.insert({'test': 1})
    assert db.test.count() == 1
    print('tst1 pass')

    db.test.drop()
    assert db.test.count() == 0

    print('tst2 pass')
Example No. 30
def test_basic():
    """Test function"""
    try:
        database = client.get_db('test')
        database.test.drop()
        assert database.test.count() == 0
        database.test.insert({"test": 1})
        assert database.test.count() == 1
        database.test.drop()
        assert database.test.count() == 0
        print('test_basic passed.')
    except AssertionError as err:
        print("test failed!\n")
        raise err
Example No. 31
def test_handle_message_basic():
    db = mongodb_client.get_db()
    test_msg_1 = {
      "source": "Test 1",
      "title": "Test 1",
      "publishedAt": "2018-03-17T18:42:00Z",
      "digest":"test1",
      "text":"this is a test."
    }
    test_msg_2 = {
      "source": "Test 2",
      "title": "Test 2",
      "publishedAt": "2018-03-17T23:18:00Z",
      "digest":"test2",
      "text":"is this a test?"
    }
    test_msg_3 = {
      "source": "Test 3",
      "title": "Test 3",
      "publishedAt": "2018-03-17T23:18:00Z",
      "digest":"test3",
      "text":"this is a new test!"
    }
    none_message = None
    
    db[NEWS_TABLE_NAME].insert(test_msg_1)
    count = db[NEWS_TABLE_NAME].count()

    news_deduper.handle_message(none_message)
    count_1 = db[NEWS_TABLE_NAME].count()
    assert count_1 == count
    print("null check passed")

    news_deduper.handle_message(test_msg_1)
    count_2 = db[NEWS_TABLE_NAME].count()
    assert count_2 == count
    print("duplicate check passed")

    news_deduper.handle_message(test_msg_2)
    count_3 = db[NEWS_TABLE_NAME].count()
    assert count_3 == count

    news_deduper.handle_message(test_msg_3)
    count_4 = db[NEWS_TABLE_NAME].count()
    assert count_4 == count + 1
    print("handle_message test passed")

    db[NEWS_TABLE_NAME].remove({"title": "Test 1"})
    db[NEWS_TABLE_NAME].remove({"title": "Test 2"})
    db[NEWS_TABLE_NAME].remove({"title": "Test 3"})
Example No. 32
def handle_message(msg):
    # print('dedupter handling message', msg)
    if msg is None or not isinstance(msg, dict):
        print('News Deduper: message is broken')
        return

    task = msg
    if 'text' not in task or not task['text']:
        print('News Deduper: message does not contain text')
        return

    if 'publishedAt' not in task or not task['publishedAt']:
        raise NotContainPublishTimeError

    published_at = parser.parse(task['publishedAt'])
    day_begin = datetime.datetime(published_at.year,
                                  published_at.month,
                                  published_at.day,
                                  0, 0, 0, 0)
    day_end = day_begin + datetime.timedelta(days=1)

    news_collection = mongodb_client.get_db(DB_NAME).get_collection(COLLECTION_NAME)
    news_on_the_day = news_collection.find({
        'publishedAt': {'$gte': day_begin, '$lt': day_end}
    })

    documents = [task['text']]
    documents.extend(news['text'] for news in news_on_the_day)

    tf_idf = TfidfVectorizer().fit_transform(documents)
    similarity_matrix = tf_idf * tf_idf.T
    # print('News Deduper', similarity_matrix)

    num_rows = similarity_matrix.shape[0]
    if any(similarity_matrix[0, i] > NEWS_SIMILARITY_THRESHOLD for i in range(1, num_rows)):
        print('News Deduper: similar document, throw it away')
        return

    # reformat the published date
    task['publishedAt'] = published_at
    # print('putting into database', task)

    # TODO: feature extraction should be same in backfill procedure
    # TODO actually should set another queue for classification
    if 'title' in task:
        try:
            task['category'] = classifier_client.classify(task['title'])
        except Exception as e:
            print("News Deduper: failed to classify using the classifier client", e)
    news_collection.replace_one({'digest': task['digest']}, task, upsert=True)
Example No. 33
def basic_test():
    """Basic test"""
    database = client.get_db()
    database.news.drop()
    assert database.news.count() == 0
    news_deduper.handle_message(MESSAGE_1)
    assert database.news.count() == 1
    news_deduper.handle_message(MESSAGE_2)
    assert database.news.count() == 1
    news_deduper.handle_message(MESSAGE_3)
    assert database.news.count() == 1
    database.news.drop()
    assert database.news.count() == 0
    print('Basic test for deduper passed.')
Example No. 34
def logNewsClickForUser(user_id, news_id, user_agent, news_category):
    print '[logNewsClickForUser]\n'
    print 'user_id:', user_id
    print 'news_id:', news_id
    print 'user_agent:', user_agent
    print 'news_category:', news_category

    # signup

    # user_agent
    update_user_agent(user_agent)

    # news category
    update_news_category(news_category)

    message = {
        'userId': user_id,
        'newsId': news_id,
        'timestamp': datetime.utcnow()
    }

    db = mongodb_client.get_db()
    # save all log
    db[CLICK_LOGS_TABLE_NAME].insert(message)

    # save daily log
    day_click_logs_table_name = CLICK_LOGS_TABLE_NAME + datetime.today().strftime('_%Y-%m-%d')
    print 'table: ' + day_click_logs_table_name
    db[day_click_logs_table_name].insert(message)

    # count the number of clicks every hour
    update_hour_clicking_number()

    # update_daily_active_users
    update_daily_active_users(user_id)

    # update user freq
    update_daily_active_users_freq(user_id)

    # update item freq
    update_daily_active_news_freq(news_id)

    # Send log task to machine learning service for prediction
    message = {
        'userId': user_id,
        'newsId': news_id,
        'timestamp': str(datetime.utcnow())
    }
    cloudAMQP_client.send_message(message)
Example No. 35
def test_basic():
    # db = client.get_db('test')  # these two lines are equivalent
    db = client.get_db()['test']
    db.test.drop()

    assert db.test.count() == 0

    db.test.insert_one({'test': 1})
    assert db.test.count() == 1

    db.test.drop()
    assert db.test.count() == 0

    print('test basic pass')
Example No. 36
def getChatHistory(event_id):
    if event_id is None:
        print "None"
        return
    if redis_client.get(event_id) is not None:
        chat_list = pickle.loads(redis_client.get(event_id))
    else:
        db = mongodb_client.get_db()
        chat_list = list(db['chat'].find({
            "event_id": event_id
        }).sort([('time', -1)]))
        redis_client.set(event_id, pickle.dumps(chat_list))
        redis_client.expire(event_id, 60)  # 60s
    return json.loads(dumps(chat_list))
Example No. 37
def test_basic():

    # clear test documents
    db = client.get_db('test')
    db.test.drop()
    assert db.test.count() == 0

    db.test.insert({'test': 1})
    assert db.test.count() == 1

    db.test.drop()
    assert db.test.count() == 0

    print 'test_basic passed.'
Example No. 38
import os
import sys
# classify news in mongodb to a specific class
# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print count
        if 'class' not in news:
            print 'Populating classes...'
            description = news['description']
            if description is None:
                description = news['title']
            topic = news_topic_modeling_service_client.classify(description)
            news['class'] = topic
            db['news-test'].replace_one({'digest': news['digest']}, news, upsert=True)