def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) assert topic == "U.S." print('test_basic passed!') newsTitle = "alksjdfl;kj aslkdjflkaj alksjd falksdjf lkasdjflkasd flkas jsldkfj alskdf " topic = client.classify(newsTitle) newsTitle = "" topic = client.classify(newsTitle)
def handle_message(msg): print "handle_message from dedupe queue" if msg is None or not isinstance(msg, dict): system_log_client.logger.warn("Message is broken!") # print "Message is broken!" return #stop using str to convert from unicode to encoded text/bytes task = msg text = str(task['text'].encode('utf-8')) if text is None: return # get all recent news based on publishedAt published_at = parser.parse(task['publishedAt']) published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0) published_at_day_end = published_at_day_begin + datetime.timedelta(days=1) db = mongodb_client.get_db() recent_news_list = list(db[NEWS_TABLE_NAME].find({ 'publishedAt': { '$gte': published_at_day_begin, '$lt': published_at_day_end } })) if recent_news_list is not None and len(recent_news_list) > 0: documents = [ str(news['text'].encode('utf-8')) for news in recent_news_list ] documents.insert(0, text) # Calculate similarity matrix tfidf = TfidfVectorizer().fit_transform(documents) pairwise_sim = tfidf * tfidf.T print pairwise_sim.A rows, _ = pairwise_sim.shape for row in range(1, rows): if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD: # Duplicated news. Ignore. # print "Duplicated news. Ignore." system_log_client.logger.warn("Duplicated news. Ignore.") return task['publishedAt'] = parser.parse(task['publishedAt']) # Classify news # title = task['title'] # if title is None: topic = news_topic_modeling_service_client.classify(task['description']) task['class'] = topic db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg): """Save deduplicated news into MongoDB""" if msg is not isinstance(msg, dict): logging.error("[news_deduper] news is not dict") return task = msg text = task['text'] if text is None: logging.error("[news_deduper] text attribute is none") return # get all recent news based on publishedAt published_at = parser.parse(task['publishedAt']) published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0) published_at_day_end = published_at_day_begin + datetime.timedelta(days=1) database = mongodb_client.get_db() same_day_news_list = list(database[NEWS_TABLE_NAME].find({ 'publishedAt': { '$gte': published_at_day_begin, '$lt': published_at_day_end } })) # if it is not the first news if same_day_news_list is not None and len(same_day_news_list) > 0: # get main content documents = [news['text'] for news in same_day_news_list] # put it to front documents.insert(0, text) # calculate similarity matrix tfidf = TfidfVectorizer().fit_transform(documents) pairwise_sim = tfidf * tfidf.T rows, _ = pairwise_sim.shape # scan first column except first element for row in range(1, rows): if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD: logging.info("[news_deduper] ignore deduplicated news") return # insert back to mongodb task['publishedAt'] = parser.parse(task['publishedAt']) # classify news title = task['title'] if title is not None: topic = news_topic_modeling_service_client.classify(title) task['class'] = topic logging.info("[news_deduper] save news into MongoDB, title = %s", title) database[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    # check for None before encoding, since encode() would crash on None
    if task['text'] is None:
        return
    text = task['text'].encode('utf-8')

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'].encode('utf-8') for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim.A

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news; guard on the field that is actually classified
    description = task['description']
    if description is not None:
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    # check for None before encoding, since encode() would crash on None
    if task['text'] is None:
        return
    text = task['text'].encode('utf8')

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'].encode('utf8') for news in same_day_news_list]
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.debug("News_deduper : Duplicated news. Ignore")
                return

    # need to convert the string to a datetime when storing in MongoDB
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # if the same news already exists, replace it
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)

    # Send the metrics to Graphite
    metrics = 'news.' + task['source'] + '.' + task['class'].split(' ')[0]
    graphite.send(metrics, 1)
def handle_message(msg):
    if not isinstance(msg, dict):
        print('message is broken')
        return

    text = msg['text']
    if text is None:
        logger.warning('text is none')
        return

    published_at = parser.parse(msg['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # add tokenizer and stemmer
        tfidf = TfidfVectorizer(tokenizer=tokenize)
        processed_document = process_document(documents)
        tfs = tfidf.fit_transform(processed_document)
        pairwise_sim = tfs * tfs.T

        rows, _ = pairwise_sim.shape
        maxSim = -1
        for row in range(1, rows):
            if pairwise_sim[row, 0] > maxSim:
                maxSim = pairwise_sim[row, 0]
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicated news. Ignore. Similarity: %.6f" % pairwise_sim[row, 0])
                return
        logger.info("Found a new news item. Largest similarity: %.6f among %d news",
                    maxSim, len(same_day_news_list))
    else:
        logger.info("Found a new news item. No same-day news.")

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    description = msg['description']
    if description is None:
        description = msg['title']
    topic = news_topic_modeling_service_client.classify(description)
    # every new news item gets a class label before it is saved into the db
    msg['class'] = topic

    # update or insert
    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(msg['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                logger.info("Duplicated news. Ignore.")
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    description = msg['description']
    if description is None:
        description = msg['title']

    # before storing the news in the database, call
    # news_topic_modeling_service to give it a topic
    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # get all recent news based on publishedAt
    # parser: converts a string to a datetime
    published_at = parser.parse(msg['publishedAt'])
    published_at_begin = published_at - datetime.timedelta(days=1)
    published_at_end = published_at + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    # $gte: greater or equal, $lt: less than
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_begin,
            '$lt': published_at_end
        }
    }))

    # same as tf_idf_test.py
    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [news['text'] for news in recent_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        # we don't use the column count, so bind it to _
        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info('Duplicated news. Ignore.')
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify news
    description = msg['description']
    if description is None:
        description = msg['text']
    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
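# --- Hedged sketch: the two publishedAt windows used by these handlers ---
# Some variants query the calendar day of publication, others a 48-hour window
# centered on the publish time. Assumes python-dateutil is installed; the
# timestamp is an illustrative assumption.
import datetime
from dateutil import parser

published_at = parser.parse('2017-04-05T12:30:00Z')

# variant A: midnight-to-midnight on the publication day
day_begin = datetime.datetime(published_at.year, published_at.month,
                              published_at.day, 0, 0, 0, 0)
day_end = day_begin + datetime.timedelta(days=1)

# variant B: one day before to one day after the publish time
window_begin = published_at - datetime.timedelta(days=1)
window_end = published_at + datetime.timedelta(days=1)

print(day_begin, day_end)
print(window_begin, window_end)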
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = str(task['text'])
    if text is None:
        return

    # Get recent news from mongodb
    published_at = parser.parse(task['publishedAt'])
    print(published_at)
    published_at_day_begin = published_at - datetime.timedelta(days=1)
    print(published_at_day_begin)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=2)
    print(published_at_day_end)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))
    print(len(recent_news_list))

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [str(news['text']) for news in recent_news_list]
        documents.insert(0, text)

        # Calculate similarity
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(documents)
        pairwise_sim = X * X.T
        print(pairwise_sim.A)

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news
                print("Duplicated news, ignore")
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim

        rows, _ = pairwise_sim.shape
        # skip the first row; if any other entry in the first column exceeds
        # the threshold, the article is a duplicate
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    # MongoDB range queries on time need datetime values, so convert the
    # string to a datetime before storing
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: replace the record if it exists, otherwise insert it
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
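# --- Hedged sketch: the replace_one upsert that ends every handler ---
# With upsert=True, pymongo inserts the document when no record matches the
# digest filter and replaces it in place otherwise, so reprocessing the same
# news is idempotent. The URI, database name, and document are illustrative
# assumptions, not the project's real configuration.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')  # assumed local instance
db = client['test']
news = {'digest': 'abc123', 'title': 'demo news', 'class': 'World'}
db['news'].replace_one({'digest': news['digest']}, news, upsert=True)
print(db['news'].find_one({'digest': 'abc123'}))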
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongo_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > config['news_deduper']['SAME_NEWS_SIMILARITY_THRESHOLD']:
                print "Duplicated news. Ignore."
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_messages(self, msg):
    print "handle message from dedupe queue"
    if msg is None or not isinstance(msg, dict):
        print "message is broken"
        return False

    task = msg
    # check for None before encoding, since encode() would crash on None
    if task['text'] is None:
        return False
    text = task['text'].encode('utf-8')

    # Get all recent news
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    recent_news_list = list(self.db[self.collection].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))
    print "get recent news list"

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [news['text'].encode('ascii', 'ignore') for news in recent_news_list]
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim.A

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > self.sameNewsThreshold:
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return False

    task['publishedAt'] = parser.parse(task['publishedAt'])

    title = task['title'].encode('ascii', 'ignore')
    source = task['source'].encode('ascii')
    url = task['url'].encode('ascii')
    print title
    print source
    print url

    if title is not None:
        # try the url-based classifier first, then fall back to the
        # topic modeling service
        topic = classifier.classify(source, url)
        if topic is not None:
            print "Get topic %s by url" % topic
        else:
            topic = news_topic_modeling_service_client.classify(title)
            print "Learn topic %s by ml" % topic
        task['class'] = topic

    self.db[self.collection].replace_one({'digest': task['digest']}, task, upsert=True)
    return True
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task['text']
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    # convert the cursor returned from mongodb to a list
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [str(news['text']) for news in same_day_news_list]
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim.A)

        rows, _ = pairwise_sim.shape
        # check each row
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print("Duplicated news. Ignore.")
                return

    # add news into mongodb; before storing, convert the published time to
    # MongoDB's datetime type so that we can query the news by publishedAt
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: if no matching news is found, insert; otherwise replace
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print 'Duplicated news. Ignore.'
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    description = task['description']
    if description is not None:
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # get all recent news based on publishedAt (all the news in a short time window)
    published_at = parser.parse(msg['publishedAt'])
    published_at_begin = published_at - datetime.timedelta(days=1)
    published_at_end = published_at + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_begin,
            '$lt': published_at_end
        }
    }))

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [news['text'] for news in recent_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        # check whether any news pair is over the similarity threshold
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # duplicate news, skip
                logger.info("Duplicated news. Ignore.")
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify news
    description = msg['description']
    if description is None:
        description = msg['title']
    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
def handle_message(message):
    if message is None or not isinstance(message, dict):
        print 'Invalid message!'
        return

    task = message
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_start_time = datetime.datetime(published_at.year, published_at.month, published_at.day)
    published_end_time = published_start_time + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[DB_COLLECTION_NAME].find({
        'publishedAt': {
            '$gte': published_start_time,
            '$lt': published_end_time
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print pairwise_sim

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > DUPLICATE_THRESHOLD:
                print 'Duplicate news!'
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    title = task['title']
    if title is None:
        title = task['description']
    topic = news_topic_modeling_service_client.classify(title)
    task['class'] = topic

    db[DB_COLLECTION_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    text = msg['text']
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(msg['publishedAt'])
    published_at_begin = published_at - datetime.timedelta(days=1)
    published_at_end = published_at + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_begin,
            '$lt': published_at_end
        }
    }))

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [news['text'] for news in recent_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        logger.debug('Pairwise sim:%s', str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info('Duplicated news. Ignore.')
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify news
    description = msg['description']
    if description is None:
        description = msg['text']
    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client as modeling_client

# modeling classes of all news in mongodb
if __name__ == "__main__":
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating news...')
            description = news['description']
            if description is None:
                description = news['title']
            topic = modeling_client.classify(description)
            news['class'] = topic
            db['news'].replace_one({'digest': news['digest']}, news, upsert=True)
def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) assert topic is not None print('test_basic passed!')
def test_basic(): newsTitle = "Technology" topic = client.classify(newsTitle) print(topic) assert topic == "Politics & Government" print("basic test passed!")
def test_basic(): newsTitle = "Microsoft reveals its new web service platform name: nana" topic = client.classify(newsTitle) assert topic == "U.S." print('test_basic passed!')
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        logging.error('news_monitor: message from news_to_dedupe is broken')
        return

    task = msg
    # check for None before encoding, since encode() would crash on None
    if task['text'] is None:
        return
    text = task['text'].encode('utf-8')

    # get the same-day time range where duplicated news could appear
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    # get the suspected duplicates in that time range
    db = mongodb_client.get_db()
    same_day_news_list = list(db[news_table_name].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    # calculate the similarity matrix against these suspected duplicates and
    # ignore the news if it is too similar to any of them
    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'].encode('utf-8') for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print 'pairwise_sim calculated'

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > similarity_threshold:
                print 'Duplicated news. Ignore'
                logging.info('news_deduper: news ignored as duplicated')
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])
    # save the raw news first; the record is replaced again below once classified
    db[news_table_name].replace_one({'digest': task['digest']}, task, upsert=True)

    # Classify news:
    # text preprocessing
    title = task['title']
    desc = task['description']
    src = task['source']
    info = title + ' ' + desc + ' ' + src
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    info = info.translate(remove_punctuation_map)
    info = info.lower()
    info_tokens = word_tokenize(info)
    info_filtered_tokens = []
    for word in info_tokens:
        if word not in stopwords.words('english'):
            info_filtered_tokens.append(word)
    info_filtered_line = ' '.join(info_filtered_tokens)

    # send the cleaned text to the topic modeling service for a class
    # prediction and tag this news with the answered class
    if info_filtered_line is not None:
        topic = news_topic_modeling_service_client.classify(info_filtered_line)
        task['class'] = topic

    # news fully processed, insert into db
    db[news_table_name].replace_one({'digest': task['digest']}, task, upsert=True)
    logging.info('news_deduper: news classified and stored')
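# --- Hedged sketch: the NLTK preprocessing step used above, standalone ---
# Assumes nltk is installed with the 'punkt' and 'stopwords' data downloaded
# (nltk.download('punkt'); nltk.download('stopwords')); the sample sentence
# is an illustrative assumption.
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

info = "Pentagon might propose ground troops for Syria!"
# strip punctuation, lowercase, tokenize, then drop English stopwords
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
info = info.translate(remove_punctuation_map).lower()
tokens = [w for w in word_tokenize(info) if w not in stopwords.words('english')]
print(' '.join(tokens))  # stopwords such as 'for' are removed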
def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) assert topic == "World", 'but topic is %s' % topic print('test_basic passed!')
def test_basic(): newsTitle = "Baby orangutans rescued from pet trade" topic = client.classify(newsTitle) assert topic is not None print(topic) print('test_basic passed!')
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating classes...')
            description = news['description']
            if description is None:
                description = news['title']
            topic = news_topic_modeling_service_client.classify(description)
            news['class'] = topic
            db['news'].replace_one({'digest': news['digest']}, news, upsert=True)
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    db = mongodb_client.get_db()
    cursor = db['test-news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print count
        # only classify news that has no class yet
        if 'class' not in news:
            print 'Populating classes...'
            title = news['title']
            if title is None:
                title = news['description']
            topic = news_topic_modeling_service_client.classify(title)
            news['class'] = topic
            db['test-news'].replace_one({'digest': news['digest']}, news, upsert=True)
def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) assert topic == "Politics & Government" print('test_basic passed!')
def handle_message(msg):
    # if msg is not a dict / JSON-like object
    if not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # get midnight of the day the news was published, and set the range from
    # then until one day later
    published_at = parser.parse(msg['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    # with that range, find all the news in MongoDB that fall inside it, as a
    # list. Note: MongoDB stores the time as a datetime, not a string; if it
    # were a string, $gte and $lt would not know how to compare it
    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        # for each news item in same_day_news_list, pull out its text to build
        # a new list
        documents = [news['text'] for news in same_day_news_list]
        # put the document being compared in the first position
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T  # yields an N*N matrix
        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicated news. Ignore.")
                return

    # if the new message does not duplicate any message from that day, treat
    # it as new and store it in MongoDB
    msg['publishedAt'] = parser.parse(msg['publishedAt'])  # store as a datetime

    description = msg['description']
    if description is None:
        description = msg['title']
    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    # upsert=True: insert if absent, replace if present (upsert = update + insert)
    print('saving to mongoDB')
    try:
        db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
    except Exception as e:
        logger.warning(e)
import os
import sys

# classify news in mongodb into a specific class
# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print count
        if 'class' not in news:
            print 'Populating classes...'
            description = news['description']
            if description is None:
                description = news['title']
            topic = news_topic_modeling_service_client.classify(description)
            news['class'] = topic
            db['news-test'].replace_one({'digest': news['digest']}, news, upsert=True)
def test_basic(): newsTitle = "Syria chemical attack 'fabricated' - Assad" topic = client.classify(newsTitle) assert topic == "World" print 'test_basic passed!'
def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) #assert topic == "U.S." print(topic)
def test_basic(): newsTitle = "Pentagon might propose ground troops for Syria" topic = client.classify(newsTitle) assert topic == "Politics & Government" print 'test_basic passed!'