class NewsFetcher():
    """Consume scrape tasks from the scrape queue, download each article's
    full text, and forward the enriched task to the dedupe queue."""

    def __init__(self):
        # Queue endpoints and the polling interval come from the shared
        # JSON config file.
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
        self.dedupe_news_task_queue_url = data['queue']['dedupeNewsTaskQueueUrl']
        self.dedupe_news_task_queue_name = data['queue']['dedupeNewsTaskQueueName']
        self.scrape_news_task_queue_url = data['queue']['scrapeNewsTaskQueueUrl']
        self.scrape_news_task_queue_name = data['queue']['scrapeNewsTaskQueueName']
        self.sleep_time_in_seconds = int(data['queue']['fetchNewsTaskSleepTime'])

    def handle_message(self, msg):
        """Download and parse the article at msg['url'], attach the extracted
        body under 'text', and publish the task to the dedupe queue.

        Broken (non-dict) messages are logged and dropped.
        """
        # isinstance(None, dict) is False, so one check covers both the None
        # and wrong-type cases the original tested separately.
        if not isinstance(msg, dict):
            # Parenthesized single-arg print is valid Python 2 and 3.
            print("message is broken")
            return
        task = msg
        article = Article(task['url'])
        article.download()
        article.parse()
        task['text'] = article.text
        self.dedupe_news_queue_client.sendMessage(task)

    def __call__(self):
        """Poll the scrape queue until it is drained, handling one message per
        iteration; close both queue connections before returning."""
        self.dedupe_news_queue_client = CloudAMQPClient(
            self.dedupe_news_task_queue_url, self.dedupe_news_task_queue_name)
        self.scrape_news_queue_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)

        # Guard clause instead of wrapping the whole loop in an if-block.
        if self.scrape_news_queue_client is None:
            return

        while True:
            msg = self.scrape_news_queue_client.getMessage()
            if msg is None:
                # Queue drained: release both connections and stop.
                self.scrape_news_queue_client.close()
                self.dedupe_news_queue_client.close()
                break
            try:
                self.handle_message(msg)
            except Exception as e:
                # Best-effort: a single bad article must not kill the worker.
                print(e)
            self.scrape_news_queue_client.sleep(self.sleep_time_in_seconds)
class NewsMonitor:
    """Poll the news API for fresh headlines, dedupe them against Redis by
    MD5 title digest, and enqueue new ones onto the scrape queue."""

    def __init__(self):
        # Queue, Redis, and news-source settings come from the shared config.
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
        self.scrape_news_task_queue_url = data['queue']['scrapeNewsTaskQueueUrl']
        self.scrape_news_task_queue_name = data['queue']['scrapeNewsTaskQueueName']
        self.redis_server_host = data['redis']['redisServerHost']
        self.redis_server_port = int(data['redis']['redisServerPort'])
        self.news_timeout_redis_in_seconds = int(
            data['redis']['newsMonitorExpireInSeconds'])
        self.news_sources = list(data['newsApi']['source'])

    def __call__(self):
        """Run one monitoring pass: fetch headlines, skip ones already seen in
        Redis, publish the new ones, and report the new/old counts."""
        self.redis_client = redis.StrictRedis(self.redis_server_host,
                                              self.redis_server_port)
        self.cloudAMQP_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)

        news_list = news_api_client.getNewsFromSource(self.news_sources)
        print("call news monitor")

        num_of_new_news = 0
        num_of_old_news = 0
        for news in news_list:
            # The title digest is the dedupe key. NOTE(review): the 'base-64'
            # codec is Python 2 only and appends a trailing newline; kept
            # as-is because changing the encoding would invalidate keys
            # already stored in Redis.
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).digest().encode('base-64')
            if self.redis_client.get(news_digest) is None:
                num_of_new_news += 1
                news['digest'] = news_digest
                if news['publishedAt'] is None:
                    # Bug fix: the format string used to end in 'ZZ', which
                    # produced a doubled literal Z ("...:00ZZ"). A single 'Z'
                    # is the correct UTC designator.
                    news['publishedAt'] = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%dT%H:%M:%SZ')
                # Remember the digest (with a TTL) so later passes skip it,
                # then hand the story to the scrape queue.
                self.redis_client.set(news_digest, news)
                self.redis_client.expire(news_digest,
                                         self.news_timeout_redis_in_seconds)
                self.cloudAMQP_client.sendMessage(news)
            else:
                num_of_old_news += 1

        print("Fetched %d new news. %d old news in redis" %
              (num_of_new_news, num_of_old_news))
        self.cloudAMQP_client.close()
class NewsDeduper:
    """Consume scraped articles from the dedupe queue, drop near-duplicates of
    same-day news via TF-IDF cosine similarity, classify the survivors, and
    upsert them into MongoDB keyed by digest."""

    def __init__(self):
        # Queue, MongoDB, and similarity-threshold settings from the config.
        with open(CONFIG_FILE, 'r') as f:
            data = json.load(f)
        self.dedupe_news_task_queue_url = data['queue']['dedupeNewsTaskQueueUrl']
        self.dedupe_news_task_queue_name = data['queue']['dedupeNewsTaskQueueName']
        self.sleep_time_in_seconds = int(data['queue']['dedupeNewsTaskSleepTime'])
        self.collection = data['mongoDb']['newsMongoDbCollection']
        self.sameNewsThreshold = float(data['newsDedupe']['sameNewsThreshold'])

    def handle_messages(self, msg):
        """Return True if msg was stored as unique news, False if it was
        rejected (broken message, missing text, or duplicate)."""
        print("handle message from dedupe queue")
        if not isinstance(msg, dict):
            print("message is broken")
            return False
        task = msg

        # Bug fix: the original encoded first and then tested the (never-None)
        # encoded result, so a task without text crashed with AttributeError
        # instead of being rejected. Check before encoding.
        if task.get('text') is None:
            return False
        text = task['text'].encode('utf-8')

        # Compare only against news published on the same calendar day.
        published_at = parser.parse(task['publishedAt'])
        day_begin = datetime.datetime(published_at.year, published_at.month,
                                      published_at.day, 0, 0, 0, 0)
        day_end = day_begin + datetime.timedelta(days=1)
        recent_news_list = list(self.db[self.collection].find(
            {'publishedAt': {'$gte': day_begin, '$lt': day_end}}))
        print("get recent news list")

        if recent_news_list:
            documents = [news['text'].encode('ascii', 'ignore')
                         for news in recent_news_list]
            # Row 0 is the incoming article; pairwise_sim[row, 0] is its
            # cosine similarity to each stored article.
            documents.insert(0, text)
            tfidf = TfidfVectorizer().fit_transform(documents)
            pairwise_sim = tfidf * tfidf.T
            print(pairwise_sim.A)
            rows, _ = pairwise_sim.shape
            for row in range(1, rows):
                if pairwise_sim[row, 0] > self.sameNewsThreshold:
                    print("Duplicated news. Ignore.")
                    return False

        task['publishedAt'] = parser.parse(task['publishedAt'])
        title = task['title'].encode('ascii', 'ignore')
        source = task['source'].encode('ascii')
        url = task['url'].encode('ascii')
        print(title)
        print(source)
        print(url)

        if title is not None:
            # Prefer the cheap URL/source rule-based classifier; fall back to
            # the ML topic-modeling service only when it cannot decide.
            topic = classifier.classify(source, url)
            if topic is not None:
                print("Get topic %s by url" % topic)
            else:
                topic = news_topic_modeling_service_client.classify(title)
                print("Learn topic %s by ml" % topic)
            task['class'] = topic

        # Upsert keyed on the digest so reprocessing the same story is
        # idempotent.
        self.db[self.collection].replace_one({'digest': task['digest']},
                                             task, upsert=True)
        return True

    def __call__(self):
        """Run the dedupe worker loop, counting unique stories stored."""
        self.cloudAMQP_client = CloudAMQPClient(self.dedupe_news_task_queue_url,
                                                self.dedupe_news_task_queue_name)
        self.db = mongodb_client.get_db()

        num_unique_news = 0
        while True:
            if self.cloudAMQP_client is None:
                # Bug fix: the original called self.cloudAMQP_client.close()
                # in this branch, which would raise AttributeError on None.
                # Report the tally and stop instead.
                print("Store %d unique news in mongoDb" % num_unique_news)
                break
            msg = self.cloudAMQP_client.getMessage()
            if msg is not None:
                try:
                    if self.handle_messages(msg):
                        num_unique_news += 1
                    else:
                        print("invalid msg")
                except Exception as e:
                    # A single bad task must not kill the worker.
                    print(e)
            self.cloudAMQP_client.sleep(self.sleep_time_in_seconds)