def test_basic():
    """Round-trip smoke test for the CloudAMQP client.

    Publishes one small dict to the test queue, waits for the broker to
    route it, then pulls one message back and asserts it equals what was
    sent.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    sentMsg = {'test': 'demo'}
    client.sendMessage(sentMsg)
    # Give the broker time to deliver before polling.
    client.sleep(10)
    receivedMsg = client.getMessage()
    assert sentMsg == receivedMsg
    # Parenthesized print works on both Python 2 and 3 for a single argument.
    print('test_basic passed!')
# NOTE(review): fragment — the opening bracket and name of this news-source
# list live outside this chunk; only the tail of the literal is visible.
'techcrunch', 'the-new-york-times', 'the-wall-street-journal',
'the-washington-post']

# Redis acts as a first-level "already seen" filter keyed by title digest.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        # Title-based fingerprint. NOTE(review): bytes.encode('base64') is
        # Python 2 only — confirm the deployment interpreter.
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            # Remember this digest until the key expires so the same headline
            # is not re-queued. (redis-py stringifies the dict value.)
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.sendMessage(news)
    print "Fetched %d new news." % num_of_new_news
    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECOUNDS)
# Consumer side of the scrape queue: pull a news task, scrape the article
# text with newspaper's Article, and forward the enriched task to dedupe.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Scrape full article text for one news task and enqueue it for dedupe.

    msg is expected to be a dict with at least a 'url' key; anything else is
    reported and dropped.
    """
    if msg is None or not isinstance(msg, dict):
        # Parenthesized print works on both Python 2 and 3 for one argument.
        print('message is broken')
        return
    task = msg
    article = Article(task['url'])
    article.download()
    article.parse()
    print(article.text)
    task['text'] = article.text
    dedupe_news_queue_client.sendMessage(task)


while True:
    # fetch msg from queue
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # Handle message; best-effort — one bad task must not kill the
            # consumer loop, so report the error and continue.
            try:
                handle_message(msg)
            except Exception as e:
                print(e)
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
# NOTE(review): fragment — the enclosing handle_message(...) definition and
# the broken-message check guarding this early return are outside this chunk.
        print 'message from news_to_scrape is broken'
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    # use Newspaper to scrape the text of news
    task = msg
    text = None  # NOTE(review): never read afterwards — looks like a leftover
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    # send this news to mq
    dedupe_news_queue_client.sendMessage(task)
    logging.info('news_fetcher: news text scraped, loaded and sent to news_to_dedupe queue')

while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            logging.info('news_fetcher: news task aquired from news_to_scrape queue')
            try:
                handle_message(msg)
            except Exception as e:
                # Best-effort: report and keep consuming.
                print 'news_fetcher exception: %s' % e
                logging.warning('news_fetcher: exception: %s' % e)
                pass
        scrape_news_queue_client.sleep(scrape_sleeptime_seconds)
    if dedupe_news_queue_client is not None:
        # NOTE(review): unclear why the producer-side client sleeps here too —
        # confirm intent.
        dedupe_news_queue_client.sleep(dedupe_sleeptime_seconds)
# Poll the news API, drop headlines already seen (Redis digest filter), and
# enqueue each genuinely-new article as a scrape task.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCE)
    num_of_new_news = 0
    for news in news_list:
        # Title-based fingerprint. NOTE: bytes.encode('base64') is Python 2
        # only; under Python 3 this would need base64.b64encode instead.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news += 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # 2017-04-07T16:09:35Z format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
            # Remember this digest until the key expires so the same headline
            # is skipped next poll. (redis-py stringifies the dict value.)
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECOND)
            cloudAMQP_client.sendMessage(news)
    # Parenthesized print works on both Python 2 and 3 for a single argument.
    print("Fetched %d new news" % num_of_new_news)
    cloudAMQP_client.sleep(SLEEP_IN_SECOND)
# NOTE(review): fragment — this line closes a call (presumably
# CloudAMQPClient(..., SCRAPE_NEWS_TASK_QUEUE_NAME)) whose opening is outside
# this chunk.
SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    """Scrape the article text for one task and forward it to the dedupe queue."""
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    text = None  # NOTE(review): never read afterwards — looks like a leftover
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    dedupe_news_queue_client.sendMessage(task)


while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # Parse and process the task; errors are reported and swallowed so
            # the consumer loop keeps running.
            try:
                handle_message(msg)
            except Exception as e:
                print(e)
                pass
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
# NOTE(review): fragment — the `if` branch matching this `elif` and the
# enclosing handler definition are outside this chunk.
    elif description is not None:
        # Presumably falls back to classifying by description when the branch
        # above (title?) did not apply — TODO confirm against the missing `if`.
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic
    # Upsert keyed on digest so a re-processed task replaces its earlier copy.
    db[config['news_deduper']['NEWS_TABLE_NAME']].replace_one(
        {'digest': task['digest']}, task, upsert=True)

# Disabled structured-logging setup, kept as found:
# logging.basicConfig(level=logging.INFO,
#                     format='%(asctime)s %(filename)s%(message)s',
#                     datefmt='%a %d %b %Y %H:%M:%S' + ',',
#                     filename='../logging/news_pipeline.log',
#                     filemode='a')
# logging.info(', ' +
#              'event_name : ' + 'news_dedupe' + ', ' +
#              'queue_name : ' + str(config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_NAME']) + ', ' +
#              'news_id : ' + str(task['digest']))

while True:
    if cloudAMQP_client is not None:
        msg = cloudAMQP_client.getMessage()
        if msg is not None:
            # Parse and process the task
            try:
                handle_message(msg)
            except Exception as e:
                print e
                pass
        cloudAMQP_client.sleep(config['news_deduper']['SLEEP_TIME_IN_SECONDS'])
# Duplicated news. Ignore. print "Duplicated news. Ignore." return task['publishedAt'] = parser.parse(task['publishedAt']) # Classify news title = task['title'] if title is not None: topic = news_topic_modeling_service_client.classify(title) task['class'] = topic print "add one news" db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True) while True: if cloudAMQP_client is not None: msg = cloudAMQP_client.getMessage() if msg is not None: # Parse and process the task try: handle_message(msg) except Exception as e: print e pass cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
# NOTE(review): fragment — the enclosing handler (which builds pairwise_sim
# and binds `task`) is defined outside this chunk.
    # Row/column 0 is the incoming news; any later row above the threshold
    # means a near-duplicate is already stored.
    for row in range(1, rows):
        if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
            print 'duplicate news. ignore'
            return
    task['publishedAt'] = parser.parse(task['publishedAt'])
    # classify new news as it's being deduped.
    if task['description'] is None:
        task['description'] = task['title']
    if task['title'] is not None:
        topic = news_topic_modeling_service_client.classify(task['description'])
        task['class'] = topic
    # Upsert keyed on digest so reprocessing replaces the stored document.
    db[MONGODB_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)

while True:
    if dedupe_queue_client is not None:
        msg = dedupe_queue_client.getMessage()
        if msg is not None:
            # parse and proceed with task
            try:
                handle_message(msg)
            except Exception as e:
                print 'error while handling message in deduper: %s' % e
                pass
        dedupe_queue_client.sleep(SLEEP_TIMEOUT_IN_SECONDS)
# Scraper worker: consumes scrape tasks, downloads/parses the article with
# newspaper's Article, and forwards the enriched task to the dedupe queue.
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']
SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)


def handle_message(msg):
    """Scrape one news task's article text and enqueue it for dedupe.

    msg must be a dict carrying a non-empty 'url'; anything else is reported
    and dropped.
    """
    if msg is None or not isinstance(msg, dict):
        print('message is invalid')
        return
    task = msg
    # Robustness: a task without a URL cannot be scraped — skip it here
    # instead of raising KeyError inside the consumer loop.
    if not task.get('url'):
        print('message is invalid')
        return
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    dedupe_task_mq_client.sendMessage(task)


while True:
    if scrape_task_mq_client is not None:
        msg = scrape_task_mq_client.getMessage()
        if msg is not None:
            # Best-effort consumer: report the failure and move on.
            try:
                handle_message(msg)
            except Exception as ex:
                print(ex)
        scrape_task_mq_client.sleep(SLEEP_IN_SECONDS)
# if news['source'] != 'cnn': # print "News Source is not CNN, cannot handle!" # else: # print 'scrape cnn news' # text = CNN.extract_news_text(news['url']) # scraper news via newspaper API article = Article(news['url']) article.download() # == request.get article.parse() text = article.text.encode('utf-8') news_task['text'] = text DEDEUPER_MQ_CLIENT.send_message(news_task) print "[x] Sent msg to %s : %s" % (DEDUP_QUEUE_NAME, text) print news['url'] while True: if NEWS_TASK_MQ_CLIENT is not None: news = NEWS_TASK_MQ_CLIENT.receive_message() if news is not None: try: newsHanlder(news) except Exception as e: print "newsHanlder wrong"#coding=utf-8 pass NEWS_TASK_MQ_CLIENT.sleep(SLEEP_SECONDS)
# NOTE(review): fragment — the enclosing handler and the opening ''' of the
# disabled block below are outside this chunk; the text above the closing '''
# appears to be inert string content (commented-out source-specific scraping),
# not live code. Reconstruction of exact layout is uncertain.
        print 'scraping CNN news'
        text = cnn_news_scraper.extract_news(task['url'])
    else:
        print 'news source [%s] is not supported' % task['source']
    task['text'] = text
    '''
    # print 'message numbers:' + dedupe_news_queue_client.getMessage_count()
    dedupe_news_queue_client.sendMessage(task)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s%(message)s',
                        datefmt='%a %d %b %Y %H:%M:%S' + ',',
                        filename='../logging/news_pipeline.log',
                        filemode='a')
    logging.info(', ' +
                 'event_name : ' + 'get_news_text_from_source' + ', ' +
                 'queue_name : ' + str(config['news_fecher']['SCRAPE_NEWS_TASK_QUEUE_NAME']) + ', ' +
                 'news_id : ' + str(task['digest']))

while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            try:
                handle_message(msg)
            except Exception as e:
                # NOTE(review): bare `print` emits only a blank line — the
                # caught exception is discarded entirely.
                print
                pass
        scrape_news_queue_client.sleep(
            config['news_fecher']['SLEEP_TIME_IN_SECONDS'])
# Monitor loop configuration pulled from the news_monitor config section.
news_sources = news_monitor_config['news_sources']
news_timeout_seconds = int(news_monitor_config['news_timeout_seconds'])
sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds'])

while True:
    # Each poll returns the latest news tasks, but most of them are usually
    # old duplicates — the Redis digest filter below drops those.
    news_list = news_api_client.getNewsFromSource(news_sources)
    num_of_new_news = 0
    for news in news_list:
        # Title-based fingerprint. NOTE: bytes.encode('base64') is Python 2
        # only; under Python 3 this would need base64.b64encode instead.
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news += 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # Stamp missing publish times with "now" in UTC ISO-8601.
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
            # first level anti-duplicate by redis: only report new news tasks
            redis_client.set(news_digest, news)
            redis_client.expire(news_digest, news_timeout_seconds)
            cloudAMQP_client.sendMessage(news)
    # Parenthesized print works on both Python 2 and 3 for a single argument.
    print('Fetched %d news.' % num_of_new_news)
    if num_of_new_news != 0:
        logging.info('news_monitor: Fetched %d news and sent to news_to_scrape queue.' % num_of_new_news)
    cloudAMQP_client.sleep(sleeptime_seconds)
# NOTE(review): fragment — the enclosing msgHandler(...) definition (which
# builds `documents` and binds `msg_task`) is outside this chunk.
    print "~~~~~~~~~~~~~~ text -> documents success"
    # calculate the tfidf values
    tfidf = TfidfVectorizer().fit_transform(documents)
    # Pairwise similarity matrix between all documents in the batch.
    pairwise_sim = tfidf * tfidf.T
    print pairwise_sim.A
    rows, cols = pairwise_sim.shape
    # Row/column 0 is presumably the incoming news; any later row above the
    # threshold marks it as a near-duplicate — TODO confirm document order.
    for row in range(1, rows):
        if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
            print 'Warning~~~~~~ duplicated news'
            return
    msg_task['publishedAt'] = parser.parse(msg_task['publishedAt'])
    # Upsert keyed on digest so reprocessing replaces the stored document.
    db[NEWS_TABLE_NAME].replace_one({'digest': msg_task['digest']}, msg_task, upsert=True)

while True:
    if DEDEUPER_MQ_CLIENT is not None:
        msg = DEDEUPER_MQ_CLIENT.receive_message()
        if msg is not None:
            try:
                msgHandler(msg)
            except Exception as e:
                # Swallows the exception detail; only a fixed message is shown.
                print "msgHanlder wrong"
                pass
        DEDEUPER_MQ_CLIENT.sleep(SLEEP_SECONDS)
# NOTE(review): fragment — this line closes a call (presumably
# news_list = news_api_client.getNewsFromSource(...)) opened outside this
# chunk, inside the monitor's polling loop.
        config['news_monitor']['NEWS_SOURCES'])
    num_of_news_news = 0
    for news in news_list:
        # Title-based fingerprint. NOTE(review): bytes.encode('base64') is
        # Python 2 only — confirm the deployment interpreter.
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_news_news = num_of_news_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            # Marker value only; the digest key itself carries the identity.
            redis_client.set(news_digest, "True")
            redis_client.expire(
                news_digest, config['news_monitor']['NEWS_TIME_OUT_IN_SECONDS'])
            cloudAMQP_client.sendMessage(news)
    print "Fetched %d news." % num_of_news_news
    # NOTE(review): basicConfig inside the loop only takes effect on the first
    # iteration — later calls are no-ops.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        filename='NumbersOfNews.log')
    logging.info('We have %d news monitored' % num_of_news_news)
    cloudAMQP_client.sleep(config['news_monitor']['SLEEP_TIME_IN_SECONDS'])