def test_basic():
    """Round-trip smoke test for the CloudAMQP client.

    Publishes a message to the test queue, waits for the broker to route
    it, reads it back, and asserts the payload survived the trip intact.
    Requires CLOUDAMQP_URL and TEST_QUEUE_NAME to be defined at module level.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    sent_msg = {'test': 'demo'}
    client.sendMessage(sent_msg)
    # Give the broker time to route the message before polling it back.
    client.sleep(10)
    received_msg = client.getMessage()
    assert sent_msg == received_msg
    print('test_basic passed!')
def handle_message(msg):
    """Deduplicate one news task against recent news, then upsert it into MongoDB.

    NOTE(review): this chunk lost its original opening lines in the paste —
    the code that derived `text` from the task and built `documents` (the
    texts of recently stored news) is missing. The function header below is
    reconstructed so the visible tail stays syntactically valid; restore the
    lost prefix before running. TODO confirm against the original file.
    """
    task = msg  # reconstructed bridge — the lost prefix presumably did this; TODO confirm
    # `documents` / `text` come from the lost prefix: documents is expected to
    # hold recent news texts, and the candidate text is prepended at index 0.
    documents.insert(0, text)
    # Calculate the pairwise TF-IDF cosine-similarity matrix; row 0 is the
    # candidate, so pairwise_sim[row, 0] compares each stored doc against it.
    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_sim = tfidf * tfidf.T
    print(pairwise_sim.A)
    rows, _ = pairwise_sim.shape
    for row in range(1, rows):
        if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
            # Duplicated news. Ignore.
            print("Duplicated news. Ignore.")
            return
    # Normalize the timestamp string into a datetime before storage.
    task['publishedAt'] = parser.parse(task['publishedAt'])
    # Upsert keyed by content digest: replace if it exists, insert otherwise.
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)


while True:
    if cloudAMQP_client is not None:
        msg = cloudAMQP_client.getMessage()
        if msg is not None:
            # Parse and process the task; a bad task must not kill the worker.
            try:
                handle_message(msg)
            except Exception as e:
                print(e)
        # Throttle polling so an empty queue does not busy-spin.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def handle_message(msg):
    """Scrape the article text for one news task and forward it to the dedupe queue.

    NOTE(review): the original guard condition on `msg` was lost in the paste;
    a None/dict check is the conventional guard here — TODO confirm against
    the original file.
    """
    if msg is None or not isinstance(msg, dict):
        print('message from news_to_scrape is broken')
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    # Use Newspaper to scrape the text of the news article.
    task = msg
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    # Send the enriched task on to the dedupe queue.
    dedupe_news_queue_client.sendMessage(task)
    logging.info('news_fetcher: news text scraped, loaded and sent to news_to_dedupe queue')


while True:
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # (sic: "aquired" — message text kept byte-identical)
            logging.info('news_fetcher: news task aquired from news_to_scrape queue')
            # A single bad task must not kill the worker: log and continue.
            try:
                handle_message(msg)
            except Exception as e:
                print('news_fetcher exception: %s' % e)
                logging.warning('news_fetcher: exception: %s' % e)
        scrape_news_queue_client.sleep(scrape_sleeptime_seconds)
    if dedupe_news_queue_client is not None:
        # Keep the dedupe-queue connection alive between publishes.
        dedupe_news_queue_client.sleep(dedupe_sleeptime_seconds)
def handle_message(msg):
    """Deduplicate, classify, and store one news task.

    NOTE(review): this chunk lost its original opening lines in the paste —
    the code that built `task`, `pairwise_sim`, and `rows` (the TF-IDF
    similarity of the task's text against recent news) is missing. The header
    below is reconstructed so the visible tail stays syntactically valid;
    restore the lost prefix before running. TODO confirm.
    """
    task = msg  # reconstructed bridge — the lost prefix presumably did this; TODO confirm
    # Row 0 of the similarity matrix is the candidate text itself, so any
    # other row exceeding the threshold means we already stored this story.
    for row in range(1, rows):
        if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
            # Duplicated news. Ignore.
            print("Duplicated news. Ignore.")
            return
    # Normalize the timestamp string into a datetime before storage.
    task['publishedAt'] = parser.parse(task['publishedAt'])
    # Classify the news topic from its title via the topic-modeling service.
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic
    # Upsert keyed by content digest: replace if it exists, insert otherwise.
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)


while True:
    if cloudAMQP_client is not None:
        msg = cloudAMQP_client.getMessage()
        if msg is not None:
            # Parse and process the task; a bad task must not kill the worker.
            try:
                handle_message(msg)
            except Exception as e:
                print(e)
        # Throttle polling so an empty queue does not busy-spin.
        cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)
def handle_message(msg):
    """Attach the scraped article text to a news task and forward it to dedupe.

    NOTE(review): this chunk lost its original opening lines in the paste —
    the code that built `task` from the message and produced a parsed
    `article` (the `.cleaned_text` attribute suggests a goose-style extractor)
    is missing. The header below is reconstructed so the visible tail stays
    syntactically valid; restore the lost prefix before running. TODO confirm.
    """
    task = msg  # reconstructed bridge — the lost prefix presumably did this; TODO confirm
    task['text'] = article.cleaned_text
    # (Removed: a large commented-out block of dead CNN-specific scraper code.)
    dedupe_news_queue_client.sendMessage(task)


while True:
    # Fetch a task from the scrape queue.
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # Handle the message; a bad task must not kill the worker.
            try:
                handle_message(msg)
            except Exception as e:
                print(e)
        # Throttle polling so an empty queue does not busy-spin.
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
def handle_message(msg):
    """Deduplicate one news task against recent news via TF-IDF, then upsert it.

    NOTE(review): this chunk lost its original opening lines in the paste —
    the code that built `recent_news_list` (recently stored news from the DB)
    and derived `text` from the task is missing. The header below is
    reconstructed so the visible tail stays syntactically valid; restore the
    lost prefix before running. TODO confirm.
    """
    task = msg  # reconstructed bridge — the lost prefix presumably did this; TODO confirm
    documents = [str(news['text']) for news in recent_news_list]
    # Put the candidate text at index 0 so row 0 of the similarity matrix
    # compares it against every recently stored document.
    documents.insert(0, text)
    # Calculate TF-IDF cosine similarity of the new text against recent news.
    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_sim = tfidf * tfidf.T
    print(pairwise_sim.A)
    rows = pairwise_sim.shape[0]
    for row in range(1, rows):
        if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
            # Duplicate news found — skip storage.
            print('Duplicate news. Ignore')
            return
    # Normalize the timestamp string into a datetime before storage.
    task['publishedAt'] = parser.parse(task['publishedAt'])
    # Replace if it exists, else insert (keyed by content digest).
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)


while True:
    if dedupe_news_queue_client is not None:
        msg = dedupe_news_queue_client.getMessage()
        if msg is not None:
            # Parse and process the task; a bad task must not kill the worker.
            try:
                handle_message(msg)
            except Exception as e:
                print('handle_message error:', e)
        # Throttle polling so an empty queue does not busy-spin.
        dedupe_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
# Queue configuration (config, the *_QUEUE_URL/NAME constants, CloudAMQPClient,
# and Article are defined/imported earlier in the file).
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']
SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)


def handle_message(msg):
    """Scrape the article text for one news task and forward it to the dedupe queue.

    Silently skips (with a console note) messages that are None or not dicts.
    """
    if msg is None or not isinstance(msg, dict):
        print('message is invalid')
        return
    task = msg
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    dedupe_task_mq_client.sendMessage(task)


def _run():
    """Consumer loop: poll the scrape queue and process each task forever."""
    while True:
        if scrape_task_mq_client is not None:
            msg = scrape_task_mq_client.getMessage()
            if msg is not None:
                # A single bad task must not kill the worker: log and continue.
                try:
                    handle_message(msg)
                except Exception as ex:
                    print(ex)
            # Throttle polling so an empty queue does not busy-spin.
            scrape_task_mq_client.sleep(SLEEP_IN_SECONDS)


if __name__ == '__main__':
    _run()
# Queue configuration (config, the LOG_GRAPHITE_* constants, CloudAMQPClient,
# statsd, and log_client are defined/imported earlier in the file).
SLEEP_TIME_IN_SECONDS = config['operations']['SLEEP_TIME_IN_SECONDS']

graphitelog_cloudAMQP_client = CloudAMQPClient(LOG_GRAPHITE_TASK_QUEUE_URL, LOG_GRAPHITE_TASK_QUEUE_NAME)


def handle_message(msg):
    """Bump the statsd counter named by the message payload by one.

    The payload itself is used as the counter name — presumably a metric
    key string; verify against the producer. TODO confirm.
    """
    if msg is None:
        log_client.logger.info('message is broken')
        return
    counter = statsd.Counter(msg)
    counter += 1


while True:
    # Fetch a log message from the queue.
    if graphitelog_cloudAMQP_client is not None:
        msg = graphitelog_cloudAMQP_client.getMessage()
        if msg is not None:
            # A single bad message must not kill the monitor: log and continue.
            try:
                handle_message(msg)
            except Exception as e:
                log_client.logger.error(str(e))
        # BUG FIX(review): this sleep was commented out in the original,
        # leaving a busy-wait loop that hammers the broker at full CPU when
        # the queue is empty — restored. Confirm the throttle is wanted.
        graphitelog_cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)