tfidf = TfidfVectorizer().fit_transform(documents) pairwise_sim = tfidf * tfidf.T rows, cols = pairwise_sim.shape for row in range(1, rows): if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD: print('duplicate news.Ignore') return msg['publishedAt'] = published_at db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True) else: msg['publishedAt'] = published_at db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True) while True: if cloudAMQP_client: msg = cloudAMQP_client.receiveMessage() if msg: try: handle_mesage(msg) except Exception as e: print(e) cloudAMQP_client.sleep(SLEEP_TIME_IN_SECOND)
def handle_message(msg):
    """Attach the article text to a news message and forward it.

    Builds an ``Article`` from ``msg['url']``, downloads and parses it,
    stores ``article.text`` under ``msg['text']`` and sends the updated
    message through ``fecth_news_queue_client``.

    Args:
        msg: The message taken from the queue. A falsy value or anything
            that is not a ``dict`` is reported and dropped.

    Returns:
        None.

    Raises:
        KeyError: If ``msg`` has no ``'url'`` entry. Errors raised by the
            ``Article`` calls or by ``sendMessage`` are not caught here.
    """
    if not msg or not isinstance(msg, dict):
        print('msg in broken')
        return

    # Download and parse the article the message points at.
    article = Article(msg['url'])
    article.download()
    article.parse()
    msg['text'] = article.text

    # Hand the enriched message to the next queue.
    fecth_news_queue_client.sendMessage(msg)


# Poll the scraper queue forever, handling one message per iteration.
while True:
    if scraper_news_queue_client:
        msg = scraper_news_queue_client.receiveMessage()
        if msg:
            try:
                handle_message(msg)
            except Exception as e:
                # A single failing message must not stop the worker loop.
                print(e)
        fecth_news_queue_client.sleep(SLEEP_TIME_IN_SECOND)
] #redis REDIS_HOST = 'localhost' REDIS_PORT = 6379 redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT) #AMQP_client QUEUE_URL = "amqp://*****:*****@termite.rmq.cloudamqp.com/svowqrcq" QUEUE_NAME = "news-test" cloudAMQP_client = CloudAMQPClient(QUEUE_URL, QUEUE_NAME) #while while True: news_list = news_api_client.getNews(NEWS_SOURCES) number_of_news = 0 for news in news_list: #redis to prevent duplicate #use md5 for title news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest() if not redis_client.get(news_digest): number_of_news = number_of_news + 1 news['digest'] = news_digest redis_client.set(news_digest, 'hh') #set expire time redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS) #send message to queue cloudAMQP_client.sendMessage(news) print('%s number of news' % number_of_news) #sleep cloudAMQP_client.sleep(SLEEP_TIME_TASK_SECONDS)