Exemplo n.º 1
0
def test_basic():
    """Send one message to the test queue and check the same payload is read back."""
    queue_client = RabbitMQClient(HOST, TEST_QUEUE_NAME)

    outgoing = {'test': 'demo'}
    queue_client.sendMessage(outgoing)

    # Pause through the client before reading back
    # (presumably gives the broker time to deliver the message -- confirm).
    queue_client.sleep(10)

    incoming = queue_client.getMessage()
    assert outgoing == incoming

    print('test_basic passed!')
Exemplo n.º 2
0
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Dupilicated news
                print("Dupilicated news, ignore")
                return

    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)


# Consumer loop: keep pulling messages from the dedupe queue and pass each one
# to handle_message().
while True:
    # Without a queue client there is nothing to poll.
    # NOTE(review): this path restarts the loop immediately, with no sleep --
    # confirm a busy loop is acceptable when the client is missing.
    if dedupe_news_queue_client is None:
        continue

    msg = dedupe_news_queue_client.getMessage()
    if msg is not None:
        try:
            handle_message(msg)
        except Exception as e:
            # Best effort: report the failure and carry on with the next message.
            print(e)

    # Wait through the client before polling again (also when no message arrived).
    dedupe_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
        print("message is broken")
        return
    task = msg
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text

    # #Only support cnn for now
    # if(task['source']['name'] == 'CNN'):
    #     print("Scraping CNN news")
    #     text = cnn_news_scraper.extract_news(task['url'])
    # else:
    #     print("not supported")
    # task['text'] = text
    dedupe_news_queue_client.sendMessage(task)


# Consumer loop: keep pulling scrape tasks from the queue and pass each one to
# handle_message().
while True:
    # Without a queue client there is nothing to fetch.
    # NOTE(review): this path restarts the loop immediately, with no sleep --
    # confirm a busy loop is acceptable when the client is missing.
    if scrape_news_queue_client is None:
        continue

    msg = scrape_news_queue_client.getMessage()
    if msg is not None:
        try:
            handle_message(msg)
        except Exception as e:
            # Best effort: report the failure and carry on with the next task.
            print(e)

    # Wait through the client before fetching again (also when the queue was empty).
    scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Exemplo n.º 4
0
# Sources handed to the news API client.
# NOTE(review): this is a one-element list whose single string carries every
# source, comma-separated -- confirm getNewsFromSource() expects that form
# rather than one list entry per source.
NEWS_SOURCES = [
    'cnn, bbc-news, bloomberg, espn, cnbc, business-insider, abc-news, '
    'buzzfeed, bbc-sport, fox-news, the-verge, techradar, talksport, '
    'nfl-news, nhl-news, reddit-r-all'
]

# How long a seen-news entry is meant to live in Redis: one day.
NEWS_TIME_OUT_IN_SECONDS = 24 * 60 * 60
# Pause between two polling rounds.
SLEEP_TIME_IN_SECONDS = 60

# Redis holds the digests of news already seen; RabbitMQ carries the scrape tasks.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
rabbitMQ_client = RabbitMQClient(
    SCRAPE_NEWS_TASK_QUEUE_HOST,
    SCRAPE_NEWS_TASK_QUEUE_NAME,
)

# Monitor loop: poll the news API, and for every article not seen before,
# remember its digest in Redis and queue it as a scrape task.
while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    nums_of_new_news = 0
    for news in news_list:
        # The MD5 of the title is the de-duplication key (not a security use).
        news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()

        # A digest already present in Redis means this article was handled.
        if redis_client.get(news_digest) is not None:
            continue

        nums_of_new_news += 1
        news['digest'] = news_digest

        # Fall back to the current UTC time when the API gives no timestamp.
        if news['publishedAt'] is None:
            news['publishedAt'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

        # Store the article under its digest and attach the TTL to that same
        # key in one command. Expiring a different key (such as str(news))
        # would be a no-op and the stored digests would never expire.
        redis_client.set(news_digest, json.dumps(news), ex=NEWS_TIME_OUT_IN_SECONDS)

        rabbitMQ_client.sendMessage(news)

    print("Fetched {} new news" .format(nums_of_new_news))

    # Wait through the queue client before the next polling round.
    rabbitMQ_client.sleep(SLEEP_TIME_IN_SECONDS)