Пример #1
0
def test_basic():
    """Round-trip one message through the test queue.

    Sends a small dict with ``sendMessage``, reads one message back with
    ``getMessage`` on the same client, and asserts the two are equal.
    """
    payload = {"test": "test"}

    queue = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    queue.sendMessage(payload)
    echoed = queue.getMessage()

    assert payload == echoed
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("test_basic passed.")
Пример #2
0
def test_basic():
    """Round-trip one message through the test queue, pausing before the read.

    Sends a small dict with ``sendMessage``, calls the client's ``sleep(10)``,
    then reads one message back with ``getMessage`` and asserts it equals
    what was sent.
    """
    payload = {'test': 1234}

    queue = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    queue.sendMessage(payload)
    # NOTE(review): presumably gives the broker time to deliver before the
    # fetch -- confirm against CloudAMQPClient.sleep.
    queue.sleep(10)
    echoed = queue.getMessage()

    assert payload == echoed
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('test_basic passed!')
Пример #3
0
# Queue client the polling loop in this script reads scrape tasks from.
scrape_news_queue_client = CloudAMQPClient(
    SCRAPE_NEWS_TASK_QUEUE_URL,
    SCRAPE_NEWS_TASK_QUEUE_NAME)


def handleMessage(msg):
    """Fetch the article a scrape task points at and forward it onward.

    The task's 'url' is handed to ``Article``, which is downloaded and
    parsed; the extracted text is printed, stored on the task under
    'text', and the task is sent through ``dedupe_news_queue_client``.

    Args:
        msg: Scrape task; expected to be a dict carrying a 'url' entry.

    Returns:
        None. A malformed task (not a dict, or a dict without 'url') is
        reported as broken and dropped before any download is attempted.
    """
    # isinstance() already rejects None. The 'url' check turns what used to
    # be a bare KeyError further down into the same "broken message" report.
    if not isinstance(msg, dict) or 'url' not in msg:
        print("Message is broken.")
        return

    # give url to newspaper to get article
    article = Article(msg['url'])
    article.download()
    article.parse()

    text = article.text
    print(text)
    msg['text'] = text
    dedupe_news_queue_client.sendMessage(msg)


# Poll the scrape-task queue forever, handing each task to handleMessage().
while True:
    # fetch message from queue
    if scrape_news_queue_client is not None:
        msg = scrape_news_queue_client.getMessage()
        if msg is not None:
            # handle message to get the articles
            try:
                handleMessage(msg)
            except Exception as e:
                # Top-level worker boundary: report the failure and keep
                # polling so one bad task does not stop the loop.
                print(e)
        # Sleep on every iteration, not only after a message was handled;
        # otherwise an empty queue is polled in a tight loop with no back-off.
        scrape_news_queue_client.sleep(SLEEP_TIME_IN_SECONDS)
Пример #4
0
        documents = [str(news['text']) for news in recent_news_list]
        documents.insert(0, msg_text)

        # calculate TF-IDF matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim.A

        rows, _ = pairwise_sim.shape

        # compare with recent news
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    msg['publishedAt'] = parser.parse(msg['publishedAt'])
    # put into mongodb
    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)

# Poll the queue forever: take one message per pass, hand it to
# handleMessage(), then sleep before the next pass.
while True:
    if cloudAMQP_client is None:
        # No client available: nothing to fetch and nothing to sleep on.
        continue

    msg = cloudAMQP_client.getMessage()
    if msg is not None:
        try:
            handleMessage(msg)
        except Exception as e:
            # Report the failure and carry on with the next message.
            print(e)

    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)