Example #1
    def test_crawlSince(self):
        db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
        parser = DummyParser()

        crawler.crawlSince(db, parser, 40)
        self.assertEqual(db.getLatestNews()['title'], '뉴스 #50')  # '뉴스 #50' is Korean for 'News #50'
        self.assertEqual(db.getNewsCount(), 11)  # news 40 through 50 inclusive: 11 items
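For context, a minimal sketch of what DummyParser might look like, reconstructed from the assertions in these examples; the real class is defined elsewhere in the test suite, and everything beyond the parseNews signature is an assumption.

class DummyParser:
    # 'dummy' matches the provider recorded by addFailedCrawl in Example #2.
    provider = 'dummy'

    def parseNews(self, url, publishedAt):
        # Returns a plain dict, which is what lets the tests compare
        # documents directly; '뉴스 #N' is Korean for 'News #N'.
        return {
            'provider': self.provider,
            'title': '뉴스 #%d' % publishedAt,
            'url': url,
            'publishedAt': publishedAt,
        }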
Example #2
    def test_addFailedCrawl(self):
        db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
        db.addFailedCrawl({
            'provider': 'dummy',
            'title': 'TestTitle',
            'url': 'TestURL',
        })
        self.assertEqual(db.getFailedCrawlCount(), 1)
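A plausible pymongo-backed sketch of the two NewsDatabase methods exercised here; only the constructor arguments and method names come from the examples, while the failedCrawls collection name and the internals are assumptions.

import pymongo

class NewsDatabase:
    def __init__(self, uri, dbName):
        self._db = pymongo.MongoClient(uri)[dbName]

    def addFailedCrawl(self, failedCrawl):
        # Collection name is assumed; the document shape matches the test above.
        self._db.failedCrawls.insert_one(failedCrawl)

    def getFailedCrawlCount(self):
        return self._db.failedCrawls.count_documents({})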
Example #3
    def test_add_news(self):
        parser = DummyParser()
        n1 = parser.parseNews('dummy://news1.html', 1)

        db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
        db.addNews(n1)
        should_be_n1 = db.getLatestNews()
        self.assertEqual(n1, should_be_n1)
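One subtlety worth noting, assuming the same pymongo backend: insert_one mutates the dict it is given by adding the generated '_id', which is why n1 can compare equal to the stored document on the round trip. A self-contained demonstration (requires a local mongod):

import pymongo

coll = pymongo.MongoClient('mongodb://localhost:27017/')['test']['news']
doc = {'title': 'TestTitle', 'publishedAt': 1}
coll.insert_one(doc)  # adds an ObjectId under '_id' to doc itself
assert doc == coll.find_one({'_id': doc['_id']})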
Example #4
    # A @mock.patch on DummyParser.parseNews (stripped when this snippet was
    # extracted) presumably supplies mock_parseNews.
    def test_crawlSince_addFailedCrawl(self, mock_parseNews):
        mock_parseNews.side_effect = ValueError()

        db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
        parser = DummyParser()
        self.assertRaises(ValueError, parser.parseNews, 'dummy://news1.html', 1)

        crawler.crawlSince(db, parser, 0)
        self.assertEqual(db.getFailedCrawlCount(), 30)  # at most 30 failed crawls are allowed per provider
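A hedged reconstruction of the failure handling inside crawler.crawlSince, inferred only from the 30-count assertion above; listNewsUrlsSince is a hypothetical enumeration method, and the real loop structure may differ.

MAX_FAILED_CRAWLS = 30  # inferred from the assertion above

def crawlSince(db, parser, crawlStartTime):
    failed = 0
    for url, publishedAt in parser.listNewsUrlsSince(crawlStartTime):  # hypothetical
        try:
            db.addNews(parser.parseNews(url, publishedAt))
        except ValueError:
            # Record the failure; the real code may also store the parsed title.
            db.addFailedCrawl({'provider': parser.provider, 'title': '', 'url': url})
            failed += 1
            if failed >= MAX_FAILED_CRAWLS:
                break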
Example #5
    def test_get_latest_news_reverse(self):
        parser = DummyParser()
        n1 = parser.parseNews('dummy://news1.html', 1)
        n2 = parser.parseNews('dummy://news2.html', 2)

        db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
        db.addNews(n2)
        db.addNews(n1)
        self.assertEqual(db.getLatestNews()['publishedAt'], 2)
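Consistent with this example, getLatestNews evidently orders by 'publishedAt' rather than by insertion order. Continuing the NewsDatabase sketch from Example #2, with the optional provider filter matching its use in Example #7:

    def getLatestNews(self, provider=None):
        # Newest first by publication time; insertion order is irrelevant.
        query = {'provider': provider} if provider is not None else {}
        return self._db.news.find_one(query, sort=[('publishedAt', pymongo.DESCENDING)])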
Example #6
def main():
    if len(sys.argv) != 3:
        print('Usage: crawlermain [source] [crawlStartTime]')
        sys.exit(1)

    parserClassDict = {
        'chosun': chosun.ParserChosun,
        'donga': donga.ParserDonga,
        'hani': hani.ParserHani,
        'khan': khan.ParserKhan
    }

    parserType = sys.argv[1]
    crawlStartTime = int(sys.argv[2])

    parser = parserClassDict[parserType]()
    # db = dbconn.NewsDatabase('mongodb://*****:*****@ds021346.mlab.com:21346/somanews', 'somanews')
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'somanews')
    crawler.crawlSince(db, parser, crawlStartTime)
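An invocation might look like the following, assuming the module is saved as crawlermain.py; the provider and start-time values are illustrative.

$ python crawlermain.py khan 1477000000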
Example #7
def main():
    parserClassDict = {
        'chosun': chosun.ParserChosun,
        'donga': donga.ParserDonga,
        'hani': hani.ParserHani,
        'khan': khan.ParserKhan
    }

    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'somanews')

    for provider, parserClass in parserClassDict.items():
        print('Crawling from provider %s...' % provider)
        latestNews = db.getLatestNews(provider)

        if not latestNews:
            # The cron-style crawler only handles providers that already have stored news
            continue

        parser = parserClass()
        crawlStartTime = latestNews['publishedAt']
        crawler.crawlSince(db, parser, crawlStartTime)
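Neither main shown here includes a module-level entry guard; if these live in standalone scripts, they would presumably end with the standard idiom below, making Example #7 suitable for a cron job since it resumes each provider from its latest stored 'publishedAt'.

if __name__ == '__main__':
    main()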