def test_crawlSince(self):
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
    parser = DummyParser()
    crawler.crawlSince(db, parser, 40)
    # DummyParser titles its articles '뉴스 #<n>' ("News #<n>"); the newest is #50.
    self.assertEqual(db.getLatestNews()['title'], '뉴스 #50')
    # Crawling since 40 picks up articles 40 through 50 inclusive: 11 items.
    self.assertEqual(db.getNewsCount(), 11)
def test_addFailedCrawl(self):
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
    db.addFailedCrawl({
        'provider': 'dummy',
        'title': 'TestTitle',
        'url': 'TestURL',
    })
    self.assertEqual(db.getFailedCrawlCount(), 1)
def test_add_news(self):
    parser = DummyParser()
    n1 = parser.parseNews('dummy://news1.html', 1)
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
    db.addNews(n1)
    should_be_n1 = db.getLatestNews()
    self.assertEqual(n1, should_be_n1)
# The mock_parseNews argument implies a patch decorator that is missing
# from this excerpt; the target below is an assumption about how the
# test was wired.
@mock.patch.object(DummyParser, 'parseNews')
def test_crawlSince_addFailedCrawl(self, mock_parseNews):
    mock_parseNews.side_effect = ValueError()
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
    parser = DummyParser()
    # Sanity check: the patched parseNews now raises.
    self.assertRaises(ValueError, parser.parseNews, 'dummy://news1.html', 1)
    crawler.crawlSince(db, parser, 0)
    # At most 30 failed crawls are recorded per provider.
    self.assertEqual(db.getFailedCrawlCount(), 30)
def test_get_latest_news_reverse(self):
    parser = DummyParser()
    n1 = parser.parseNews('dummy://news1.html', 1)
    n2 = parser.parseNews('dummy://news2.html', 2)
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'test')
    # Insert out of order; getLatestNews() must still return the newest item.
    db.addNews(n2)
    db.addNews(n1)
    self.assertEqual(db.getLatestNews()['publishedAt'], 2)
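# The tests above rely on a DummyParser with roughly these semantics (a
# minimal sketch reconstructed from the assertions, not the original
# implementation; the class name here is hypothetical to avoid shadowing
# the real one): parseNews() returns a news dict whose title is
# '뉴스 #<n>' ("News #<n>") and whose publishedAt is <n>.
class DummyParserSketch:
    def parseNews(self, url, publishedAt):
        return {
            'provider': 'dummy',
            'title': '뉴스 #%d' % publishedAt,
            'url': url,
            'publishedAt': publishedAt,
        }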
def main():
    if len(sys.argv) != 3:
        print('Usage: crawlermain [source] [crawlStartTime]')
        sys.exit(1)
    parserClassDict = {
        'chosun': chosun.ParserChosun,
        'donga': donga.ParserDonga,
        'hani': hani.ParserHani,
        'khan': khan.ParserKhan,
    }
    parserType = sys.argv[1]
    crawlStartTime = int(sys.argv[2])
    parser = parserClassDict[parserType]()
    # db = dbconn.NewsDatabase('mongodb://*****:*****@ds021346.mlab.com:21346/somanews', 'somanews')
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'somanews')
    crawler.crawlSince(db, parser, crawlStartTime)
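# Hypothetical invocation of the one-shot entry point above, assuming it
# lives in crawlermain.py (the crawlStartTime value is illustrative; its
# format follows whatever publishedAt uses):
#   $ python crawlermain.py chosun 40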
def main():
    parserClassDict = {
        'chosun': chosun.ParserChosun,
        'donga': donga.ParserDonga,
        'hani': hani.ParserHani,
        'khan': khan.ParserKhan,
    }
    db = dbconn.NewsDatabase('mongodb://localhost:27017/', 'somanews')
    for provider, parserClass in parserClassDict.items():
        print('Crawling from provider %s...' % provider)
        latestNews = db.getLatestNews(provider)
        if not latestNews:
            # The cron-driven crawler only handles providers that already
            # have news in the database; skip ones with no crawl history.
            continue
        parser = parserClass()
        crawlStartTime = latestNews['publishedAt']
        crawler.crawlSince(db, parser, crawlStartTime)
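# A minimal sketch of the NewsDatabase behaviour both entry points and the
# tests depend on, assuming pymongo and a 'news' collection (this
# illustrates the expected semantics of dbconn; the class name and schema
# details are assumptions, not the actual module):
import pymongo

class NewsDatabaseSketch:
    def __init__(self, uri, dbName):
        self.db = pymongo.MongoClient(uri)[dbName]

    def addNews(self, news):
        self.db.news.insert_one(news)

    def getLatestNews(self, provider=None):
        # Newest first by publishedAt, regardless of insertion order
        # (see test_get_latest_news_reverse); optionally filtered by
        # provider, as the cron-style main() expects.
        query = {'provider': provider} if provider else {}
        return self.db.news.find_one(
            query, sort=[('publishedAt', pymongo.DESCENDING)])

    def getNewsCount(self):
        return self.db.news.count_documents({})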