def test_get_target_url_does_not_throw_if_empty_if_collection_does_not_exist(
        self):
    """Smoke-check ``get_target_urls`` on a database with no collections.

    Nothing is asserted: the test passes as long as the call does not raise.
    """
    # given: a MongoDb whose backing database is replaced by a mongomock one
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]

    # when: target URLs are requested although nothing was ever inserted
    db.get_target_urls('de')
def test_MongoDB_get_open_task_returns_none_for_empty_collection(self):
    """``get_open_task`` yields ``None`` when nothing has been inserted."""
    # given: a MongoDb backed by an untouched mongomock database
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]

    # when
    result = db.get_open_task('tr')

    # then
    assert result is None
def test_MongoDB_get_open_task_returns_none_if_no_task_open(self):
    """``get_open_task`` yields ``None`` when the only document has text.

    The "tr" collection is seeded with a single document carrying a
    ``text`` field; the test expects that no open task is reported for it.
    """
    # given: one document that already holds article text
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    scraped_doc = {"text": "This article is scraped"}
    db._database["tr"].insert_one(scraped_doc)

    # when
    result = db.get_open_task("tr")

    # then
    assert result is None
def test_MongoDB_get_open_task_returns_open_task(self):
    """``get_open_task`` returns the stored document that only has a URL.

    The expected value is the inserted dict including the ``_id`` reported
    by the insert, so the comparison covers the full document.
    """
    # given: a single URL-only document in the "de" collection
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    pending = {"url": "www.opentask.com"}
    insert_result = db._database["de"].insert_one(pending)
    pending['_id'] = insert_result.inserted_id

    # when
    fetched = db.get_open_task("de")

    # then
    assert fetched == pending
def url_processor(language, **context):
    """Process one open task for ``language`` and store the extracted data.

    Fetches an open task through ``MongoDb.get_open_task``. When none is
    available, only a log message is written. Otherwise the task's ``"url"``
    is handed to ``extract_data`` and the result is passed to
    ``MongoDb.insert_article`` for the same language.

    Args:
        language: Language key forwarded to the database calls.
        **context: Accepted but not used by this function.
    """
    db = MongoDb()
    task = db.get_open_task(language)

    # Guard clause: nothing to do when no task is open.
    if task is None:
        logger.info('No task left')
        return

    task_url = task["url"]
    logger.info('Extracting data from {}'.format(task_url))
    article = extract_data(task_url)

    logger.info('Upserting data')
    db.insert_article(article, language=language)
def test_insert_into_empty_db(self):
    """``insert_article`` stores the article in the language's collection.

    After inserting into an otherwise empty database, the first document
    found in ``_database["LANGUAGE"]`` must carry the article's title.
    """
    # given: an empty mongomock database and one article to store
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    article = {
        "title": "How Millennials Are Disrupting Test",
        "url": "www.testytest.com",
    }

    # when
    db.insert_article(article, "LANGUAGE")

    # then
    stored = db._database["LANGUAGE"].find_one({})
    assert stored["title"] == article["title"]
def test_get_target_url(self):
    """``get_target_urls`` returns the URLs of all targets for a language.

    Two documents with ``language == 'de'`` are placed in the "TARGET"
    collection; both URLs must be present in a result of length two.
    """
    # given: two German targets in the TARGET collection
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    for address in ('www.news1.de', 'www.news2.de'):
        db._database["TARGET"].insert_one(
            {'language': 'de', 'url': address})

    # when
    urls = db.get_target_urls('de')

    # then
    assert len(urls) == 2
    assert 'www.news2.de' in urls
    assert 'www.news1.de' in urls
def test_insert_does_not_create_duplicate(self):
    """Re-inserting an already stored article must not add a second document.

    The article is seeded into, and afterwards counted in, the same
    ``_database["LANGUAGE"]`` collection that is passed to
    ``insert_article`` as the language, so a duplicate would be detected.
    """
    # given
    mongo_database = MongoDb()
    mongo_database._database = mongomock.MongoClient()["newspaper"]
    # ``_database`` already is the "newspaper" database. Indexing it with
    # ["newspaper"]["LANGUAGE"] would address the separate sub-collection
    # "newspaper.LANGUAGE", whose count stays 1 no matter what
    # ``insert_article`` does, which would make this test vacuous.
    # NOTE(review): assumes ``insert_article`` writes to
    # ``_database[language]``, as the other tests in this file read it.
    articles = mongo_database._database["LANGUAGE"]
    existing_article = {
        "title": "How Millennials Are Disrupting Test",
        "url": "www.testytest.com",
    }
    existing_article["_id"] = articles.insert_one(
        existing_article).inserted_id

    # when
    mongo_database.insert_article(existing_article, "LANGUAGE")

    # then
    article_count = articles.count_documents({})
    assert article_count == 1
def url_scraper(language, **context):
    """Create scraping tasks for every target URL of ``language``.

    For each URL returned by ``MongoDb.get_target_urls`` a newspaper source
    is built, the URLs of its articles are passed through
    ``get_clean_urls``, and one task per cleaned URL (with the target URL
    as ``'origin'``) is handed to ``MongoDb.insert_tasks``.

    Args:
        language: Language key used for the target lookup, the newspaper
            build and the task insertion.
        **context: Accepted but not used by this function.
    """
    db = MongoDb()

    for source_url in db.get_target_urls(language):
        logger.info('Generating TODOs for {}'.format(source_url))
        source = newspaper.build(
            source_url,
            language=language,
            memoize_articles=False,
            fetch_images=False,
            MIN_WORD_COUNT=100,
        )

        logger.info('Creating tasks for {}'.format(source_url))
        article_urls = [entry.url for entry in source.articles]
        new_tasks = [
            {'url': clean_url, 'origin': source_url}
            for clean_url in get_clean_urls(article_urls)
        ]

        logger.info('Inserting tasks for {}'.format(source_url))
        db.insert_tasks(new_tasks, language)
def test_insert_data_overwrites_task_entry(self):
    """``insert_article`` replaces a URL-only entry instead of adding one.

    A document holding only the URL is seeded; after inserting an article
    with the same URL, exactly one document with that URL must exist and it
    must carry the article text.
    """
    # given: a URL-only entry in the "tr" collection
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    db._database["tr"].insert_one({'url': 'www.bike.com'})

    # when: an article for the same URL is inserted
    scraped = {'url': 'www.bike.com', 'text': 'This article is scraped'}
    db.insert_article(scraped, "tr")

    # then
    matches = list(db._database["tr"].find({'url': 'www.bike.com'}))
    assert matches[0]["text"] == "This article is scraped"
    assert len(matches) == 1
def test_insert_tasks_does_not_reinsert_solved_task(self):
    """``insert_tasks`` leaves an already scraped URL untouched.

    A document with URL and text is seeded; inserting a task for the same
    URL must neither add a second document nor drop the stored text.
    """
    # given: a document that already carries article text
    db = MongoDb()
    mock_client = mongomock.MongoClient()
    db._database = mock_client["newspaper"]
    db._database["tr"].insert_one(
        {'url': 'www.bike.com', 'text': "this article is scraped"})

    # when: a task for the same URL is submitted
    db.insert_tasks([{'url': 'www.bike.com'}], "tr")

    # then
    matches = list(db._database["tr"].find({'url': 'www.bike.com'}))
    assert matches[0]["text"] == "this article is scraped"
    assert len(matches) == 1