def setUp(self) -> None:
    self.clear_env()
    # Reset all module-level mocks so state does not leak between tests.
    mock_google_photos.reset_mock()
    mock_google_photos.upload_media.reset_mock(side_effect=True)
    mock_twitter.reset_mock(side_effect=True)
    mock_twitter.make_original_image_url.reset_mock(side_effect=True)
    mock_store.reset_mock()
    mock_store.fetch_not_added_tweet_ids.reset_mock(return_value=True)
    mock_store.fetch_all_failed_upload_medias.reset_mock(return_value=True)
    mock_store.insert_tweet_info.reset_mock(side_effect=True)
    mock_store.insert_failed_upload_media.reset_mock(side_effect=True)
    mock_request.reset_mock(side_effect=True)
    mock_request.urlretrieve.reset_mock(side_effect=True)
    mock_makedirs.reset_mock()
    mock_rmtree.reset_mock()
    mock_sleep.reset_mock(side_effect=True)
    mock_crawler_func.reset_mock(side_effect=True, return_value=True)
    mock_crawler_func2.reset_mock(side_effect=True, return_value=True)
    # The patched constructors return the mocks themselves.
    mock_google_photos.return_value = mock_google_photos
    mock_twitter.return_value = mock_twitter
    mock_store.return_value = mock_store
    os.environ['SAVE_MODE'] = 'google'
    self.crawler = Crawler()
def test_crawler_urlsLevelHost_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.urlsLevelHost(1)
    uno = len(c.urls)
    c.urlsLevelHost(2)
    dos = len(c.urls)
    self.assertTrue(dos > 1)
async def run(loop):
    manager = PersistManager(use_index=True)  # enable 'use_index' to use Elasticsearch (Part 3)
    crawler = Crawler(loop=loop, manager=manager)
    await crawler.get_history()   # Retrieve 5-minute history (Part 1)
    await crawler.run_updates()   # Constant updates (Part 2)
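# A minimal sketch of how the 'run' coroutine above might be driven from a
# script entry point. The event-loop wiring here is an illustrative
# assumption, not part of the original code.
if __name__ == '__main__':
    import asyncio

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(run(loop))
    finally:
        loop.close()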
def test_crawler_downloadOneUrlNewspaperThread_method_returns_correct_result(self):
    c = Crawler(
        "https://politica.elpais.com/politica/2017/08/29/actualidad/1504006030_167758.html"
    )
    c.downloadOneUrlThread("alienigenaviolanenes.html")
    self.assertTrue(os.path.exists("alienigenaviolanenes.html"))
    self.assertEqual(len(c.files), 1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("--use-web-api", action='store_true')
    args = parser.parse_args()

    if not args.use_web_api:
        # Crawl locally with the asyncio-based crawler.
        from app.crawler import Crawler
        import asyncio
        import logging

        loop = asyncio.get_event_loop()
        c = Crawler(logging_level=logging.INFO)
        found_domains = loop.run_until_complete(c.crawl(args.url))
        print(c._domains)
    else:
        # Delegate the crawl to the web API instead of running it in-process.
        import requests

        response = requests.post(
            "http://localhost/count_domains", json={"urls": [args.url]}
        )
        found_domains = list(response.json().values())[0]
        print(f"found {found_domains} domains")
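# A minimal sketch of wiring 'main' up as the script entry point; the module
# name in the example commands is an assumption for illustration only.
if __name__ == '__main__':
    main()

# Example invocations:
#   python main.py https://example.com
#   python main.py https://example.com --use-web-api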
def test_crawler_downloadInit_method_returns_correct_result(self):
    c = Crawler("http://www.gnu.org")
    self.assertEqual(c.url, "http://www.gnu.org")
    self.assertEqual(
        c.title,
        "The GNU Operating System and the Free Software Movement")
def test_crawler_urlsLevel1Host_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.urlsLevel1Host()
    self.assertTrue(len(c.urls) > 1)
def test_crawler_downloadOneUrlThread_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.downloadOneUrlThread("elpais.html")
    self.assertTrue(os.path.exists("elpais.html"))
def test_crawler_downloadOneUrl_method_returns_correct_result(self):
    c = Crawler("http://www.urjc.es")
    c.downloadOneUrl("urjc.html")
    self.assertTrue(os.path.exists("urjc.html"))
from flask import Flask
from flask_apscheduler import APScheduler

from config import Config
from app.models import Database
from app.crawler import Crawler

scheduler = APScheduler()
config = Config()
db = Database()
crawler = Crawler()


def create_app():
    app = Flask(__name__)
    app.config.from_object(config)

    scheduler.init_app(app)
    scheduler.start()

    from app.api import api as api_blueprint
    app.register_blueprint(api_blueprint)

    return app
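# A minimal sketch of running the application factory above with Flask's
# built-in development server; the host and port values are illustrative
# assumptions, not part of the original code.
if __name__ == '__main__':
    app = create_app()
    app.run(host='127.0.0.1', port=5000)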
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
from app.crawler import Crawler
from app.db import DB

db = DB()

if __name__ == '__main__':
    crawler = Crawler(db)
    crawler.run()