def setUp(self): """Set up test.""" self.console = console self.refresh_rate = refresh.Hourly self.datadir = DataDirectory(dirname(__file__) + '/datadir') self.index = Index(self.datadir, MagicMock()) self.filesystem = Filesystem(self.index, self.refresh_rate)
def test_next_url_returns_none_if_no_url_was_found(self):
    """Test _next_url() returns None if no url was found."""
    index = Index()
    index.random_uncrawled_url = MagicMock(
        side_effect=EmptySearchResultException())
    self.crawler = Crawler(self.path_to_url_source, index)
    self.assertIsNone(self.crawler._next_url())
def test_crawler_can_read_next_url_from_source(self):
    """Test crawler can read next url from source."""
    self.add_url_source('https://example.com')
    self.crawler = Crawler(self.path_to_url_source, Index())
    self.assertEqual(
        Url.from_string('https://example.com').to_string(),
        self.crawler._next_url().to_string())
def test_crawler_can_read_next_url_from_index(self):
    """Test crawler can read next url from index."""
    index = Index()
    url = Url.from_string('https://example.com/foo')
    index.remove_uncrawled_url = MagicMock()
    index.random_uncrawled_url = MagicMock(return_value=url)
    self.crawler = Crawler(self.path_to_url_source, index)
    self.assertEqual(
        Url.from_string('https://example.com/foo').to_string(),
        self.crawler._next_url().to_string())
    index.remove_uncrawled_url.assert_called_with(url.hash())
def test_crawler_removes_urls_read_from_source(self):
    """Test crawler removes urls read from source."""
    self.add_url_source('https://example.com')
    self.add_url_source('https://example.com/foo')
    self.add_url_source('https://example.com/bar')
    self.crawler = Crawler(self.path_to_url_source, Index())

    # first line should now be https://example.com
    self.assertEqual(
        Url.from_string('https://example.com').to_string(),
        self.crawler._next_url().to_string())

    # first line should now be https://example.com/foo
    self.assertEqual(
        Url.from_string('https://example.com/foo').to_string(),
        self.crawler._next_url().to_string())

    # first line should now be https://example.com/bar
    self.assertEqual(
        Url.from_string('https://example.com/bar').to_string(),
        self.crawler._next_url().to_string())

    self.crawler = Crawler(self.path_to_url_source, Index())
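# A minimal, hypothetical sketch of the contract the tests above exercise:
# _next_url() consumes the first line of the url source file on every call.
# pop_first_url() is NOT part of the Crawler API; it only illustrates the
# read-and-remove behavior, assuming a plain text file with one url per line.
def pop_first_url(path: str):
    """Remove and return the first line of a url file, or None if empty."""
    with open(path) as f:
        lines = f.read().splitlines()
    if not lines:
        return None
    with open(path, 'w') as f:
        f.write('\n'.join(lines[1:]))
    return lines[0]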
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Prints system and saas statistics every five minutes.

    Args:
        elasticsearch_host: elasticsearch host
    """
    start = time.time()
    last_print = 1
    while Controller.SHOULD_RUN:
        time.sleep(1)
        mins = int(int(time.time() - start) / 60)
        if mins % 5 != 0 or mins <= last_print:
            continue
        index = Index(host=elasticsearch_host)
        last_print = mins
        t = '[throughput] 5m: {}, 15m: {}, 30m: {}, 1h: {}'.format(
            stats.throughput(index, 5),
            stats.throughput(index, 15),
            stats.throughput(index, 30),
            stats.throughput(index, 60),
        )
        ta = '{} 5m: {}, 15m: {}, 30m: {}, 1h: {}'.format(
            '[throughput 1min avg]',
            round(stats.throughput(index, 5) / 5, 2) if mins > 4 else 'n/a',
            round(stats.throughput(index, 15) / 15, 2) if mins > 14 else 'n/a',
            round(stats.throughput(index, 30) / 30, 2) if mins > 29 else 'n/a',
            round(stats.throughput(index, 60) / 60, 2) if mins > 59 else 'n/a',
        )
        load = '[load avg] 1m: {}, 5m: {}, 15m: {}'.format(
            stats.load_avg(1),
            stats.load_avg(5),
            stats.load_avg(15),
        )
        cpu = f'[current cpu usage] {stats.cpu_usage(10)}%'
        mem = f'[memory usage] {stats.memory_usage(10)}%'
        for msg in [t, ta, load, cpu, mem]:
            console.p(msg)
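# Self-contained sketch of the gating logic in _stats_thread above: stats are
# printed at most once per 5-minute boundary even though the loop wakes every
# second. _should_print() is illustrative only, not part of the module.
def _should_print(mins: int, last_print: int) -> bool:
    """True when a new 5-minute boundary has been passed."""
    return mins % 5 == 0 and mins > last_print


assert _should_print(5, 1) is True      # first boundary after start
assert _should_print(5, 5) is False     # already printed this boundary
assert _should_print(7, 5) is False     # not on a boundary
assert _should_print(10, 5) is True     # next boundary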
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Crawler thread.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls found on
            pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one they were found at
        elasticsearch_host: elasticsearch host
        debug: display debugging information
        thread_id: id of thread
    """
    try:
        crawler = Crawler(
            url_file=url_file,
            index=Index(host=elasticsearch_host),
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            crawler.tick()
    except UrlFileNotFoundError:
        console.p(f'ERROR: url_file was not found at \'{url_file}\'')
        time.sleep(2)
        Controller.threads[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as e:
        console.p(f'error occurred in crawler thread {thread_id}: {e}')
        if debug:
            raise
    finally:
        Controller.threads[thread_id]['running'] = False
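# Hypothetical launch sketch for the thread above. Controller.start_crawlers
# (used in main() below) is the real entry point; the bookkeeping dict shape
# is inferred from the Controller.threads[thread_id]['running'] writes above,
# and exposing _crawler_thread on Controller is an assumption.
import threading

thread_id = 'crawler-0'
Controller.threads[thread_id] = {'running': True}
threading.Thread(
    target=Controller._crawler_thread,
    args=('/tmp/urls.txt', False, False, 'localhost', False, thread_id),
    daemon=True,
).start()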
def start_filesystem(
    mountpoint: str,
    datadir: DataDirectory,
    refresh_rate: Type[refresh.RefreshRate],
    elasticsearch_host: str
):
    """Start filesystem process.

    The FUSE python library blocks the process that mounts the
    filesystem, so the main process forks and the filesystem is
    mounted from the child process instead.

    Args:
        mountpoint: where to mount filesystem
        datadir: Data directory to store pictures in
        refresh_rate: Which refresh rate filesystem should use for
            fetching photos
        elasticsearch_host: elasticsearch host

    Returns:
        True in the main process, False in the forked child process
        bool
    """
    console.p(f'mounting filesystem at: {real_path(mountpoint)}')
    pid = os.fork()
    if pid != 0:
        Controller.FUSE_PID = pid
        return True
    try:
        Filesystem.mount(
            mountpoint,
            Index(datadir, host=elasticsearch_host),
            refresh_rate
        )
    except RuntimeError as e:
        console.p(f'failed to mount FUSE filesystem: {e}')
    return False
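# Runnable sketch of the fork pattern start_filesystem() relies on: the
# parent returns True immediately, while the child runs the blocking task
# (Filesystem.mount above) and must never return into the caller's code
# path. fork_and_run() and its argument are illustrative names only.
import os
import sys


def fork_and_run(blocking_task) -> bool:
    """Fork; parent returns True, child runs blocking_task and exits."""
    pid = os.fork()
    if pid != 0:
        return True             # parent: the child pid could be stored here
    try:
        blocking_task()         # child: blocks until done or interrupted
    finally:
        sys.exit(0)             # child must not fall back into the caller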
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Photographer thread.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: display debugging information
        thread_id: id of thread
    """
    try:
        photographer = p.Photographer(
            Index(host=elasticsearch_host),
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            photographer.tick()
    except Exception as e:
        console.p(f'error occurred in photographer thread {thread_id}: {e}')
        if debug:
            raise
    finally:
        Controller.threads[thread_id]['running'] = False
def setUp(self): """Set up test.""" self.index = Index() self.datadir = DataDirectory(dirname(__file__) + '/datadir') self.photographer = Photographer(self.index, refresh.Hourly, self.datadir)
def main():
    """Entry point for saas."""
    try:
        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])
        console.DEBUG = args.debug
        JavascriptSnippets.load()

        index = Index(host=args.elasticsearch_host)
        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            sys.exit()

        if not index.verify():
            if not args.setup_elasticsearch and not args.clear_elasticsearch:
                console.p('ERROR: elasticsearch is not configured')
                console.p(' {} {}'.format(
                    'start saas with --setup-elasticsearch',
                    'to configure elasticsearch'))
                sys.exit()

        datadir = DataDirectory(args.data_dir, args.optimize_storage)
        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]

        if args.setup_elasticsearch:
            index.create_indices()

        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()

        if args.clear_data_dir:
            datadir.clear()

        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            sys.exit()

        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)

        Controller.start_crawlers(
            amount=args.crawler_threads,
            url_file=args.url_file,
            ignore_found_urls=args.ignore_found_urls,
            stay_at_domain=args.stay_at_domain,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        while True:
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue
            try:
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(
                    index.PHOTOS)
                timestamp = max(crawled, photos)
                seconds = int(time.time()) - timestamp
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired
            except EmptySearchResultException:
                pass
            finally:
                time.sleep(2)
    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
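# Worked example of the stop-if-idle arithmetic in the loop above. With
# --stop-if-idle 15 and the newest crawled/photo document 1000 seconds old,
# the idle time is 16 whole minutes, which exceeds the threshold and would
# raise StopIfIdleTimeoutExpired. minutes_idle() is only an illustration of
# the computation, not part of the module.
import time


def minutes_idle(most_recent_timestamp: int) -> int:
    """Whole minutes elapsed since the most recent indexed document."""
    return int((int(time.time()) - most_recent_timestamp) / 60)


assert minutes_idle(int(time.time()) - 1000) == 16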
def setUp(self): """Set up test.""" self.datadir = DataDirectory(dirname(__file__) + '/datadir') self.index = Index(self.datadir, MagicMock())