Пример #1
0
 def setUp(self):
     """Build the fixture objects shared by the tests.

     Stores the console, the hourly refresh rate, a data directory
     located in ``datadir`` next to this test file, an index built on
     that data directory (with a MagicMock as its second argument) and
     a filesystem built on the index and refresh rate.
     """
     hourly = refresh.Hourly
     self.console = console
     self.refresh_rate = hourly

     storage = DataDirectory(f'{dirname(__file__)}/datadir')
     self.datadir = storage

     mocked_index = Index(storage, MagicMock())
     self.index = mocked_index

     self.filesystem = Filesystem(mocked_index, hourly)
Пример #2
0
    def test_next_url_returns_none_if_no_url_was_found(self):
        """Test _next_url() returns None if no url was found.

        The index lookup is replaced by a mock that raises
        EmptySearchResultException when called.
        """
        empty_index = Index()
        empty_index.random_uncrawled_url = MagicMock(
            side_effect=EmptySearchResultException())

        self.crawler = Crawler(self.path_to_url_source, empty_index)
        next_url = self.crawler._next_url()

        self.assertEqual(None, next_url)
Пример #3
0
    def test_crawler_can_read_next_url_from_source(self):
        """Test crawler can read next url from source.

        A single url is added to the url source and _next_url() is
        expected to hand that same url back.
        """
        address = 'https://example.com'
        self.add_url_source(address)

        self.crawler = Crawler(self.path_to_url_source, Index())

        expected = Url.from_string(address).to_string()
        actual = self.crawler._next_url().to_string()
        self.assertEqual(expected, actual)
Пример #4
0
    def test_crawler_can_read_next_url_from_index(self):
        """Test crawler can read next url from index.

        The index is stubbed so random_uncrawled_url() returns a known
        url; _next_url() must return it and must call
        remove_uncrawled_url() with the hash of that url.
        """
        stubbed_index = Index()
        indexed_url = Url.from_string('https://example.com/foo')
        stubbed_index.remove_uncrawled_url = MagicMock()
        stubbed_index.random_uncrawled_url = MagicMock(
            return_value=indexed_url)

        self.crawler = Crawler(self.path_to_url_source, stubbed_index)

        expected = Url.from_string('https://example.com/foo').to_string()
        actual = self.crawler._next_url().to_string()
        self.assertEqual(expected, actual)

        removed = stubbed_index.remove_uncrawled_url
        removed.assert_called_with(indexed_url.hash())
Пример #5
0
    def test_crawler_removes_urls_read_from_source(self):
        """Test crawler removes urls read from source.

        Three urls are added to the url source; consecutive calls to
        _next_url() are expected to return them in the order they were
        added.
        """
        addresses = (
            'https://example.com',
            'https://example.com/foo',
            'https://example.com/bar',
        )
        for address in addresses:
            self.add_url_source(address)

        self.crawler = Crawler(self.path_to_url_source, Index())

        # on each pass the first line of the source should be `address`
        for address in addresses:
            expected = Url.from_string(address).to_string()
            self.assertEqual(expected, self.crawler._next_url().to_string())

        # the test ends by building a fresh crawler on the same source
        self.crawler = Crawler(self.path_to_url_source, Index())
Пример #6
0
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Prints system and saas statistics every 5th minute for as long as
    Controller.SHOULD_RUN is truthy. Each report consists of the
    throughput totals, the per-minute throughput averages, the load
    averages, the cpu usage and the memory usage.

    Args:
        elasticsearch_host: elasticsearch host
    """
    # throughput windows in minutes, in the order they are printed
    windows = (5, 15, 30, 60)

    start = time.time()
    last_print = 1
    while Controller.SHOULD_RUN:

        time.sleep(1)
        mins = int(int(time.time() - start) / 60)
        # report only once per 5 minute mark
        if mins % 5 != 0 or mins <= last_print:
            continue

        index = Index(host=elasticsearch_host)
        last_print = mins

        # query every window exactly once, so the totals and the
        # averages below are derived from the same numbers
        totals = [stats.throughput(index, window) for window in windows]

        # an average is only shown once the thread has been running
        # for at least the length of the window
        averages = [
            round(total / window, 2) if mins >= window else 'n/a'
            for window, total in zip(windows, totals)
        ]

        t = '[throughput]           5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            *totals
        )
        ta = '{}  5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            '[throughput 1min avg]',
            *averages
        )
        load = '[load avg]             1m: {}, 5m: {}, 15min: {}'.format(
            stats.load_avg(1),
            stats.load_avg(5),
            stats.load_avg(15),
        )
        cpu = f'[current cpu usage]    {stats.cpu_usage(10)}%'
        mem = f'[memory usage]         {stats.memory_usage(10)}%'

        for msg in [t, ta, load, cpu, mem]:
            console.p(msg)
Пример #7
0
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Run a crawler until the controller asks it to stop.

    Builds a Crawler and calls its tick() repeatedly while
    Controller.SHOULD_RUN is truthy. Whatever happens, the thread is
    flagged as not running in Controller.threads when this function
    exits.

    A missing url file is reported, after which all threads are
    stopped through Controller.stop_all(). Any other exception is
    reported and only re-raised when debug is enabled.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls found on
            pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one it was found at
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        index = Index(host=elasticsearch_host)
        worker = Crawler(
            url_file=url_file,
            index=index,
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            worker.tick()
    except UrlFileNotFoundError:
        message = f'ERROR: url_file was not found at \'{url_file}\''
        console.p(message)
        time.sleep(2)
        # flag this thread as stopped before asking the controller to
        # stop everything
        Controller.threads[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as err:
        message = f'error occured in crawler thread {thread_id}: {err}'
        console.p(message)
        if debug:
            raise err
    finally:
        Controller.threads[thread_id]['running'] = False
Пример #8
0
    def start_filesystem(
        mountpoint: str,
        datadir: DataDirectory,
        refresh_rate: Type[refresh.RefreshRate],
        elasticsearch_host: str
    ):
        """Fork and mount the filesystem from the forked process.

        The FUSE python library will kill the process it is mounted
        from, so the main process is forked and the filesystem is
        mounted from the fork instead. The pid of the fork is stored
        in Controller.FUSE_PID by the main process.

        Args:
            mountpoint: where to mount filesystem
            datadir: Data directory to store pictures in
            refresh_rate: Which refresh rate filesystem should use
                for fetching photos
            elasticsearch_host: elasticsearch host

        Returns:
            True in the main process, False in the forked process
            (once mounting has returned or failed).
        """
        console.p(f'mounting filesystem at: {real_path(mountpoint)}')

        fork_pid = os.fork()

        if fork_pid == 0:
            # forked process: mount, then tell the caller it is the fork
            try:
                index = Index(datadir, host=elasticsearch_host)
                Filesystem.mount(mountpoint, index, refresh_rate)
            except RuntimeError as err:
                console.p(f'failed to mount FUSE filesystem: {err}')
            return False

        # main process: remember the fork so it can be handled later
        Controller.FUSE_PID = fork_pid
        return True
Пример #9
0
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Run a photographer until the controller asks it to stop.

    Builds a Photographer and calls its tick() repeatedly while
    Controller.SHOULD_RUN is truthy. Whatever happens, the thread is
    flagged as not running in Controller.threads when this function
    exits. Exceptions are reported and only re-raised when debug is
    enabled.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        index = Index(host=elasticsearch_host)
        worker = p.Photographer(
            index,
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            worker.tick()
    except Exception as err:
        message = f'error occured in photographer thread {thread_id}: {err}'
        console.p(message)
        if debug:
            raise err
    finally:
        Controller.threads[thread_id]['running'] = False
Пример #10
0
 def setUp(self):
     """Build the fixture objects shared by the tests.

     Stores a default index, a data directory located in ``datadir``
     next to this test file, and a photographer built from the index,
     the hourly refresh rate and the data directory.
     """
     index = Index()
     self.index = index

     storage = DataDirectory(f'{dirname(__file__)}/datadir')
     self.datadir = storage

     self.photographer = Photographer(index, refresh.Hourly, storage)
Пример #11
0
def main():
    """Entry point for saas.

    Parses the command line, checks the elasticsearch connection and
    configuration, optionally sets up or clears elasticsearch and the
    data directory, mounts the filesystem and starts the stats,
    crawler and photographer threads.

    Afterwards the main thread stays in a loop. When
    args.stop_if_idle is non-zero, it stops everything once the most
    recent crawled or photo document is at least that many minutes
    old. A KeyboardInterrupt stops everything as well.

    Exits with status 1 when elasticsearch cannot be reached or is not
    configured.
    """
    try:

        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])

        console.DEBUG = args.debug

        JavascriptSnippets.load()

        index = Index(host=args.elasticsearch_host)

        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            # non-zero status: a bare sys.exit() would report success
            sys.exit(1)

        # an unverified index is fine when this run is going to
        # (re)create the indices anyway
        will_create_indices = (
            args.setup_elasticsearch or args.clear_elasticsearch)
        if not index.verify() and not will_create_indices:
            console.p('ERROR: elasticsearch is not configured')
            console.p('       {} {}'.format(
                'start saas with --setup-elasticsearch',
                'to configure elasticsearch'))
            sys.exit(1)

        datadir = DataDirectory(args.data_dir, args.optimize_storage)

        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]

        if args.setup_elasticsearch:
            index.create_indices()

        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()

        if args.clear_data_dir:
            datadir.clear()

        # start_filesystem() returns False in the forked process, which
        # must not go on to start the threads below
        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            sys.exit()

        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)

        Controller.start_crawlers(amount=args.crawler_threads,
                                  url_file=args.url_file,
                                  ignore_found_urls=args.ignore_found_urls,
                                  stay_at_domain=args.stay_at_domain,
                                  elasticsearch_host=args.elasticsearch_host,
                                  debug=args.debug)

        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        while True:

            # 0 disables the idle check; just keep the main thread alive
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue

            try:
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(index.PHOTOS)

                # the most recent activity of either kind counts
                timestamp = max(photos, crawled)

                seconds = int(time.time()) - timestamp
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired

            except EmptySearchResultException:
                # no documents yet, so there is nothing to measure
                # idleness against
                pass
            finally:
                time.sleep(2)

    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
Пример #12
0
 def setUp(self):
     """Build the fixture objects shared by the tests.

     Stores a data directory located in ``datadir`` next to this test
     file and an index built on it (with a MagicMock as its second
     argument).
     """
     storage = DataDirectory(f'{dirname(__file__)}/datadir')
     self.datadir = storage
     self.index = Index(storage, MagicMock())