Пример #1
0
 def setUp(self):
     """Create the fixtures shared by the tests.

     Builds a data directory located next to this test file, an Index
     that receives a MagicMock as its second argument (presumably standing
     in for the elasticsearch client -- confirm against Index), and the
     Filesystem under test using the hourly refresh rate.
     """
     datadir_path = f'{dirname(__file__)}/datadir'

     self.console = console
     self.refresh_rate = refresh.Hourly
     self.datadir = DataDirectory(datadir_path)
     self.index = Index(self.datadir, MagicMock())
     self.filesystem = Filesystem(self.index, self.refresh_rate)
Пример #2
0
    def test_next_url_returns_none_if_no_url_was_found(self):
        """Test _next_url() returns None if no url was found.

        No url is added to the url source in this test, and the index is
        stubbed so that asking it for a random uncrawled url raises
        EmptySearchResultException.
        """
        index = Index()
        index.random_uncrawled_url = MagicMock(
            side_effect=EmptySearchResultException())
        self.crawler = Crawler(self.path_to_url_source, index)

        # assertIsNone checks identity, which is the precise contract for
        # "no url" and gives a clearer failure message than assertEqual.
        self.assertIsNone(self.crawler._next_url())
Пример #3
0
    def test_crawler_can_read_next_url_from_index(self):
        """Test crawler can read next url from index.

        No url is added to the url source in this test; the index is
        stubbed to hand out a known url. That url must be returned by
        _next_url() and must afterwards be removed from the index by its
        hash.
        """
        index = Index()
        url = Url.from_string('https://example.com/foo')
        index.remove_uncrawled_url = MagicMock()
        index.random_uncrawled_url = MagicMock(return_value=url)
        self.crawler = Crawler(self.path_to_url_source, index)

        expected = Url.from_string('https://example.com/foo').to_string()
        actual = self.crawler._next_url().to_string()

        self.assertEqual(expected, actual)
        index.remove_uncrawled_url.assert_called_with(url.hash())
Пример #4
0
    def test_crawler_can_read_next_url_from_source(self):
        """Test crawler can read next url from source.

        A single url is added to the url source and must be the one
        returned by the first call to _next_url().
        """
        source_url = 'https://example.com'
        self.add_url_source(source_url)
        self.crawler = Crawler(self.path_to_url_source, Index())

        expected = Url.from_string(source_url).to_string()
        actual = self.crawler._next_url().to_string()

        self.assertEqual(expected, actual)
Пример #5
0
def throughput(index: Index, timeframe: int) -> int:
    """Return the throughput reported by the index for a timeframe.

    Thin wrapper that delegates to ``index.calculate_throughput``.

    Args:
        index: Index photos are stored in
        timeframe: timeframe in minutes

    Returns:
        number of photos stored in index during timeframe
        int
    """
    photos_stored = index.calculate_throughput(timeframe)
    return photos_stored
Пример #6
0
    def _update(domain: str, index: idx.Index,
                refresh_rate: Type[RefreshRate]):
        """Refresh the cached most recent capture of a domain.

        Looks up the most recent capture in the index, stores it in
        ``LastCapture.captures`` and records the time of the lookup in
        ``LastCapture.cached_at``, both keyed by domain.

        NOTE(review): there is no self/cls parameter -- presumably this is
        decorated as a static method outside this view.

        Args:
            domain: domain to cache
            index: Index photos are stored in
            refresh_rate: refresh rate capture should be for
        """
        most_recent = index.photos_most_recent_capture_of_domain(
            domain, refresh_rate)

        # the timestamp is taken after the lookup has completed
        LastCapture.captures[domain] = most_recent
        LastCapture.cached_at[domain] = time.time()
Пример #7
0
    def test_crawler_removes_urls_read_from_source(self):
        """Test crawler removes urls read from source.

        Three urls are added to the url source; successive calls to
        _next_url() must return them in the order they were added.
        """
        urls = (
            'https://example.com',
            'https://example.com/foo',
            'https://example.com/bar',
        )
        for url in urls:
            self.add_url_source(url)

        self.crawler = Crawler(self.path_to_url_source, Index())

        # every read is expected to return what is now the first line of
        # the source, i.e. the previously returned url is gone
        for url in urls:
            self.assertEqual(
                Url.from_string(url).to_string(),
                self.crawler._next_url().to_string())

        # NOTE(review): the crawler is re-created here but nothing is
        # asserted afterwards -- confirm whether a final check is missing.
        self.crawler = Crawler(self.path_to_url_source, Index())
Пример #8
0
def _stats_thread(elasticsearch_host: str):
    """Stats thread.

    Prints system and saas statistics every 5th minute for as long as
    ``Controller.SHOULD_RUN`` is truthy. The loop wakes up once per second
    and reports when the number of whole minutes since start is a multiple
    of five that has not been reported yet.

    Each throughput value is fetched once per report and reused for both
    the absolute and the per-minute-average line, so the two lines always
    describe the same reading.

    Args:
        elasticsearch_host: elasticsearch host
    """
    timeframes = (5, 15, 30, 60)  # minutes, in the order they are printed

    start = time.time()
    # starting at 1 suppresses a report at minute 0 (0 % 5 == 0)
    last_print = 1
    while Controller.SHOULD_RUN:

        time.sleep(1)
        mins = int(time.time() - start) // 60
        if mins % 5 != 0 or mins <= last_print:
            continue

        index = Index(host=elasticsearch_host)
        last_print = mins

        counts = [stats.throughput(index, frame) for frame in timeframes]
        # an average is only shown once the thread has been running for at
        # least the whole timeframe
        averages = [
            round(count / frame, 2) if mins >= frame else 'n/a'
            for frame, count in zip(timeframes, counts)
        ]

        t = '[throughput]           5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            *counts)
        ta = '{}  5m: {}, 15m: {}, 30min: {}, 1h: {}'.format(
            '[throughput 1min avg]', *averages)
        load = '[load avg]             1m: {}, 5m: {}, 15min: {}'.format(
            stats.load_avg(1),
            stats.load_avg(5),
            stats.load_avg(15),
        )
        cpu = f'[current cpu usage]    {stats.cpu_usage(10)}%'
        mem = f'[memory usage]         {stats.memory_usage(10)}%'

        for msg in [t, ta, load, cpu, mem]:
            console.p(msg)
Пример #9
0
def _crawler_thread(
    url_file: str,
    ignore_found_urls: bool,
    stay_at_domain: bool,
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Crawler thread.

    Creates a Crawler with its own Index and calls ``tick()`` on it for as
    long as ``Controller.SHOULD_RUN`` is truthy. However the loop ends,
    the thread is marked as no longer running in ``Controller.threads``.

    A missing url file is reported and stops all threads via
    ``Controller.stop_all()``. Any other exception is reported and, when
    ``debug`` is set, re-raised.

    Args:
        url_file: path to url file
        ignore_found_urls: if crawler should ignore new urls found on
            pages it crawls
        stay_at_domain: if crawler should ignore urls from a different
            domain than the one it was found at
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    thread_state = Controller.threads
    try:
        crawler = Crawler(
            url_file=url_file,
            index=Index(host=elasticsearch_host),
            ignore_found_urls=ignore_found_urls,
            stay_at_domain=stay_at_domain,
        )
        while Controller.SHOULD_RUN:
            crawler.tick()
    except UrlFileNotFoundError:
        console.p(f'ERROR: url_file was not found at \'{url_file}\'')
        time.sleep(2)
        # mark this thread as stopped before asking the others to stop
        thread_state[thread_id]['running'] = False
        Controller.stop_all()
    except Exception as exc:
        console.p(f'error occured in crawler thread {thread_id}: {exc}')
        if debug:
            raise exc
    finally:
        thread_state[thread_id]['running'] = False
Пример #10
0
    def start_filesystem(
        mountpoint: str,
        datadir: DataDirectory,
        refresh_rate: Type[refresh.RefreshRate],
        elasticsearch_host: str
    ):
        """Start filesystem process.

        FUSE python library will kill the main process,
        so the main process is forked and the filesystem is mounted
        from the forked process instead. The pid of the forked process
        is stored in ``Controller.FUSE_PID`` of the main process.

        Args:
            mountpoint: where to mount filesystem
            datadir: Data directory to store pictures in
            refresh_rate: Which refresh rate filesystem should use
                for fetching photos
            elasticsearch_host: elasticsearch host

        Returns:
            True if main process, False if the forked process
            bool
        """
        console.p(f'mounting filesystem at: {real_path(mountpoint)}')

        child_pid = os.fork()
        if child_pid == 0:
            # forked process: mount the filesystem; reaching the return
            # below means mount() returned or failed with RuntimeError
            try:
                index = Index(datadir, host=elasticsearch_host)
                Filesystem.mount(mountpoint, index, refresh_rate)
            except RuntimeError as e:
                console.p(f'failed to mount FUSE filesystem: {e}')
            return False

        # main process: remember the child's pid
        Controller.FUSE_PID = child_pid
        return True
Пример #11
0
def _photographer_thread(
    refresh_rate: Type[refresh.RefreshRate],
    datadir: DataDirectory,
    viewport_width: int,
    viewport_height: int,
    viewport_max_height: Optional[int],
    elasticsearch_host: str,
    debug: bool,
    thread_id: str
):
    """Photographer thread.

    Creates a Photographer with its own Index and calls ``tick()`` on it
    for as long as ``Controller.SHOULD_RUN`` is truthy. However the loop
    ends, the thread is marked as no longer running in
    ``Controller.threads``. Exceptions are reported and, when ``debug``
    is set, re-raised.

    Args:
        refresh_rate: How often photographs should be refreshed
        datadir: Data directory to store pictures in
        viewport_width: width of camera viewport
        viewport_height: height of camera viewport
        viewport_max_height: max height of camera viewport
        elasticsearch_host: elasticsearch host
        debug: Display debugging information
        thread_id: id of thread
    """
    try:
        index = Index(host=elasticsearch_host)
        photographer = p.Photographer(
            index,
            refresh_rate,
            datadir,
            viewport_width,
            viewport_height,
            viewport_max_height
        )
        while Controller.SHOULD_RUN:
            photographer.tick()
    except Exception as exc:
        console.p(f'error occured in photographer thread {thread_id}: {exc}')
        if debug:
            raise exc
    finally:
        Controller.threads[thread_id]['running'] = False
Пример #12
0
 def setUp(self):
     """Create the fixtures shared by the tests.

     Builds an Index with default arguments, a data directory located
     next to this test file and the Photographer under test using the
     hourly refresh rate.
     """
     datadir_path = f'{dirname(__file__)}/datadir'

     self.index = Index()
     self.datadir = DataDirectory(datadir_path)
     self.photographer = Photographer(
         self.index, refresh.Hourly, self.datadir)
Пример #13
0
def main():
    """Entry point for saas.

    Parses the command line, verifies the elasticsearch connection and
    configuration, optionally sets up or clears elasticsearch and the
    data directory, mounts the filesystem in a forked process and starts
    the stats, crawler and photographer threads.

    Afterwards the main process loops forever. When ``--stop-if-idle`` is
    non-zero, it stops everything once neither the crawled nor the photos
    index has received a document for that many minutes.

    Exits with status 1 when elasticsearch cannot be reached or is not
    configured. A KeyboardInterrupt or an expired idle timeout stops all
    threads via ``Controller.stop_all()``.
    """
    try:

        parser = arguments.get_argument_parser()
        args = parser.parse_args(sys.argv[1:])

        console.DEBUG = args.debug

        JavascriptSnippets.load()

        index = Index(host=args.elasticsearch_host)

        if not index.ping():
            console.p('ERROR: failed to connect to elasticsearch')
            # non-zero status: this is a failure, not a normal shutdown
            sys.exit(1)

        if not index.verify():
            # an unverified index is acceptable only when this run is
            # about to (re)create the indices
            if not args.setup_elasticsearch and not args.clear_elasticsearch:
                console.p('ERROR: elasticsearch is not configured')
                console.p('       {} {}'.format(
                    'start saas with --setup-elasticsearch',
                    'to configure elasticsearch'))
                sys.exit(1)

        datadir = DataDirectory(args.data_dir, args.optimize_storage)

        refresh_rate = {
            'day': refresh.Daily,
            'hour': refresh.Hourly,
            'minute': refresh.EveryMinute,
        }[args.refresh_rate]

        if args.setup_elasticsearch:
            index.create_indices()

        if args.clear_elasticsearch:
            index.clear()
            index.create_indices()

        if args.clear_data_dir:
            datadir.clear()

        # start_filesystem() forks; it returns False in the forked
        # process, which must not go on to start the worker threads
        if not Controller.start_filesystem(
                mountpoint=args.mountpoint,
                datadir=datadir,
                refresh_rate=refresh_rate,
                elasticsearch_host=args.elasticsearch_host):
            sys.exit()

        Controller.start_stats(elasticsearch_host=args.elasticsearch_host)

        Controller.start_crawlers(amount=args.crawler_threads,
                                  url_file=args.url_file,
                                  ignore_found_urls=args.ignore_found_urls,
                                  stay_at_domain=args.stay_at_domain,
                                  elasticsearch_host=args.elasticsearch_host,
                                  debug=args.debug)

        Controller.start_photographers(
            amount=args.photographer_threads,
            refresh_rate=refresh_rate,
            datadir=datadir,
            viewport_width=args.viewport_width,
            viewport_height=args.viewport_height,
            viewport_max_height=args.viewport_max_height,
            elasticsearch_host=args.elasticsearch_host,
            debug=args.debug)

        while True:

            # 0 disables the idle check; just keep the main process alive
            if args.stop_if_idle == 0:
                time.sleep(10)
                continue

            try:
                crawled = index.timestamp_of_most_recent_document(
                    index.CRAWLED)
                photos = index.timestamp_of_most_recent_document(index.PHOTOS)

                # idle time is measured from the newest document of either
                # index
                timestamp = max(photos, crawled)

                seconds = int(time.time()) - timestamp
                mins = int(seconds / 60)
                if mins >= args.stop_if_idle:
                    console.p(f'was idle for {mins} minutes', end='')
                    raise StopIfIdleTimeoutExpired

            except EmptySearchResultException:
                # raised by the index queries above; nothing to measure
                # yet, so try again on the next round
                pass
            finally:
                time.sleep(2)

    except (KeyboardInterrupt, StopIfIdleTimeoutExpired):
        console.p(' terminating.')
        Controller.stop_all()
        console.p('')
Пример #14
0
class TestFilesystem(unittest.TestCase):
    """Test filesystem class.

    The Index handed to the Filesystem is built with a MagicMock as its
    second argument, and the individual tests replace the Index methods
    they exercise with MagicMocks of their own.
    """
    def setUp(self):
        """Set up test."""
        self.console = console
        self.refresh_rate = refresh.Hourly
        self.datadir = DataDirectory(dirname(__file__) + '/datadir')
        self.index = Index(self.datadir, MagicMock())
        self.filesystem = Filesystem(self.index, self.refresh_rate)

    def tearDown(self):
        """Tear down test."""
        self.datadir.remove_data_dir()

    def _freeze_time(self):
        """Make time.time() return a fixed value for the current test.

        The function that was installed as ``time.time`` before the call
        is restored through ``addCleanup`` so the replacement does not
        leak into tests that run afterwards.
        """
        original = time.time
        self.addCleanup(setattr, time, 'time', original)
        time.time = MagicMock(return_value=original())

    def assertListOfFilesEqual(self, expected: list, actual: list):
        """Assert list of files equal.

        Two lists are equal when they have the same length and every
        element of ``actual`` has the same filename and is an instance of
        the class of the element at the same position in ``expected``.

        Args:
            expected: Expected list of files
            actual: Actual list of files
        """
        msg = 'Failed asserting list of files where equal expected'
        self.assertEqual(len(expected), len(actual), msg=msg)
        for i, file in enumerate(expected):
            self.assertEqual(file.filename, actual[i].filename, msg=msg)
            self.assertIsInstance(cls=file.__class__, obj=actual[i], msg=msg)

    def test_filesystem_can_list_contents_of_root_directory(self):
        """Test filesystem can list root directory."""
        self.index.photos_unique_domains = MagicMock(
            return_value=['example.com', 'example.net'])

        files = self.filesystem._list('/')

        self.assertListOfFilesEqual([
            Directory('.'),
            Directory('..'),
            Directory('example.com'),
            Directory('example.net'),
        ], files)
        self.index.photos_unique_domains.assert_called_with(self.refresh_rate)

    def test_filesystem_can_list_contents_of_domain(self):
        """Test filesystem can list contents of domain."""
        self.index.photos_unique_captures_of_domain = MagicMock(return_value=[
            '2019-01-13H20:00',
            '2019-01-13H21:00',
            '2019-01-13H22:00',
        ])

        expected = [
            Directory('.'),
            Directory('..'),
            Directory('2019-01-13H20:00'),
            Directory('2019-01-13H21:00'),
            Directory('2019-01-13H22:00'),
            Directory(LastCapture.FILENAME),
        ]

        # the listing must be the same with and without a trailing slash
        files = self.filesystem._list('/example.com')
        self.assertListOfFilesEqual(expected, files)

        files = self.filesystem._list('/example.com/')
        self.assertListOfFilesEqual(expected, files)

        self.index.photos_unique_captures_of_domain.assert_called_with(
            'example.com', self.refresh_rate)

    def test_filesystem_can_list_contents_of_capture_at_given_path(self):
        """Test filesystem can list contents of capture at given path."""
        self.index.photos_list_files_in_directory = MagicMock(return_value=[
            'index.png',
            'contact.png',
            'about.png',
        ])
        self.index.photos_list_directories_in_directory = MagicMock(
            return_value=[
                'sub_dir_1',
                'sub_dir_2',
            ])

        expected = [
            Directory('.'),
            Directory('..'),
            File('index.png'),
            File('contact.png'),
            File('about.png'),
            Directory('sub_dir_1'),
            Directory('sub_dir_2'),
        ]

        files = self.filesystem._list('/example.com/2019-01-13H20:00/')
        self.assertListOfFilesEqual(expected, files)

        files = self.filesystem._list('/example.com/2019-01-13H20:00')
        self.assertListOfFilesEqual(expected, files)

        files = self.filesystem._list('/example.com/2019-01-13H20:00/foo/bar/')
        self.assertListOfFilesEqual(expected, files)

        # one index lookup is expected per _list() call above: the capture
        # root is queried as '/', the nested path as '/foo/bar/'
        calls = [
            call('example.com', '2019-01-13H20:00', '/', self.refresh_rate),
            call('example.com', '2019-01-13H20:00', '/', self.refresh_rate),
            call('example.com', '2019-01-13H20:00', '/foo/bar/',
                 self.refresh_rate),
        ]
        self.index.photos_list_files_in_directory.assert_has_calls(calls)
        self.index.photos_list_directories_in_directory.assert_has_calls(calls)

    def test_filesystem_can_get_attributes_of_directory(self):
        """Test filesystem can get attributes of directory."""
        # the expected timestamps below are only comparable when
        # time.time() keeps returning the same value
        self._freeze_time()
        self.index.photos_directory_exists = MagicMock(return_value=True)
        self.index.photos_unique_domains = MagicMock(
            return_value=['example.com'])
        self.index.photos_unique_captures_of_domain = MagicMock(
            return_value=['2019-01-13H20:00'])

        expected = {
            'st_atime': time.time(),
            'st_ctime': time.time(),
            'st_gid': os.getgid(),
            'st_mode': Directory('').ST_MODE,
            'st_mtime': time.time(),
            'st_size': 0,
            'st_uid': os.getuid(),
        }

        attr = self.filesystem._attributes('/')
        self.assertEqual(expected, attr)

        attr = self.filesystem._attributes('/example.com/')
        self.assertEqual(expected, attr)

        attr = self.filesystem._attributes('/example.com/2019-01-13H20:00')
        self.assertEqual(expected, attr)

        attr = self.filesystem._attributes('/example.com/2019-01-13H20:00/')
        self.assertEqual(expected, attr)

        attr = self.filesystem._attributes(
            '/example.com/2019-01-13H20:00/foo/bar')
        self.assertEqual(expected, attr)
        self.index.photos_directory_exists.assert_called_with(
            domain='example.com',
            captured_at='2019-01-13H20:00',
            directory='/foo/bar/',
            refresh_rate=self.refresh_rate)

    def test_filesystem_can_get_attributes_of_file(self):
        """Test filesystem can get attributes of file."""
        # the expected timestamps below are only comparable when
        # time.time() keeps returning the same value
        self._freeze_time()
        self.index.photos_directory_exists = MagicMock(return_value=False)
        self.index.photos_file_exists = MagicMock(
            return_value=123000  # returns filesize
        )

        expected = {
            'st_atime': time.time(),
            'st_ctime': time.time(),
            'st_gid': os.getgid(),
            'st_mode': File('').ST_MODE,
            'st_mtime': time.time(),
            'st_size': 123000,
            'st_uid': os.getuid(),
        }

        attr = self.filesystem._attributes(
            '/example.com/2019-01-13H20:00/index.png')
        self.assertEqual(expected, attr)
        self.index.photos_file_exists.assert_called_with(
            domain='example.com',
            captured_at='2019-01-13H20:00',
            full_filename='/index.png',
            refresh_rate=self.refresh_rate)

    def test_filesystem_can_translate_path_to_file_in_datadir(self):
        """Test filesystem can translate path to file in datadir."""
        datadir_path = PhotoPath(self.datadir)
        url = Url.from_string('https://example.com/foo/bar')
        photo = Screenshot(url, datadir_path, self.refresh_rate)
        self.index.es.index = MagicMock()
        photo.path.filesize = MagicMock(return_value=10000)
        self.index.save_photo(photo)

        self.index.photos_file_exists = MagicMock(return_value=123000)
        self.index.photos_get_photo = MagicMock(return_value=photo)

        path = self.filesystem._translate_path(
            '/example.com/2019-01-13H20:00/foo/bar.png')
        self.assertEqual(datadir_path.full_path(), path)
Пример #15
0
 def setUp(self):
     """Create the fixtures shared by the tests.

     Builds a data directory located next to this test file and an Index
     that receives a MagicMock as its second argument.
     """
     datadir_path = f'{dirname(__file__)}/datadir'

     self.datadir = DataDirectory(datadir_path)
     self.index = Index(self.datadir, MagicMock())
Пример #16
0
class TestIndex(unittest.TestCase):
    """Test index class."""
    def setUp(self):
        """Create the fixtures shared by the tests.

        Builds a data directory located next to this test file and an
        Index that receives a MagicMock as its second argument.
        """
        datadir_path = f'{dirname(__file__)}/datadir'

        self.datadir = DataDirectory(datadir_path)
        self.index = Index(self.datadir, MagicMock())

    def tearDown(self):
        """Remove the data directory used by the test."""
        datadir = self.datadir
        datadir.remove_data_dir()

    def search_returns_doc(self, doc: dict):
        """Make a search against elasticsearch return a single doc.

        Replaces ``self.index.es.search`` with a MagicMock whose return
        value is a hits structure with a total of 1 and the given doc as
        its only hit.

        Args:
            doc: document or partial document to return
        """
        response = {'hits': {'total': 1, 'hits': [doc]}}
        self.index.es.search = MagicMock(return_value=response)

    def search_returns_aggregation(self, index: str, buckets: list):
        """Make a search against elasticsearch return an aggregation.

        Replaces ``self.index.es.search`` with a MagicMock whose return
        value holds the given buckets under the aggregation named after
        ``index``.

        Args:
            index: index being searched
            buckets: buckets that's returned
        """
        response = {'aggregations': {index: {'buckets': buckets}}}
        self.index.es.search = MagicMock(return_value=response)

    def test_recently_crawled_url_can_be_fetched(self):
        """Test recently crawled url can be fetched.

        The search is stubbed to return one crawled document; the index
        must turn it into a Url and must have searched the crawled index
        with the query asserted below.
        """
        hit = {
            '_id': 'xxx...',
            '_source': {
                'url': 'http://example.com',
                'timestamp': 1547229873.257901
            }
        }
        self.search_returns_doc(hit)

        url = self.index.recently_crawled_url(refresh.Hourly)

        self.assertIsInstance(cls=Url, obj=url)
        self.assertEqual('http://example.com', url.to_string())

        status_term = {'term': {'status_code': 200}}
        lock_term = {'term': {'lock_value': refresh.Hourly().lock()}}
        expected_body = {
            'query': {
                'bool': {
                    'must': status_term,
                    'must_not': [lock_term],
                }
            },
            'sort': [{'timestamp': {'order': 'desc'}}],
        }
        self.index.es.search.assert_called_with(
            index='crawled', size=5, body=expected_body)

    def test_lock_can_be_placed_on_crawled_url(self):
        """Test lock can be placed on crawled url.

        Locking a url must update the document with the url's hash as id
        in the crawled index, writing the hourly lock format and value.
        """
        self.index.es.update = MagicMock()
        url = Url.from_string('http://example.com')

        self.index.lock_crawled_url(url, refresh.Hourly)

        expected_doc = {
            'lock_format': refresh.Hourly.lock_format(),
            'lock_value': refresh.Hourly().lock(),
        }
        self.index.es.update.assert_called_with(
            index='crawled',
            doc_type='url',
            id=url.hash(),
            retry_on_conflict=3,
            body={'doc': expected_doc})

    def test_index_can_store_photo(self):
        """Test index can store a photo.

        Saving a photo must index a document in the photos index with the
        photo path's uuid as id and a body built from the photo, the
        hourly refresh rate and the current time.
        """
        self.index.es.index = MagicMock()

        # Freeze time.time() so the expected timestamp below matches the
        # one used while saving. The previous function is restored on
        # cleanup so the replacement does not leak into other tests.
        original_time = time.time
        self.addCleanup(setattr, time, 'time', original_time)
        time.time = MagicMock(return_value=original_time())

        url = Url.from_string('http://example.com')
        path = PhotoPath(self.datadir)
        path.filesize = MagicMock(return_value=10000)

        photo = LoadingPhoto(url=url, path=path, refresh_rate=refresh.Hourly)

        self.index.save_photo(photo)
        self.index.es.index.assert_called_with(
            index='photos',
            doc_type='photo',
            id=path.uuid,
            body={
                'url_id': url.hash(),
                'refresh_rate': refresh.Hourly.lock_format(),
                'captured_at': refresh.Hourly().lock(),
                'filesize': photo.filesize(),
                'filename': photo.filename(),
                'directory': photo.directory(),
                'domain': photo.domain(),
                'timestamp': int(time.time())
            })

    def test_index_can_list_unique_photo_domains(self):
        """Test index can list unique photo domains.

        The aggregation buckets returned by the stubbed search must be
        turned into a list of their keys, and the photos index must have
        been searched with the query asserted below.
        """
        buckets = [{'key': 'example.com'}, {'key': 'example.net'}]
        self.search_returns_aggregation('photos', buckets)

        domains = self.index.photos_unique_domains(refresh.Hourly)

        self.assertEqual(['example.com', 'example.net'], domains)

        refresh_rate_term = {
            'term': {'refresh_rate': refresh.Hourly.lock_format()},
        }
        expected_body = {
            'query': {'bool': {'must': refresh_rate_term}},
            'aggs': {
                'photos': {'terms': {'field': 'domain', 'size': 10000}},
            },
        }
        self.index.es.search.assert_called_with(
            index='photos', size=0, body=expected_body)

    def test_index_can_list_unique_captures_of_domains(self):
        """Test index can list unique captures of domain.

        The aggregation buckets returned by the stubbed search must be
        turned into a list of their keys, and the photos index must have
        been searched with the query asserted below.
        """
        buckets = [{'key': '2019-01-13H20:00'}, {'key': '2019-01-13H21:00'}]
        self.search_returns_aggregation('photos', buckets)

        captures = self.index.photos_unique_captures_of_domain(
            'example.com', refresh.Hourly)

        lock_format = refresh.Hourly.lock_format()
        self.assertEqual(['2019-01-13H20:00', '2019-01-13H21:00'], captures)

        must_terms = [
            {'term': {'domain': 'example.com'}},
            {'term': {'refresh_rate': lock_format}},
        ]
        expected_body = {
            'query': {'bool': {'must': must_terms}},
            'aggs': {
                'photos': {'terms': {'field': 'captured_at', 'size': 10000}},
            },
        }
        self.index.es.search.assert_called_with(
            index='photos', size=0, body=expected_body)

    def test_photo_can_be_retrieved(self):
        """Test photo can be retrieved.

        The search is stubbed to return one photo document; the index
        must turn it into a Photo whose path uuid is the document id, and
        must have searched the photos index with the query asserted below.
        """
        lock_format = refresh.Hourly.lock_format()
        capture = refresh.Hourly().lock()
        stored_photo = {
            'url_id': 'xxx...',
            'refresh_rate': lock_format,
            'captured_at': capture,
            'filename': 'some-filename.png',
            'directory': '/some/path/',
            'domain': 'example.com',
            'filesize': 12300,
            'timestamp': time.time(),
        }
        self.search_returns_doc({
            '_id': 'uuid-xxx...',
            '_source': stored_photo,
        })

        photo = self.index.photos_get_photo(
            domain='example.com',
            captured_at=capture,
            full_filename='/some/path/some-filename.png',
            refresh_rate=refresh.Hourly)

        self.assertIsInstance(cls=Photo, obj=photo)
        self.assertEqual('uuid-xxx...', photo.path.uuid)

        # the full filename is expected to be queried as separate
        # directory and filename terms
        must_terms = [
            {'term': {'domain': 'example.com'}},
            {'term': {'refresh_rate': lock_format}},
            {'term': {'captured_at': capture}},
            {'term': {'directory': '/some/path/'}},
            {'term': {'filename': 'some-filename.png'}},
        ]
        self.index.es.search.assert_called_with(
            index='photos',
            size=1,
            body={'query': {'bool': {'must': must_terms}}})

    def test_directories_within_a_directory_can_be_fetched(self):
        """Test directories within a directory can be fetched.

        The stubbed search returns four directory buckets below
        '/path/to/'. The index is expected to return only the names
        'some', 'not' and 'a', and to have searched the photos index with
        the query asserted below.
        """
        lock_format = refresh.Hourly.lock_format()
        capture = refresh.Hourly().lock()
        stored_directories = (
            '/path/to/some/dir/',
            '/path/to/some/other/dir/',
            '/path/to/not/same/other/dir/',
            '/path/to/a/dir/',
        )
        self.search_returns_aggregation(
            'photos', [{'key': path} for path in stored_directories])

        directories = self.index.photos_list_directories_in_directory(
            domain='example.com',
            captured_at=capture,
            directory='/path/to/',
            refresh_rate=refresh.Hourly)

        self.assertEqual(['some', 'not', 'a'], directories)

        must_terms = [
            {'term': {'domain': 'example.com'}},
            {'term': {'refresh_rate': lock_format}},
            {'term': {'captured_at': capture}},
            {'wildcard': {'directory': '/path/to/*'}},
        ]
        expected_body = {
            'query': {'bool': {'must': must_terms}},
            'aggs': {
                'photos': {'terms': {'field': 'directory', 'size': 10000}},
            },
        }
        self.index.es.search.assert_called_with(
            index='photos', size=0, body=expected_body)