def _crawl(self) -> ImageCollection:
    """Return a collection holding the one image named in the config.

    The configured ``image_uri`` serves as both the image's URI and its
    source. The image is flagged as generic and as a dummy.
    """
    image_uri = self.get_config()["image_uri"]
    dummy = Image(
        uri=image_uri,
        source=image_uri,
        is_generic=True,
        this_is_a_dummy=True,
    )
    collection = ImageCollection()
    collection.add(dummy)
    return collection
def _crawl(self) -> ImageCollection:
    """Build ``self._BUNCH`` generic images and return them as a collection.

    Each image URI comes from ``self._get_image_uri`` called with the
    crawler's config as keyword arguments; the source is a fixed URL.
    """
    collection = ImageCollection()
    for _ in range(self._BUNCH):
        generated = Image(
            uri=self._get_image_uri(**self._config),
            source='https://dummyimage.com/',
            is_generic=True,
        )
        collection.add(generated)
    return collection
def _crawl(self) -> ImageCollection:
    """Build ``self._bunch`` generic images and return them as a collection.

    The config is read once; every image gets a freshly computed URI that is
    used as both ``uri`` and ``source``.
    """
    settings = self.get_config()
    collection = ImageCollection()
    for _ in range(self._bunch):
        image_uri = self._get_image_uri(**settings)
        generated = Image(uri=image_uri, source=image_uri, is_generic=True)
        collection.add(generated)
    return collection
def _crawl(self) -> ImageCollection:
    """Query one page of media edges and collect the images found in them.

    Afterwards ``self._cursor`` holds the page's ``end_cursor`` when the
    response reports a next page, otherwise ``None``.
    """
    collected = ImageCollection()
    query_hash = self._get_query_hash()
    query_uri = self._get_query_uri(
        query_hash,
        self._amount,
        self._cursor,
        **self._get_query_variables()
    )
    response = self._query(query_uri)
    for media_edge in response['edges']:
        edge_images = self.__class__._get_images_from_media_edge_node(media_edge['node'])
        collected.update(edge_images)
    page_info = response['page_info']  # type: Dict[str, Any]
    # don't care if this was the last page ... why not restarting at front when the end is reached?!
    if page_info['has_next_page']:
        next_cursor = page_info['end_cursor']
    else:
        next_cursor = None
    self._cursor = next_cursor
    return collected
def test_image(self) -> None:
    # A plain, non-video node is expected to yield exactly its display image.
    # arrange
    node = {
        'is_video': False,
        'display_url': 'display_url',
        'shortcode': 'shortcode',
    }
    expected_image = Image(
        uri='display_url',
        source='https://instagram.com/p/shortcode/',
    )
    images_expected = ImageCollection()
    images_expected.add(expected_image)
    # act & assert
    self.__test(node, images_expected)
def _crawl(self) -> ImageCollection:
    """Fetch one listing page and collect an image for each usable child.

    A child contributes an image only when ``self._get_image`` returns a
    truthy URI for it. The image source is the child's permalink resolved
    against the URI returned by the fetcher. ``self._after`` is updated from
    the listing afterwards.
    """
    collected = ImageCollection()
    listing_string, uri = self._remote_fetcher.get_string(
        self._get_uri(self._after))
    listing_data = json_loads(listing_string)['data']
    del listing_string  # free up some ram
    for child in listing_data['children']:
        child_data = child['data']
        image_uri = self._get_image(child_data)
        if not image_uri:
            continue
        collected.add(Image(
            uri=image_uri,
            source=urljoin(uri, child_data['permalink']),
        ))
    # don't care if `after` is `None` after the crawl ... why not restarting at front when the end is reached?!
    self._after = listing_data['after']
    return collected
def test_video(self) -> None:
    # A video node is expected to yield no images at all.
    # arrange
    node = {
        'is_video': True,
        'display_url': 'display_url',
        'shortcode': 'shortcode',
    }
    images_expected = ImageCollection()
    # act & assert
    self.__test(node, images_expected)
def test_image_sidecar_video(self) -> None:
    # An image node with a video sidecar child is expected to yield only the
    # node's own display image.
    # arrange
    side_node = {
        'is_video': True,
        'display_url': 'side_display_url',
        'shortcode': 'side_shortcode',
    }
    node = {
        'is_video': False,
        'display_url': 'display_url',
        'shortcode': 'shortcode',
        'edge_sidecar_to_children': {'edges': [{'node': side_node}]},
    }
    images_expected = ImageCollection()
    images_expected.add(Image(
        uri='display_url',
        source='https://instagram.com/p/shortcode/',
    ))
    # act & assert
    self.__test(node, images_expected)
def _crawl(self) -> ImageCollection:
    """Fetch one page of items from the API and return them as images.

    Each item becomes an ``Image`` whose URI and source are built from the
    class' image/post base URLs, with optional ``width`` and ``height``.

    Paging state: when the response reports ``atEnd`` -- or the page holds no
    items at all -- ``self.reset()`` is called. Otherwise ``self._older`` is
    set from the last item (its ``promoted`` value when crawling promoted
    posts, else its ``id``), falling back to ``None`` for a falsy value.
    """
    images = ImageCollection()
    promoted = self._config['promoted']
    api_uri = self._get_api_uri(
        flags=1,
        promoted=promoted,
        tags=self._config.get('tags', None),
        older=self._older,
    )
    response_raw, api_uri = self._remote_fetcher.get_string(api_uri)
    response = json_loads(response_raw)
    items = response['items']
    for item in items:
        images.add(
            Image(
                uri=urljoin(self.__IMG_BASE_URL, str(item['image'])),
                source=urljoin(self.__POST_BASE_URL, str(item['id'])),
                width=item.get('width'),
                height=item.get('height'),
            ))
    # An empty page has no last item to page from; indexing `items[-1]` would
    # raise IndexError, so it is handled like the end of the listing.
    if response['atEnd'] or not items:
        self.reset()
    else:
        paging_key = 'promoted' if promoted else 'id'
        self._older = items[-1][paging_key] or None
    return images
def _get_images_from_media_edge_node(cls, node: Dict[str, Any]) -> ImageCollection:
    """Collect the images of a media edge node and of its sidecar children.

    A node flagged ``is_video`` yields an empty collection. Otherwise the
    node's ``display_url`` is added, followed by the ``display_url`` of every
    non-video child under ``edge_sidecar_to_children``. All images share the
    post URL derived from the node's ``shortcode`` as their source.
    """
    images = ImageCollection()
    # NOTE(review): sidecar children are handled under the non-video guard
    # because `source` is only bound there -- confirm against the original layout.
    if node['is_video']:
        return images
    source = cls._get_post_url(node['shortcode'])
    images.add(Image(
        uri=node['display_url'],
        source=source,
        dimensions=node.get('dimensions'),
    ))
    sidecar = node.get('edge_sidecar_to_children', dict(edges=[]))
    for side_edge in sidecar['edges']:
        side_node = side_edge['node']
        if side_node['is_video']:
            continue
        images.add(Image(
            uri=side_node['display_url'],
            source=source,
            dimensions=side_node.get('dimensions'),
        ))
    return images
def test_crawl_images(self) -> None:
    # Crawl against the file-backed fetcher and compare with the known images.
    # arrange
    crawler = Pr0gramm(flags=1, promoted=True, tags='!s:15000')
    crawler._remote_fetcher = _FILE_FETCHER
    expected_pairs = (
        ('https://img.pr0gramm.com/2018/10/11/ac41a1fbcc3abf09.png',
         'https://pr0gramm.com/new/2782197'),
        ('https://img.pr0gramm.com/2015/07/05/5624d30ec6e743b2.png',
         'https://pr0gramm.com/new/879293'),
    )
    expected_images = ImageCollection()
    for uri, source in expected_pairs:
        expected_images.add(Image(uri=uri, source=source))
    # act
    images = crawler._crawl()
    # assert
    self.assertSetEqual(images, expected_images)
    # sources are irrelevant for equality, need to be checked manually
    matched = [
        (crawled, expected)
        for expected in expected_images
        for crawled in images
        if crawled == expected
    ]
    for crawled, expected in matched:
        self.assertEqual(crawled.source, expected.source)
def test__crawl(self) -> None:
    # Crawl with a pinned query hash and profile id, then compare the cursor
    # and the images with the recorded expectations.
    # arrange
    self.crawler._get_query_hash = lambda: self.__class__._QUERY_HASH  # type: ignore
    self.crawler._get_profile_id = self._get_profile_id  # type: ignore
    expected_cursor = (
        'QVFBbjdTc0dOQ2JQTW1vb1JzMTQxeGpkMEFnTzhYWmh5dFRfMXRWT1VwX28'
        'wMUxkSExpZ2s5SVZfWmM5VWtjYUJrTS0wTW5Va2JqSEpTSUpPcENnN1g1OQ=='
    )
    # (uri, source) pairs; repeated URIs are part of the recorded fixture
    expected_pairs = (
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75467914_150782276185354_'
            '1270489924400076442_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=f293b2a234c82263dfd37b3785e19625&oe=5E812486',
            'https://www.instagram.com/p/B5GWlkfjgWZ/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75467914_150782276185354_'
            '1270489924400076442_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=f293b2a234c82263dfd37b3785e19625&oe=5E812486',
            'https://www.instagram.com/p/B5GWlkfjgWZ/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75349300_522545705143892_'
            '7892885809773901918_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=1d2f00065294117027b17a585b7d05ab&oe=5E4E791D',
            'https://www.instagram.com/p/B5GWlkfjgWZ/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/73512651_760344041101107_'
            '8305449940174698639_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=cc60db8291b59c26af119139ec130edf&oe=5E6B47D3',
            'https://www.instagram.com/p/B5GWlkfjgWZ/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75349296'
            '_565205657598924_1414110584006258216_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=5cd52e00ce5b9571013820950d32a9db&oe=5E4E8A5F',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75349296_'
            '565205657598924_1414110584006258216_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=5cd52e00ce5b9571013820950d32a9db&oe=5E4E8A5F',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/70023824_'
            '3272578002812323_8917281619820840144_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=46773eb8dbebc8db8a5acf07e0d9ee94&oe=5E539F29',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/73480790_830405794055849_'
            '4155404398603377777_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=5ab282415040be2108c1e0e5fadf8a2a&oe=5E50079A',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/74350656_146186010045632_'
            '5113331273863195582_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=67f218bed182ad8bc7ff9118b2d88139&oe=5E5664A6',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75375719_164423041289362_'
            '4914559208372349272_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=b401628c44cf7155f6e7963872782306&oe=5E52153E',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75538167_2161689137265601_'
            '3450507258498841854_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
            '_nc_cat=1&oh=484ebc5923e3755f4b88a9cde4d01e37&oe=5E6CE284',
            'https://www.instagram.com/p/B5GCEIsj-Wl/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/fr/e15/s1080x1080/73401893_805717106534521_'
            '4743540237997542732_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=a036d5fbe716cd7b11d313200e5ba73d&oe=5E538C0A',
            'https://www.instagram.com/p/B5FY2u9j8nG/',
        ),
        (
            'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/fr/e15/s1080x1080/74600036_170329604158818_'
            '2739654930228765968_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
            'oh=84732a7538ab59d3bd1458d400e181fa&oe=5E86FC97',
            'https://www.instagram.com/p/B5FEWeHAT-M/',
        ),
    )
    expected_images = ImageCollection()
    for uri, source in expected_pairs:
        expected_images.add(Image(uri=uri, source=source))
    # act
    images = self.crawler._crawl()
    # assert
    self.assertEqual(self.crawler._cursor, expected_cursor)
    self.assertSetEqual(images, expected_images)
    # sources are irrelevant for equality, need to be checked manually
    matched = [
        (crawled, expected)
        for expected in expected_images
        for crawled in images
        if crawled == expected
    ]
    for crawled, expected in matched:
        self.assertEqual(crawled.source, expected.source)
def test__crawl(self) -> None:
    # Crawl with a pinned query hash, then compare the cursor and the images
    # with the recorded expectations.
    # arrange
    self.crawler._get_query_hash = lambda: self.__class__._QUERY_HASH  # type: ignore
    expected_cursor = (
        'QVFDdV9PUXYxc0hjcU9TYUI5dWZZWmNsOGdSaUsxcU9oUHg5endkc2hiUnV'
        'CZVVDZWFUM2QzdlVvSnN0Z053Q2ItQkxvSGRObm1hdlR5X3dDZ1JKMWduRg=='
    )
    # (source, uri) pairs of the recorded fixture
    expected_pairs = (
        (
            'https://www.instagram.com/p/B5F7QoklRxu/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/75397747_'
            '403992793822822_8324298994393298267_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=110&oh=f3e9d48ba846ef1d7371e450ab8099e7&oe=5E567BF8',
        ),
        (
            'https://www.instagram.com/p/B5F68piFIAs/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/72281363_241905656781018_'
            '6014571893708900461_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=106&oh=8b66604749764befc12a2565a2485649&oe=5E6C6C16',
        ),
        (
            'https://www.instagram.com/p/B5F4v16HTM5/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/70998485_152987982771030_'
            '3346050114145538962_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=106&oh=5cf81cd1f81acf8565656bd4af311717&oe=5E6FC7D5',
        ),
        (
            'https://www.instagram.com/p/B5F4qHBHNrD/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/p1080x1080/74862751_'
            '439529453427565_562257601125925561_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=101&oh=4ae866a0b55b092ef9868e74f19b5623&oe=5E8BA669',
        ),
        (
            'https://www.instagram.com/p/B5F4e14AO9k/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73480811_2530202687056717'
            '_5816769842868095145_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=101&oh=2515a15280e2d1602f672593c9f0da29&oe=5E5116A9',
        ),
        (
            'https://www.instagram.com/p/B5F27k1BxQG/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/71318465_578947536241820_'
            '8770680984207687503_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=108&'
            'oh=8612bc5a2e5ad265d885dbeb5861741c&oe=5E81D4BE',
        ),
        (
            'https://www.instagram.com/p/B5Fz_oqDbrY/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73171316_154443005781309_'
            '5952715908442524817_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=109'
            '&oh=61633aa07eb05b103b799f65a21bd60a&oe=5E6D0DFF',
        ),
        (
            'https://www.instagram.com/p/B5Fzi5oHY-O/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/74600030_100916318024628_'
            '8288469442916469948_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=100'
            '&oh=a8d237a032264976286879a581c0d904&oe=5E4FD98D',
        ),
        (
            'https://www.instagram.com/p/B5Fu-2CgcoG/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/p1080x1080/77151874_'
            '149403059703505_1673985012081769188_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
            '&_nc_cat=101&oh=496e626094a5dc32b60eea9f5785a814&oe=5E4D8D42',
        ),
        (
            'https://www.instagram.com/p/B5FssMBFyUL/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73381083_998377233839407_'
            '3873966681762746452_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=107'
            '&oh=651b7a89927a465d758679c599c9aa18&oe=5E687FE2',
        ),
        (
            'https://www.instagram.com/p/B5FpfZ2B4t1/',
            'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73300147_1391854841147986_'
            '2913574931708576695_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=110'
            '&oh=18800daa9416377ed6b870acc0502e2a&oe=5E6E8495',
        ),
    )
    expected_images = ImageCollection()
    for source, uri in expected_pairs:
        expected_images.add(Image(source=source, uri=uri))
    # act
    images = self.crawler._crawl()
    # assert
    self.assertEqual(self.crawler._cursor, expected_cursor)
    self.assertSetEqual(images, expected_images)
    # sources are irrelevant for equality, need to be checked manually
    matched = [
        (crawled, expected)
        for expected in expected_images
        for crawled in images
        if crawled == expected
    ]
    for crawled, expected in matched:
        self.assertEqual(crawled.source, expected.source)
def test_crawl(self) -> None:
    # Crawl once, then compare the `after` token and the images with the
    # recorded expectations.
    # arrange
    expected_after = 't3_dqx42l'
    # (uri, source) pairs of the recorded fixture
    expected_pairs = (
        ('https://i.redd.it/kl3dp9sy5fw31.jpg',
         'https://www.reddit.com/r/aww/comments/dqx0z4/a_very_photogenic_noodle/'),
        ('https://i.redd.it/4ltnvj5irdw31.jpg',
         'https://www.reddit.com/r/aww/comments/dqud6w/3/'),
        ('https://i.redd.it/nkfjoej8yew31.png',
         'https://www.reddit.com/r/aww/comments/dqwp8l/left_the_house_for_10_minutes_and_came_back_to/'),
        ('https://i.redd.it/gcxqswv8igw31.png',
         'https://www.reddit.com/r/aww/comments/dqz6iz/blind_cutie/'),
        ('https://i.redd.it/hywobahj9ew31.png',
         'https://www.reddit.com/r/aww/comments/dqvgm9/i_asked_this_guy_if_he_knocked_over_the_treats/'),
        ('https://i.redd.it/j4qda3c9scw31.jpg',
         'https://www.reddit.com/r/aww/comments/dqrxiq/admiral_anchovies_is_two_weeks_old_and_has/'),
        ('https://i.imgur.com/O2bVClA.jpg',
         'https://www.reddit.com/r/aww/comments/dqsk7y/meet_our_new_3_month_old_baby_bucko_the_beagle/'),
        ('https://imgur.com/82LxoWj.jpg',
         'https://www.reddit.com/r/aww/comments/dqtdo7/im_one_of_a_kind/'),
    )
    expected_images = ImageCollection()
    for uri, source in expected_pairs:
        expected_images.add(Image(uri=uri, source=source))
    # act
    images = self.crawler._crawl()
    # assert
    self.assertEqual(self.crawler._after, expected_after)
    self.assertSetEqual(images, expected_images)
    # sources are irrelevant for equality, need to be checked manually
    matched = [
        (crawled, expected)
        for expected in expected_images
        for crawled in images
        if crawled == expected
    ]
    for crawled, expected in matched:
        self.assertEqual(crawled.source, expected.source)