Пример #1
0
 def _crawl(self) -> ImageCollection:
     """Return a collection holding a single image built from the config.

     The ``image_uri`` config entry serves as both the image's URI and its
     source; the image is flagged as generic and as a dummy.
     """
     collection = ImageCollection()
     image_uri = self.get_config()["image_uri"]
     dummy = Image(
         uri=image_uri,
         source=image_uri,
         is_generic=True,
         this_is_a_dummy=True,
     )
     collection.add(dummy)
     return collection
Пример #2
0
 def _crawl(self) -> ImageCollection:
     """Collect ``self._BUNCH`` generic images.

     Each image's URI is generated by ``self._get_image_uri`` from the
     keyword arguments stored in ``self._config``.
     """
     collection = ImageCollection()
     for _ in range(self._BUNCH):
         generated_uri = self._get_image_uri(**self._config)
         dummy = Image(
             uri=generated_uri,
             source='https://dummyimage.com/',
             is_generic=True,
         )
         collection.add(dummy)
     return collection
Пример #3
0
 def _crawl(self) -> ImageCollection:
     """Collect ``self._bunch`` generic images.

     Every generated URI is used as both the image's URI and its source.
     """
     collection = ImageCollection()
     settings = self.get_config()
     for _ in range(self._bunch):
         generated = self._get_image_uri(**settings)
         collection.add(Image(uri=generated, source=generated, is_generic=True))
     return collection
Пример #4
0
 def _crawl(self) -> ImageCollection:
     """Query one page of media edges and collect the images of every node.

     Side effect: ``self._cursor`` is set to the page's ``end_cursor`` when
     the response reports a next page, otherwise to ``None``.
     """
     collection = ImageCollection()
     query_hash = self._get_query_hash()
     query_uri = self._get_query_uri(
         query_hash, self._amount, self._cursor, **self._get_query_variables())
     page = self._query(query_uri)
     for edge in page['edges']:
         node_images = self.__class__._get_images_from_media_edge_node(edge['node'])
         collection.update(node_images)
     page_info = page['page_info']  # type: Dict[str, Any]
     # A cursor of None after the last page is fine: the next crawl simply
     # starts over from the front.
     if page_info['has_next_page']:
         next_cursor = page_info['end_cursor']
     else:
         next_cursor = None
     self._cursor = next_cursor
     return collection
Пример #5
0
 def test_image(self) -> None:
     # A non-video node is handed to the shared helper together with the
     # single image expected for it.
     # arrange
     node = {
         'is_video': False,
         'display_url': 'display_url',
         'shortcode': 'shortcode',
     }
     images_expected = ImageCollection()
     expected_image = Image(
         uri='display_url',
         source='https://instagram.com/p/shortcode/',
     )
     images_expected.add(expected_image)
     # act & assert
     self.__test(node, images_expected)
Пример #6
0
 def _crawl(self) -> ImageCollection:
     """Fetch one listing page and collect an image for each child that has one.

     Children for which ``self._get_image`` returns a falsy value are skipped.
     Side effect: ``self._after`` is set to the listing's ``after`` value.
     """
     collection = ImageCollection()
     raw_listing, fetched_uri = self._remote_fetcher.get_string(
         self._get_uri(self._after))
     listing_data = json_loads(raw_listing)['data']
     del raw_listing  # free up some ram
     for child in listing_data['children']:
         post = child['data']
         image_uri = self._get_image(post)
         if not image_uri:
             continue
         # the permalink is resolved against the URI the listing was fetched from
         post_uri = urljoin(fetched_uri, post['permalink'])
         collection.add(Image(uri=image_uri, source=post_uri))
     # `after` may be `None` once the end is reached; the next crawl then
     # simply restarts at the front.
     self._after = listing_data['after']
     return collection
Пример #7
0
 def test_video(self) -> None:
     # A video node is handed to the shared helper together with an empty
     # expected collection.
     # arrange
     node = {
         'is_video': True,
         'display_url': 'display_url',
         'shortcode': 'shortcode',
     }
     images_expected = ImageCollection()
     # act & assert
     self.__test(node, images_expected)
Пример #8
0
 def test_image_sidecar_video(self) -> None:
     # An image node carrying one video sidecar child: only the main image
     # is part of the expected collection.
     # arrange
     side_node = {
         'is_video': True,
         'display_url': 'side_display_url',
         'shortcode': 'side_shortcode',
     }
     node = {
         'is_video': False,
         'display_url': 'display_url',
         'shortcode': 'shortcode',
         'edge_sidecar_to_children': {'edges': [{'node': side_node}]},
     }
     images_expected = ImageCollection()
     expected_image = Image(
         uri='display_url',
         source='https://instagram.com/p/shortcode/',
     )
     images_expected.add(expected_image)
     # act & assert
     self.__test(node, images_expected)
Пример #9
0
 def _crawl(self) -> ImageCollection:
     """Fetch one API page and collect an image for every returned item.

     Side effects: when the response is flagged ``atEnd`` the crawler is
     reset; otherwise ``self._older`` is taken from the last item (its
     ``promoted`` value when crawling promoted posts, else its ``id``),
     with falsy values stored as ``None``.
     """
     collection = ImageCollection()
     promoted = self._config['promoted']
     request_uri = self._get_api_uri(flags=1,
                                     promoted=promoted,
                                     tags=self._config.get('tags', None),
                                     older=self._older)
     raw_response, _ = self._remote_fetcher.get_string(request_uri)
     payload = json_loads(raw_response)
     items = payload['items']
     for item in items:
         image_uri = urljoin(self.__IMG_BASE_URL, str(item['image']))
         post_uri = urljoin(self.__POST_BASE_URL, str(item['id']))
         collection.add(
             Image(
                 uri=image_uri,
                 source=post_uri,
                 width=item.get('width'),
                 height=item.get('height'),
             ))
     if payload['atEnd']:
         self.reset()
         return collection
     cursor_key = 'promoted' if promoted else 'id'
     self._older = items[-1][cursor_key] or None
     return collection
Пример #10
0
 def _get_images_from_media_edge_node(cls, node: Dict[str, Any]) -> ImageCollection:
     """Collect the images of a media edge node and of its sidecar children.

     A video node yields an empty collection. Otherwise the node's own image
     is added, followed by every non-video sidecar child. All images share
     the post URL derived from the node's ``shortcode`` as their source.
     """
     collection = ImageCollection()
     if node['is_video']:
         return collection
     post_url = cls._get_post_url(node['shortcode'])
     collection.add(Image(
         uri=node['display_url'],
         source=post_url,
         dimensions=node.get('dimensions'),
     ))
     # nodes without sidecar children are treated as having no side edges
     sidecar = node.get('edge_sidecar_to_children', dict(edges=[]))
     for side_edge in sidecar['edges']:
         side_node = side_edge['node']
         if side_node['is_video']:
             continue
         collection.add(Image(
             uri=side_node['display_url'],
             source=post_url,
             dimensions=side_node.get('dimensions'),
         ))
     return collection
Пример #11
0
 def test_crawl_images(self) -> None:
     # Crawl once with the file-backed fetcher and compare the result with
     # the two expected images.
     # arrange
     crawler = Pr0gramm(flags=1, promoted=True, tags='!s:15000')
     crawler._remote_fetcher = _FILE_FETCHER
     fixtures = (
         ('https://img.pr0gramm.com/2018/10/11/ac41a1fbcc3abf09.png',
          'https://pr0gramm.com/new/2782197'),
         ('https://img.pr0gramm.com/2015/07/05/5624d30ec6e743b2.png',
          'https://pr0gramm.com/new/879293'),
     )
     expected_images = ImageCollection()
     for fixture_uri, fixture_source in fixtures:
         expected_images.add(Image(uri=fixture_uri, source=fixture_source))
     # act
     images = crawler._crawl()
     # assert
     self.assertSetEqual(images, expected_images)
     for expected in expected_images:
         for actual in images:
             if actual == expected:
                 # sources are irrelevant for equality, need to be checked manually
                 self.assertEqual(actual.source, expected.source)
Пример #12
0
    def test__crawl(self) -> None:
        # Crawl once with a patched query hash and profile id, then compare
        # the resulting cursor and images with the fixtures below.
        # arrange
        self.crawler._get_query_hash = lambda: self.__class__._QUERY_HASH  # type: ignore
        self.crawler._get_profile_id = self._get_profile_id  # type: ignore
        expected_cursor = (
            'QVFBbjdTc0dOQ2JQTW1vb1JzMTQxeGpkMEFnTzhYWmh5dFRfMXRWT1VwX28'
            'wMUxkSExpZ2s5SVZfWmM5VWtjYUJrTS0wTW5Va2JqSEpTSUpPcENnN1g1OQ=='
        )
        # (uri, source) pairs, in the order they are added to the collection
        fixtures = (
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75467914_150782276185354_'
                '1270489924400076442_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=f293b2a234c82263dfd37b3785e19625&oe=5E812486',
                'https://www.instagram.com/p/B5GWlkfjgWZ/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75467914_150782276185354_'
                '1270489924400076442_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=f293b2a234c82263dfd37b3785e19625&oe=5E812486',
                'https://www.instagram.com/p/B5GWlkfjgWZ/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/75349300_522545705143892_'
                '7892885809773901918_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=1d2f00065294117027b17a585b7d05ab&oe=5E4E791D',
                'https://www.instagram.com/p/B5GWlkfjgWZ/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/73512651_760344041101107_'
                '8305449940174698639_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=cc60db8291b59c26af119139ec130edf&oe=5E6B47D3',
                'https://www.instagram.com/p/B5GWlkfjgWZ/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75349296'
                '_565205657598924_1414110584006258216_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=5cd52e00ce5b9571013820950d32a9db&oe=5E4E8A5F',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75349296_'
                '565205657598924_1414110584006258216_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=5cd52e00ce5b9571013820950d32a9db&oe=5E4E8A5F',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/70023824_'
                '3272578002812323_8917281619820840144_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=46773eb8dbebc8db8a5acf07e0d9ee94&oe=5E539F29',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/73480790_830405794055849_'
                '4155404398603377777_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=5ab282415040be2108c1e0e5fadf8a2a&oe=5E50079A',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/74350656_146186010045632_'
                '5113331273863195582_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=67f218bed182ad8bc7ff9118b2d88139&oe=5E5664A6',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75375719_164423041289362_'
                '4914559208372349272_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=b401628c44cf7155f6e7963872782306&oe=5E52153E',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/e35/s1080x1080/75538167_2161689137265601_'
                '3450507258498841854_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&'
                '_nc_cat=1&oh=484ebc5923e3755f4b88a9cde4d01e37&oe=5E6CE284',
                'https://www.instagram.com/p/B5GCEIsj-Wl/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/fr/e15/s1080x1080/73401893_805717106534521_'
                '4743540237997542732_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=a036d5fbe716cd7b11d313200e5ba73d&oe=5E538C0A',
                'https://www.instagram.com/p/B5FY2u9j8nG/',
            ),
            (
                'https://scontent-frt3-1.cdninstagram.com/v/t51.2885-15/fr/e15/s1080x1080/74600036_170329604158818_'
                '2739654930228765968_n.jpg?_nc_ht=scontent-frt3-1.cdninstagram.com&_nc_cat=1&'
                'oh=84732a7538ab59d3bd1458d400e181fa&oe=5E86FC97',
                'https://www.instagram.com/p/B5FEWeHAT-M/',
            ),
        )
        expected_images = ImageCollection()
        for fixture_uri, fixture_source in fixtures:
            expected_images.add(Image(uri=fixture_uri, source=fixture_source))

        # act
        images = self.crawler._crawl()
        # assert
        self.assertEqual(self.crawler._cursor, expected_cursor)
        self.assertSetEqual(images, expected_images)
        for expected in expected_images:
            for actual in images:
                if actual == expected:
                    # sources are irrelevant for equality, need to be checked manually
                    self.assertEqual(actual.source, expected.source)
Пример #13
0
    def test__crawl(self) -> None:
        # Crawl once with a patched query hash, then compare the resulting
        # cursor and images with the fixtures below.
        # arrange
        self.crawler._get_query_hash = lambda: self.__class__._QUERY_HASH  # type: ignore
        expected_cursor = (
            'QVFDdV9PUXYxc0hjcU9TYUI5dWZZWmNsOGdSaUsxcU9oUHg5endkc2hiUnV'
            'CZVVDZWFUM2QzdlVvSnN0Z053Q2ItQkxvSGRObm1hdlR5X3dDZ1JKMWduRg=='
        )
        # (source, uri) pairs, in the order they are added to the collection
        fixtures = (
            (
                'https://www.instagram.com/p/B5F7QoklRxu/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/75397747_'
                '403992793822822_8324298994393298267_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=110&oh=f3e9d48ba846ef1d7371e450ab8099e7&oe=5E567BF8',
            ),
            (
                'https://www.instagram.com/p/B5F68piFIAs/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/72281363_241905656781018_'
                '6014571893708900461_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=106&oh=8b66604749764befc12a2565a2485649&oe=5E6C6C16',
            ),
            (
                'https://www.instagram.com/p/B5F4v16HTM5/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/70998485_152987982771030_'
                '3346050114145538962_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=106&oh=5cf81cd1f81acf8565656bd4af311717&oe=5E6FC7D5',
            ),
            (
                'https://www.instagram.com/p/B5F4qHBHNrD/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/p1080x1080/74862751_'
                '439529453427565_562257601125925561_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=101&oh=4ae866a0b55b092ef9868e74f19b5623&oe=5E8BA669',
            ),
            (
                'https://www.instagram.com/p/B5F4e14AO9k/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73480811_2530202687056717'
                '_5816769842868095145_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=101&oh=2515a15280e2d1602f672593c9f0da29&oe=5E5116A9',
            ),
            (
                'https://www.instagram.com/p/B5F27k1BxQG/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/71318465_578947536241820_'
                '8770680984207687503_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=108&'
                'oh=8612bc5a2e5ad265d885dbeb5861741c&oe=5E81D4BE',
            ),
            (
                'https://www.instagram.com/p/B5Fz_oqDbrY/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73171316_154443005781309_'
                '5952715908442524817_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=109'
                '&oh=61633aa07eb05b103b799f65a21bd60a&oe=5E6D0DFF',
            ),
            (
                'https://www.instagram.com/p/B5Fzi5oHY-O/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/74600030_100916318024628_'
                '8288469442916469948_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=100'
                '&oh=a8d237a032264976286879a581c0d904&oe=5E4FD98D',
            ),
            (
                'https://www.instagram.com/p/B5Fu-2CgcoG/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/p1080x1080/77151874_'
                '149403059703505_1673985012081769188_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com'
                '&_nc_cat=101&oh=496e626094a5dc32b60eea9f5785a814&oe=5E4D8D42',
            ),
            (
                'https://www.instagram.com/p/B5FssMBFyUL/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73381083_998377233839407_'
                '3873966681762746452_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=107'
                '&oh=651b7a89927a465d758679c599c9aa18&oe=5E687FE2',
            ),
            (
                'https://www.instagram.com/p/B5FpfZ2B4t1/',
                'https://scontent-frx5-1.cdninstagram.com/v/t51.2885-15/e35/73300147_1391854841147986_'
                '2913574931708576695_n.jpg?_nc_ht=scontent-frx5-1.cdninstagram.com&_nc_cat=110'
                '&oh=18800daa9416377ed6b870acc0502e2a&oe=5E6E8495',
            ),
        )
        expected_images = ImageCollection()
        for fixture_source, fixture_uri in fixtures:
            expected_images.add(Image(source=fixture_source, uri=fixture_uri))
        # act
        images = self.crawler._crawl()
        # assert
        self.assertEqual(self.crawler._cursor, expected_cursor)
        self.assertSetEqual(images, expected_images)
        for expected in expected_images:
            for actual in images:
                if actual == expected:
                    # sources are irrelevant for equality, need to be checked manually
                    self.assertEqual(actual.source, expected.source)
Пример #14
0
 def test_crawl(self) -> None:
     # Crawl once, then compare the resulting `after` marker and images
     # with the fixtures below.
     # arrange
     expected_after = 't3_dqx42l'
     # (uri, source) pairs, in the order they are added to the collection
     fixtures = (
         ('https://i.redd.it/kl3dp9sy5fw31.jpg',
          'https://www.reddit.com/r/aww/comments/dqx0z4/a_very_photogenic_noodle/'),
         ('https://i.redd.it/4ltnvj5irdw31.jpg',
          'https://www.reddit.com/r/aww/comments/dqud6w/3/'),
         ('https://i.redd.it/nkfjoej8yew31.png',
          'https://www.reddit.com/r/aww/comments/dqwp8l/left_the_house_for_10_minutes_and_came_back_to/'),
         ('https://i.redd.it/gcxqswv8igw31.png',
          'https://www.reddit.com/r/aww/comments/dqz6iz/blind_cutie/'),
         ('https://i.redd.it/hywobahj9ew31.png',
          'https://www.reddit.com/r/aww/comments/dqvgm9/i_asked_this_guy_if_he_knocked_over_the_treats/'),
         ('https://i.redd.it/j4qda3c9scw31.jpg',
          'https://www.reddit.com/r/aww/comments/dqrxiq/admiral_anchovies_is_two_weeks_old_and_has/'),
         ('https://i.imgur.com/O2bVClA.jpg',
          'https://www.reddit.com/r/aww/comments/dqsk7y/meet_our_new_3_month_old_baby_bucko_the_beagle/'),
         ('https://imgur.com/82LxoWj.jpg',
          'https://www.reddit.com/r/aww/comments/dqtdo7/im_one_of_a_kind/'),
     )
     expected_images = ImageCollection()
     for fixture_uri, fixture_source in fixtures:
         expected_images.add(Image(uri=fixture_uri, source=fixture_source))
     # act
     images = self.crawler._crawl()
     # assert
     self.assertEqual(self.crawler._after, expected_after)
     self.assertSetEqual(images, expected_images)
     for expected in expected_images:
         for actual in images:
             if actual == expected:
                 # sources are irrelevant for equality, need to be checked manually
                 self.assertEqual(actual.source, expected.source)