Example #1
    def get_galleries_from_page_links(self, page_links: Iterable[str], page_links_results: List[DataDict]) -> None:

        api_page_links = []

        for page_link in page_links:

            m = re.search(r'(.+)/s/(\w+)/(\d+)-(\d+)', page_link)
            if not m:
                continue
            api_page_links.append(
                {'data': [m.group(3), m.group(2), m.group(4)]})

        api_page_links_chunks = list(chunks(api_page_links, 25))

        for i, group in enumerate(api_page_links_chunks):

            if i % 3 == 2:
                time.sleep(self.settings.wait_timer)

            data = {
                'method': 'gtoken',
                'pagelist': [x['data'] for x in group]}

            headers = {'Content-Type': 'application/json'}

            response = request_with_retries(
                constants.ge_api_url,
                {
                    'data': json.dumps(data),
                    'headers': {**headers, **self.settings.requests_headers},
                    'timeout': self.settings.timeout_timer
                },
                post=True,
                logger=self.logger
            )

            if not response:
                continue
            try:
                response_data = response.json()
            except (ValueError, KeyError):
                self.logger.error("Error parsing response to JSON: {}".format(response.text))
                continue

            for gid_token_pair in response_data['tokenlist']:

                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery_id=gid_token_pair['gid'],
                    link=link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)
                )

                if discard_approved:
                    if not self.settings.silent_processing:
                        self.logger.info(discard_message)
                    continue

                page_links_results.append(
                    {'data': (gid_token_pair['gid'], gid_token_pair['token']),
                     'link': link_from_gid_token_fjord(gid_token_pair['gid'], gid_token_pair['token'], False)})
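
Example #1 depends on two helpers that never appear in this section. The sketches below are reconstructions, not the project's actual code: chunks is assumed to be the usual fixed-size slicer, and link_from_gid_token_fjord is assumed to rebuild the same '/g/<gid>/<token>/' URL shape that the regexes in these examples parse.

def chunks(seq, n):
    # Assumed helper: yields successive n-sized slices of a list, matching how
    # every example here batches work in groups of 25 (or 900 for DB lookups).
    for i in range(0, len(seq), n):
        yield seq[i:i + n]


def link_from_gid_token_fjord(gid, token, fjord):
    # Hypothetical reconstruction: builds the '/g/<gid>/<token>/' URL that the
    # gallery regexes parse, picking the fjord (ex) or regular (ge) site root.
    root = constants.ex_page if fjord else constants.ge_page
    return '{}/g/{}/{}/'.format(root, gid, token)
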
Example #2
    def get_galleries_from_xml(self,
                               url_group: Iterable[str]) -> list[GalleryData]:

        possible_gallery_ids = [
            self.id_from_url(gallery_url) for gallery_url in url_group
        ]

        galleries_ids = [
            gallery_id.replace('mugi-B', 'B')
            for gallery_id in possible_gallery_ids if gallery_id
        ]

        galleries = list()

        gallery_chunks = list(chunks(galleries_ids, 25))

        for i, group in enumerate(gallery_chunks):
            logger.info(
                "Calling API ({}). Gallery group: {}, galleries in group: {}, total groups: {}"
                .format(self.name, i + 1, len(group), len(gallery_chunks)))

            # API doesn't say anything about needing to wait between requests, but we wait just in case.
            if i > 0:
                time.sleep(self.own_settings.wait_timer)

            # Request only the IDs in the current chunk, not the full list.
            link = constants.main_page + '/api/' + self.own_settings.api_key + '/?S=getID&ID=' + ",".join(
                group)

            request_dict = construct_request_dict(self.settings,
                                                  self.own_settings)

            response = request_with_retries(
                link,
                request_dict,
                post=False,
            )

            if not response:
                continue

            response.encoding = 'utf-8'
            api_galleries = convert_api_response_text_to_gallery_dicts(
                response.text)

            if not api_galleries:
                continue
            galleries.extend(api_galleries)

        return galleries
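
construct_request_dict is not shown in this section either. A plausible sketch, inferred from the request dicts built inline in Examples #1, #5 and #8 (headers, timeout, cookies); the real signature and keys are assumptions:

def construct_request_dict(settings, own_settings):
    # Assumed shape: bundles the keyword arguments that the other examples pass
    # to request_with_retries inline.
    request_dict = {
        'headers': settings.requests_headers,
        'timeout': settings.timeout_timer,
    }
    if getattr(own_settings, 'cookies', None):
        request_dict['cookies'] = own_settings.cookies
    return request_dict
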
Example #3
    def crawl_json(self,
                   json_string: str,
                   wanted_filters: Optional[QuerySet] = None,
                   wanted_only: bool = False) -> None:

        if not self.settings.gallery_model:
            return

        dict_list = []
        json_decoded = json.loads(json_string)

        if isinstance(json_decoded, dict):
            dict_list.append(json_decoded)
        elif isinstance(json_decoded, list):
            dict_list = json_decoded

        galleries_gids = []
        found_galleries = set()
        total_galleries_filtered: List[GalleryData] = []
        gallery_wanted_lists: Dict[str,
                                   List['WantedGallery']] = defaultdict(list)

        for gallery in dict_list:
            galleries_gids.append(gallery['gid'])
            gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']),
                                                       timezone.utc)
            gallery_data = GalleryData(**gallery)
            total_galleries_filtered.append(gallery_data)

        for galleries_gid_group in list(chunks(galleries_gids, 900)):
            for found_gallery in self.settings.gallery_model.objects.filter(
                    gid__in=galleries_gid_group):
                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery=found_gallery, link=found_gallery.get_link())

                if discard_approved:
                    self.logger.info(discard_message)
                    found_galleries.add(found_gallery.gid)

        for count, gallery in enumerate(total_galleries_filtered, start=1):

            if gallery.gid in found_galleries:
                continue

            if self.general_utils.discard_by_tag_list(gallery.tags):
                self.logger.info(
                    "Gallery {} of {}: Skipping gallery {}, because it's tagged with global discarded tags"
                    .format(count, len(total_galleries_filtered),
                            gallery.title))
                continue

            if wanted_filters:
                self.compare_gallery_with_wanted_filters(
                    gallery, gallery.link, wanted_filters,
                    gallery_wanted_lists)
                if wanted_only and not gallery_wanted_lists[gallery.gid]:
                    continue

            self.logger.info(
                "Gallery {} of {}: Gallery {} will be processed.".format(
                    count, len(total_galleries_filtered), gallery.title))

            if gallery.thumbnail:
                original_thumbnail_url = gallery.thumbnail_url

                gallery.thumbnail_url = gallery.thumbnail

                gallery_instance = self.settings.gallery_model.objects.update_or_create_from_values(
                    gallery)

                gallery_instance.thumbnail_url = original_thumbnail_url

                gallery_instance.save()
            else:
                self.settings.gallery_model.objects.update_or_create_from_values(
                    gallery)
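
A hedged usage sketch for crawl_json: the method accepts a JSON object or a list of objects, and only 'gid' and the Unix-timestamp 'posted' field are read directly; every other field below is illustrative and simply forwarded to GalleryData.

import json

# 'crawler' stands for a configured instance of the class above.
payload = json.dumps([{
    'gid': '12345',
    'token': 'abcdef1234',         # illustrative field
    'title': 'Example gallery',    # illustrative field
    'posted': 1577836800,          # Unix timestamp; converted to an aware datetime
    'tags': ['language:english'],  # illustrative field
}])
crawler.crawl_json(payload, wanted_filters=None, wanted_only=False)
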
Example #4
    def crawl_urls(self,
                   urls: List[str],
                   wanted_filters: Optional[QuerySet] = None,
                   wanted_only: bool = False) -> None:

        unique_urls = set()
        gallery_data_list = []
        fetch_format_galleries: List[DataDict] = []
        unique_page_urls = set()
        gallery_wanted_lists: Dict[str,
                                   List['WantedGallery']] = defaultdict(list)

        if not self.downloaders:
            self.logger.warning('No downloaders enabled, returning.')
            return

        for url in urls:

            if constants.rss_url in url:
                feed_links = self.crawl_feed(url)
                unique_urls.update(feed_links)
                self.logger.info(
                    "Provided RSS URL for provider ({}), adding {} found links"
                    .format(self.name, len(feed_links)))
                continue

            if (constants.ex_page_short not in url
                    and constants.ge_page_short not in url):
                self.logger.warning("Invalid URL, skipping: {}".format(url))
                continue

            if '/g/' in url:
                if not self.settings.silent_processing:
                    self.logger.info(
                        "Provided URL {} is a gallery link, adding".format(
                            url))
                unique_urls.add(url)
                continue

            if '/s/' in url:
                if not self.settings.silent_processing:
                    self.logger.info(
                        "Provided URL {} is a page link, adding".format(url))
                unique_page_urls.add(url)
                continue

            # Do not crawl main page links if they were submitted anonymously, to prevent spam.
            if len(self.downloaders) == 1 and self.downloaders[0][0].type == 'submit':
                continue

            # assuming main page URLs
            unique_urls.update(self.get_galleries_from_main_page_link(url))

        gallery_ids = []
        found_galleries = set()
        total_galleries_filtered = []
        for gallery_url in unique_urls:

            m = re.search(r'(.+)/g/(\d+)/(\w+)', gallery_url)
            if m:
                gallery_ids.append(m.group(2))
                total_galleries_filtered.append(
                    (gallery_url, m.group(1), m.group(2), m.group(3)))

        for galleries_gid_group in list(chunks(gallery_ids, 900)):
            for found_gallery in Gallery.objects.filter(
                    gid__in=galleries_gid_group):
                discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                    gallery=found_gallery, link=found_gallery.get_link())

                if discard_approved:
                    if not self.settings.silent_processing:
                        self.logger.info(discard_message)
                    found_galleries.add(found_gallery.gid)

        for gallery_tuple in total_galleries_filtered:

            if gallery_tuple[2] not in found_galleries:
                fetch_format_galleries.append({
                    'data': (gallery_tuple[2], gallery_tuple[3]),
                    'root': gallery_tuple[1],
                    'link': gallery_tuple[0]
                })
                if not self.settings.silent_processing:
                    self.logger.info("Gallery {} will be processed. "
                                     "Total galleries: {}".format(
                                         gallery_tuple[0],
                                         len(fetch_format_galleries)))

        if len(unique_page_urls) > 0:
            self.logger.info("Getting gallery links from page links...")
            page_links_results: List[DataDict] = []
            self.get_galleries_from_page_links(unique_page_urls,
                                               page_links_results)
            fetch_format_galleries += page_links_results

        if len(fetch_format_galleries) == 0:
            self.logger.info("No galleries need downloading, returning.")
            return

        fetch_format_galleries_chunks = list(chunks(fetch_format_galleries, 25))
        fjord_galleries = []
        for i, group in enumerate(fetch_format_galleries_chunks):
            # Set based on recommendation in official documentation
            if i % 3 == 2:
                time.sleep(self.settings.wait_timer)
            if not self.settings.silent_processing:
                self.logger.info(
                    "Calling non-fjord API ({}). "
                    "Gallery group: {}, galleries in group: {}, total groups: {}"
                    .format(self.name, i + 1, len(group),
                            len(fetch_format_galleries_chunks)))

            data = utilities.request_data_from_gid_token_iterable(
                [x['data'] for x in group])

            headers = {'Content-Type': 'application/json'}

            response = request_with_retries(
                constants.ge_api_url, {
                    'data': json.dumps(data),
                    'headers': {
                        **headers,
                        **self.settings.requests_headers
                    },
                    'timeout': self.settings.timeout_timer
                },
                post=True,
                logger=self.logger)

            if not response:
                continue

            try:
                response_data = response.json()
            except (ValueError, KeyError):
                self.logger.error("Error parsing response to JSON: {}".format(
                    response.text))
                continue

            for gallery_data in response_data['gmetadata']:
                if 'error' in gallery_data:
                    self.logger.error("Adding gallery {}: "
                                      "failed with error: {}".format(
                                          gallery_data['gid'],
                                          gallery_data['error']))
                    continue
                internal_gallery_data = map_external_gallery_data_to_internal(
                    gallery_data)
                link = link_from_gid_token_fjord(gallery_data['gid'],
                                                 gallery_data['token'], False)

                if self.general_utils.discard_by_tag_list(
                        internal_gallery_data.tags):
                    if not self.settings.silent_processing:
                        self.logger.info(
                            "Skipping gallery {}, because it's tagged with global discarded tags"
                            .format(link))
                    continue

                if wanted_filters:
                    self.compare_gallery_with_wanted_filters(
                        internal_gallery_data, link, wanted_filters,
                        gallery_wanted_lists)
                    if wanted_only and not gallery_wanted_lists[
                            internal_gallery_data.gid]:
                        continue

                m = re.search(constants.default_fjord_tags,
                              ",".join(internal_gallery_data.tags))

                if m and self.own_settings.cookies:
                    fjord_galleries.append(
                        link_from_gid_token_fjord(gallery_data['gid'],
                                                  gallery_data['token'], True))
                else:
                    gallery_data_list.append(internal_gallery_data)

        fjord_galleries_data = self.fetch_multiple_gallery_data(
            fjord_galleries)

        if fjord_galleries_data:
            gallery_data_list.extend(fjord_galleries_data)

        self.pass_gallery_data_to_downloaders(gallery_data_list,
                                              gallery_wanted_lists)
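
Examples #4, #5 and #6 delegate payload construction to utilities.request_data_from_gid_token_iterable. Since every caller reads the response's 'gmetadata' key, the helper presumably issues the standard 'gdata' metadata call; a sketch under that assumption:

def request_data_from_gid_token_iterable(gid_token_iterable):
    # Assumed payload builder: the metadata API that answers with 'gmetadata'
    # takes method 'gdata' and a 'gidlist' of [gid, token] pairs.
    return {
        'method': 'gdata',
        'gidlist': [[gid, token] for gid, token in gid_token_iterable],
        'namespace': 1,  # assumption: request namespaced tags
    }
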
Example #5
    def get_values_from_gallery_link_list(
            self, url_list: Iterable[str]) -> List[GalleryData]:

        gid_token_chunks = list(
            chunks([get_gid_token_from_link(link) for link in url_list], 25))

        galleries_data = []

        for i, group in enumerate(gid_token_chunks):

            if i % 3 == 2:
                time.sleep(self.settings.wait_timer)
            if not self.settings.silent_processing:
                self.logger.info(
                    "Calling fjord API ({}). "
                    "Gallery group: {}, galleries in group: {}, total groups: {}"
                    .format(self.name, i + 1, len(group),
                            len(gid_token_chunks)))

            data = utilities.request_data_from_gid_token_iterable(group)

            headers = {'Content-Type': 'application/json'}

            response = request_with_retries(
                constants.ex_api_url, {
                    'data': json.dumps(data),
                    'headers': {
                        **headers,
                        **self.settings.requests_headers
                    },
                    'cookies': self.own_settings.cookies,
                    'timeout': self.settings.timeout_timer
                },
                post=True,
                logger=self.logger)

            if not response:
                continue

            try:
                response_data = response.json()
            except (ValueError, KeyError):
                self.logger.error("Error parsing response to JSON: {}".format(
                    response.text))
                continue

            for gallery_data in response_data['gmetadata']:
                if 'error' in gallery_data:
                    self.logger.error("Fetching gallery {}: "
                                      "failed with error: {}".format(
                                          gallery_data['gid'],
                                          gallery_data['error']))
                    continue
                internal_gallery_data = map_external_gallery_data_to_internal(
                    gallery_data)
                m = re.search(constants.default_fjord_tags,
                              ",".join(internal_gallery_data.tags))
                internal_gallery_data.fjord = bool(m)
                galleries_data.append(internal_gallery_data)

        return galleries_data
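
get_gid_token_from_link is assumed to be the inverse of link_from_gid_token_fjord. A sketch based on the '/g/<gid>/<token>' regex used in Example #4; handling of links that do not match is left to the caller:

def get_gid_token_from_link(link):
    # Hypothetical: extracts the (gid, token) pair from a '/g/<gid>/<token>' URL.
    m = re.search(r'/g/(\d+)/(\w+)', link)
    if m:
        return m.group(1), m.group(2)
    return None, None
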
Example #6
    def get_values_from_gallery_link_list(self, url_list: Iterable[str], use_fjord: bool = False) -> list[GalleryData]:

        gid_token_chunks = list(chunks([get_gid_token_from_link(link) for link in url_list], 25))

        galleries_data = []

        if self.own_settings.cookies and use_fjord:
            api_page = constants.ex_api_url
        else:
            api_page = constants.ge_api_url

        for i, group in enumerate(gid_token_chunks):

            if i % 3 == 2:
                time.sleep(self.own_settings.wait_timer)
            if not self.settings.silent_processing:
                logger.info(
                    "Calling API ({}), URL: {}. "
                    "Gallery group: {}, galleries in group: {}, total groups: {}".format(
                        self.name,
                        api_page,
                        i + 1,
                        len(group),
                        len(gid_token_chunks)
                    )
                )

            data = utilities.request_data_from_gid_token_iterable(group)

            headers = {'Content-Type': 'application/json'}

            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_dict['headers'] = {**headers, **self.settings.requests_headers}
            request_dict['data'] = json.dumps(data)

            response = request_with_retries(
                api_page,
                request_dict,
                post=True,
            )

            if not response:
                continue

            try:
                response_data = response.json()
            except (ValueError, KeyError):
                logger.error("Could not parse response to JSON: {}".format(response.text))
                continue

            for gallery_data in response_data['gmetadata']:
                if 'error' in gallery_data:
                    logger.error(
                        "Fetching gallery {}: "
                        "failed with error: {}".format(gallery_data['gid'], gallery_data['error'])
                    )
                    continue
                internal_gallery_data = map_external_gallery_data_to_internal(gallery_data)
                if use_fjord and internal_gallery_data.fjord:
                    internal_gallery_data.root = constants.ex_page
                    internal_gallery_data.link = link_from_gid_token_fjord(
                        gallery_data['gid'], gallery_data['token'], True
                    )
                else:
                    internal_gallery_data.root = constants.ge_page
                    internal_gallery_data.link = link_from_gid_token_fjord(
                        gallery_data['gid'], gallery_data['token'], False
                    )
                galleries_data.append(internal_gallery_data)

        return galleries_data
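
A usage sketch for the refactored method in Example #6. The host is illustrative (only the '/g/<gid>/<token>/' path shape matters) and 'parser' stands for a configured instance; use_fjord=True only takes effect when cookies are set, per the api_page selection at the top of the method.

links = [
    'https://example.org/g/123456/0123456789/',
    'https://example.org/g/654321/9876543210/',
]
galleries = parser.get_values_from_gallery_link_list(links, use_fjord=True)
for gallery_data in galleries:
    print(gallery_data.root, gallery_data.link)
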
Example #7
    def crawl_urls(self,
                   urls: list[str],
                   wanted_filters=None,
                   wanted_only: bool = False) -> None:

        for url in urls:

            dict_list = []
            request_dict = construct_request_dict(self.settings,
                                                  self.own_settings)

            if '/archive/' in url:
                match_archive_pk = re.search(r'/archive/(\d+)/', url)
                if match_archive_pk:
                    api_url = urljoin(self.own_settings.url,
                                      constants.api_path)
                    request_dict['params'] = {
                        'archive': match_archive_pk.group(1)
                    }
                    archive_response = request_with_retries(
                        api_url,
                        request_dict,
                        post=False,
                    )
                    if not archive_response:
                        logger.error(
                            "Did not get a response from URL: {}".format(
                                api_url))
                        continue
                    try:
                        json_decoded = archive_response.json()
                    except (ValueError, KeyError):
                        logger.error(
                            "Could not parse response to JSON: {}".format(
                                archive_response.text))
                        continue
                    if json_decoded.get('gallery'):
                        request_dict['params'] = {
                            'gd': json_decoded['gallery']
                        }
                        gallery_response = request_with_retries(
                            api_url,
                            request_dict,
                            post=False,
                        )
                        if not gallery_response:
                            logger.error(
                                "Did not get a response from URL: {}".format(
                                    api_url))
                            continue
                        try:
                            json_decoded = gallery_response.json()
                            dict_list.append(json_decoded)
                        except (ValueError, KeyError):
                            logger.error(
                                "Could not parse response to JSON: {}".format(
                                    gallery_response.text))
                            continue
                    else:
                        logger.error(
                            "Archive: {} does not have an associated Gallery".format(url))
                        continue
            elif '/gallery/' in url:
                match_gallery_pk = re.search(r'/gallery/(\d+)/', url)
                if match_gallery_pk:
                    api_url = urljoin(self.own_settings.url,
                                      constants.api_path)
                    request_dict['params'] = {'gd': match_gallery_pk.group(1)}
                    gallery_response = request_with_retries(
                        api_url,
                        request_dict,
                        post=False,
                    )
                    if not gallery_response:
                        logger.error(
                            "Did not get a response from URL: {}".format(
                                api_url))
                        continue
                    try:
                        json_decoded = gallery_response.json()
                        dict_list.append(json_decoded)
                    except (ValueError, KeyError):
                        logger.error(
                            "Could not parse response to JSON: {}".format(
                                gallery_response.text))
                        continue

            else:
                response = request_with_retries(
                    url,
                    request_dict,
                    post=False,
                )

                if not response:
                    logger.error(
                        "Did not get a response from URL: {}".format(url))
                    continue

                try:
                    json_decoded = response.json()
                except (ValueError, KeyError):
                    logger.error("Could not parse response to JSON: {}".format(
                        response.text))
                    continue

                if isinstance(json_decoded, dict):
                    if 'galleries' in json_decoded:
                        dict_list = json_decoded['galleries']
                    else:
                        dict_list.append(json_decoded)
                elif isinstance(json_decoded, list):
                    dict_list = json_decoded

            galleries_gids = []
            found_galleries = set()
            total_galleries_filtered: list[ChaikaGalleryData] = []
            gallery_wanted_lists: dict[
                str, list['WantedGallery']] = defaultdict(list)

            for gallery in dict_list:
                if 'result' in gallery:
                    continue
                galleries_gids.append(gallery['gid'])
                gallery['posted'] = datetime.fromtimestamp(
                    int(gallery['posted']), timezone.utc)
                gallery_data = ChaikaGalleryData(**gallery)
                total_galleries_filtered.append(gallery_data)

            for galleries_gid_group in list(chunks(galleries_gids, 900)):
                for found_gallery in Gallery.objects.filter(
                        gid__in=galleries_gid_group):
                    discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                        gallery=found_gallery, link=found_gallery.get_link())

                    if discard_approved:
                        logger.info("{} Real GID: {}".format(
                            discard_message, found_gallery.gid))
                        found_galleries.add(found_gallery.gid)

            for count, gallery in enumerate(total_galleries_filtered, start=1):

                if gallery.gid in found_galleries:
                    continue

                discarded_tags = self.general_utils.discard_by_tag_list(
                    gallery.tags)

                if discarded_tags:
                    logger.info(
                        "Skipping gallery link {}, because it's tagged with global discarded tags: {}"
                        .format(gallery.title, discarded_tags))
                    continue

                if wanted_filters:
                    self.compare_gallery_with_wanted_filters(
                        gallery, gallery.link, wanted_filters,
                        gallery_wanted_lists)
                    if wanted_only and not gallery_wanted_lists[gallery.gid]:
                        continue

                logger.info(
                    "Gallery {} of {}: Gallery {} (Real GID: {}) will be processed."
                    .format(count, len(total_galleries_filtered),
                            gallery.title, gallery.gid))

                if gallery.thumbnail:
                    original_thumbnail_url = gallery.thumbnail_url

                    gallery.thumbnail_url = gallery.thumbnail

                    gallery_obj = Gallery.objects.update_or_create_from_values(
                        gallery)

                    gallery_obj.thumbnail_url = original_thumbnail_url

                    gallery_obj.save()
                else:
                    Gallery.objects.update_or_create_from_values(gallery)

                for archive in gallery.archives:
                    gallery.temp_archive = archive
                    self.pass_gallery_data_to_downloaders([gallery],
                                                          gallery_wanted_lists)
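
Example #7 accepts three URL shapes: '/archive/<pk>/' (resolved to its gallery through the API), '/gallery/<pk>/' (fetched directly with the 'gd' parameter), and anything else, which is fetched as-is and parsed as a JSON gallery list. A hedged invocation; host and primary keys are illustrative:

crawler.crawl_urls([
    'https://panda.example/archive/1234/',   # resolved via the 'archive' API param
    'https://panda.example/gallery/5678/',   # fetched via the 'gd' API param
    'https://panda.example/some/listing/',   # any other URL: parsed as a JSON list
])
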
Example #8
    def crawl_urls(self,
                   urls: List[str],
                   wanted_filters=None,
                   wanted_only: bool = False) -> None:

        for url in urls:
            response = request_with_retries(
                url, {
                    'headers': self.settings.requests_headers,
                    'timeout': self.settings.timeout_timer,
                    'cookies': self.own_settings.cookies
                },
                post=False,
                logger=self.logger)

            if not response:
                self.logger.error("Did not get a response from URL: {}".format(url))
                continue

            dict_list = []

            try:
                json_decoded = response.json()
            except (ValueError, KeyError):
                self.logger.error("Error parsing response to JSON: {}".format(
                    response.text))
                continue

            if isinstance(json_decoded, dict):
                if 'galleries' in json_decoded:
                    dict_list = json_decoded['galleries']
                else:
                    dict_list.append(json_decoded)
            elif isinstance(json_decoded, list):
                dict_list = json_decoded

            galleries_gids = []
            found_galleries = set()
            total_galleries_filtered: List[ChaikaGalleryData] = []
            gallery_wanted_lists: Dict[
                str, List['WantedGallery']] = defaultdict(list)

            for gallery in dict_list:
                if 'result' in gallery:
                    continue
                galleries_gids.append(gallery['gid'])
                gallery['posted'] = datetime.fromtimestamp(
                    int(gallery['posted']), timezone.utc)
                gallery_data = ChaikaGalleryData(**gallery)
                total_galleries_filtered.append(gallery_data)

            for galleries_gid_group in list(chunks(galleries_gids, 900)):
                for found_gallery in self.settings.gallery_model.objects.filter(
                        gid__in=galleries_gid_group):
                    discard_approved, discard_message = self.discard_gallery_by_internal_checks(
                        gallery=found_gallery, link=found_gallery.get_link())

                    if discard_approved:
                        self.logger.info(discard_message)
                        found_galleries.add(found_gallery.gid)

            for gallery in total_galleries_filtered:

                if gallery.gid in found_galleries:
                    continue

                if self.general_utils.discard_by_tag_list(gallery.tags):
                    self.logger.info(
                        "Skipping gallery {}, because it's tagged with global discarded tags"
                        .format(gallery.title))
                    continue

                if wanted_filters:
                    self.compare_gallery_with_wanted_filters(
                        gallery, gallery.link, wanted_filters,
                        gallery_wanted_lists)
                    if wanted_only and not gallery_wanted_lists[gallery.gid]:
                        continue

                self.logger.info("Gallery {} will be processed.".format(
                    gallery.title))

                if gallery.thumbnail:
                    original_thumbnail_url = gallery.thumbnail_url

                    gallery.thumbnail_url = gallery.thumbnail

                    gallery_obj = self.settings.gallery_model.objects.update_or_create_from_values(
                        gallery)

                    gallery_obj.thumbnail_url = original_thumbnail_url

                    gallery_obj.save()
                else:
                    self.settings.gallery_model.objects.update_or_create_from_values(
                        gallery)

                for archive in gallery.archives:
                    gallery.archiver_key = archive
                    self.pass_gallery_data_to_downloaders([gallery],
                                                          gallery_wanted_lists)
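
Every example calls discard_gallery_by_internal_checks, but its body is never shown. Callers only rely on its return contract, a (bool, str) pair; a minimal stand-in honouring that contract, where the hidden-flag check is purely hypothetical:

    def discard_gallery_by_internal_checks(self, gallery=None, gallery_id=None, link=None):
        # Contract assumed by every caller: (discard_approved, discard_message).
        if gallery is not None and getattr(gallery, 'hidden', False):
            return True, "Skipping gallery {}: already present and hidden.".format(link)
        return False, ''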