def _limited_call(self, func, *args, **kwargs): """Rate limit calls to a function. """ # Check seconds that have passed now = datetime.datetime.now() diff = (now - self._rate_limit_start).total_seconds() if diff >= 60: # If greater than a minute, reset the rate limit self._rate_limit_count = 0 self._rate_limit_start = now else: # Check if the per-minute limit has been exceeded if self._rate_limit_count >= constants.FA_PAGE_REQUESTS_PER_MINUTE: # Wait until next minute, then reset the count/time wait_time = 60 - diff logger.debug("Hit rate limit, waiting %d seconds" % wait_time) time.sleep(wait_time) self._rate_limit_count = 0 self._rate_limit_start = datetime.datetime.now() self._rate_limit_count += 1 return func(*args, **kwargs)
def _load_folders(self):
    """Rebuild ``self._root_folders`` from the user-view API endpoint.

    Fetches ``/api/users/<username>/view`` under ``constants.WZL_ROOT``
    and walks the ``folders`` list of the JSON response. Folder objects
    are cached in ``self._folders`` by ``folder_id``; a cached object is
    reused and has its title and children refreshed. Entries under a
    folder's optional ``subfolders`` key become that folder's children.
    """
    logger.debug("Loading folders")
    self._root_folders = []

    def sync_folder(struct):
        # Reuse the cached Folder for this id, or register a new one,
        # then refresh the fields taken from the API response.
        node = self._folders.get(struct["folder_id"])
        if node is None:
            node = Folder()
            node._session = self
            node.id = struct["folder_id"]
            self._folders[node.id] = node
        node.title = struct["title"]
        node.children = []
        return node

    endpoint = constants.WZL_ROOT + "/api/users/%s/view" % self.username
    payload = self._requests.get(endpoint).json()

    for top_struct in payload["folders"]:
        top = sync_folder(top_struct)
        self._root_folders.append(top)
        if "subfolders" not in top_struct:
            continue
        for child_struct in top_struct["subfolders"]:
            top.children.append(sync_folder(child_struct))
def _scan_folder(self, folder):
    """Scan one gallery folder and store the result on the folder.

    Builds the paged folder URL (the remaining ``%d`` is the page
    placeholder consumed by ``_scan_submission_page``) and replaces
    ``folder.submissions`` with a new list of the submissions found.

    Args:
        folder: Folder object; its ``id`` is formatted with ``%d`` into
            the URL and its ``submissions`` attribute is overwritten.
    """
    logger.debug("Scanning folder %r" % folder)
    root = constants.FA_ROOT
    page_path = "/gallery/%s/folder/%d/-/%%d/" % (self.username, folder.id)
    found = self._scan_submission_page(root + page_path)
    # Copy into a fresh list so the folder owns its own container.
    folder.submissions = list(found)
def _scan_gallery(self, folder_id=None):
    """Collect every submission in the user's gallery, or in one folder.

    Pages through ``/api/users/<username>/gallery`` under
    ``constants.WZL_ROOT``, passing the ``nextid`` cursor from each
    response to the next request until no cursor is returned.

    Args:
        folder_id: Optional folder id, sent as the ``folderid`` query
            parameter. ``None`` scans the whole gallery.

    Returns:
        A list of the objects produced by
        ``self._load_submission_from_struct``, in the order received.
        When ``folder_id`` is ``None`` the same list is also stored on
        ``self._gallery_submissions``.
    """
    logger.debug("Scanning gallery folder %r", folder_id)
    url = constants.WZL_ROOT + "/api/users/%s/gallery" % self.username
    submissions = []
    next_id = None

    while True:
        params = {}
        if next_id is not None:
            params["nextid"] = next_id
        if folder_id is not None:
            params["folderid"] = folder_id

        data = self._requests.get(url, params=params).json()

        page = [
            self._load_submission_from_struct(sub_struct)
            for sub_struct in data["submissions"]
        ]
        submissions.extend(page)
        # Log every page, including the last one, before deciding
        # whether to stop.
        logger.debug("Found %d submissions", len(page))

        # A missing cursor is treated like an explicit null: no more
        # pages. NOTE(review): assumes the API never omits "nextid" while
        # more pages exist - confirm against the API documentation.
        next_id = data.get("nextid")
        if next_id is None:
            break

    if folder_id is None:
        self._gallery_submissions = submissions
    return submissions
def _load_folders(self):
    """Rebuild ``self._root_folders`` from the user-view API endpoint.

    Fetches ``/api/users/<username>/view`` under ``constants.WZL_ROOT``
    and walks the ``folders`` list of the JSON response. Folder objects
    are cached in ``self._folders`` by ``folder_id``; a cached object is
    reused and has its title and children refreshed. Entries under a
    folder's optional ``subfolders`` key become that folder's children.
    """
    logger.debug("Loading folders")
    self._root_folders = []

    def sync_folder(struct):
        # Reuse the cached Folder for this id, or register a new one,
        # then refresh the fields taken from the API response.
        node = self._folders.get(struct["folder_id"])
        if node is None:
            node = Folder()
            node._session = self
            node.id = struct["folder_id"]
            self._folders[node.id] = node
        node.title = struct["title"]
        node.children = []
        return node

    endpoint = constants.WZL_ROOT + "/api/users/%s/view" % self.username
    payload = self._requests.get(endpoint).json()

    for top_struct in payload["folders"]:
        top = sync_folder(top_struct)
        self._root_folders.append(top)
        if "subfolders" not in top_struct:
            continue
        for child_struct in top_struct["subfolders"]:
            top.children.append(sync_folder(child_struct))
def _scan_scraps(self):
    """Scan the paged scraps listing for ``self.username``.

    Returns:
        The value returned by ``_scan_submission_page`` for the scraps
        URL (the remaining ``%d`` is its page placeholder).
    """
    logger.debug("Scanning scraps")
    url_format = constants.FA_ROOT + "/scraps/%s/%%d/" % self.username
    return self._scan_submission_page(url_format)
def _scan_gallery(self):
    """Scan the paged gallery listing for ``self.username``.

    Returns:
        The value returned by ``_scan_submission_page`` for the gallery
        URL (the remaining ``%d`` is its page placeholder).
    """
    logger.debug("Scanning gallery")
    url_format = constants.FA_ROOT + "/gallery/%s/%%d/" % self.username
    return self._scan_submission_page(url_format)
def _scan_submission_page(self, url_format):
    """Return submissions found in pages of a base url.

    Pages are fetched through ``self._limited_call`` starting at page 1;
    scanning stops at the first page that yields no submissions.
    Submission objects are cached in ``self._submissions`` by id: a known
    id is updated in place, an unknown id gets a new ``Submission``.

    Args:
        url_format (str): URL, with a %d that holds the page id

    Returns:
        A list of submission objects.

    Raises:
        exceptions.ScraperError: If a gallery element cannot be parsed
            (non-numeric id, missing ``span``/``img`` child, missing
            ``src`` attribute, or no recognised rating/type class).
    """
    # (CSS class, stored value) pairs; the first match wins, so the tuple
    # order is the lookup priority.
    rating_classes = (
        ("r-adult", "adult"),
        ("r-mature", "mature"),
        ("r-general", "general"),
    )
    type_classes = (
        ("t-image", "image"),
        ("t-text", "text"),
        ("t-audio", "audio"),
        ("t-flash", "flash"),
    )

    def label_for(classes, table):
        # Map the element's CSS classes to a stored label.
        for css_class, label in table:
            if css_class in classes:
                return label
        raise exceptions.ScraperError()

    submissions = []
    try:
        page = 1
        while True:
            url = url_format % page
            doc = self._limited_call(self._html_get, url)
            logger.debug("Scanning submissions from %s", url)
            count = 0
            for el in doc.cssselect(".gallery > *"):
                # An element with no id attribute cannot be a submission.
                # Treat it like an empty id and skip it, instead of
                # failing with TypeError on ``None[4:]``.
                el_id = el.get("id") or ""
                if el_id == "no-images":
                    continue
                # Drop the 4-character prefix in front of the numeric id.
                id_str = el_id[4:]
                if id_str == "":
                    continue
                sub_id = int(id_str)

                submission = self._submissions.get(sub_id)
                if submission is None:
                    submission = Submission()
                    submission._session = self
                    submission.id = sub_id
                    self._submissions[sub_id] = submission

                submission.title = str(
                    el.cssselect("span")[0].text_content())
                classes = el.classes
                submission.rating = label_for(classes, rating_classes)
                submission.type = label_for(classes, type_classes)

                src = el.cssselect("img")[0].get("src")
                if src is None:
                    # Report a scrape failure rather than a TypeError
                    # from concatenating str and None.
                    raise exceptions.ScraperError()
                submission.thumbnail_url = "https:" + src

                submissions.append(submission)
                count += 1
            if count == 0:
                break
            logger.debug("Found %d submissions", count)
            page += 1
    except (IndexError, ValueError):
        raise exceptions.ScraperError()
    return submissions
def _load_folders(self):
    """Rebuild the folder tree from the folder-management page.

    Fetches ``/controls/folders/submissions/`` under
    ``constants.FA_ROOT`` through ``self._limited_call``. Every
    ``.group-row`` element becomes a root-level Folder with empty
    ``children`` and ``submissions``. Every ``.folder-row`` element
    becomes a Folder attached to the group named in its ``class``
    attribute, or to ``self._root_folders`` when no such group is cached.
    Folder objects are cached in, and reused from, ``self._folders``.

    Raises:
        exceptions.ScraperError: If a row lacks the expected title
            element or the expected ``group-N`` / ``folder-N`` class.
    """
    logger.debug("Loading folders")
    self._root_folders = []
    url = constants.FA_ROOT + "/controls/folders/submissions/"
    doc = self._limited_call(self._html_get, url)

    def class_id(el, pattern):
        # Pull the numeric id out of the element's class attribute. A
        # missing attribute or an unmatched pattern is a scrape failure,
        # not an AttributeError/TypeError leaking to the caller.
        match = re.search(pattern, el.get("class") or "")
        if match is None:
            raise exceptions.ScraperError()
        return int(match.group(1))

    # NOTE(review): groups and folders share the ``self._folders`` cache
    # keyed by bare numeric id - confirm the two id spaces cannot collide.

    # Get groups
    for group_el in doc.cssselect(".group-row"):
        try:
            title = str(group_el.cssselect("strong")[0].text_content())
            group_id = class_id(group_el, r"group-([0-9]+)")
        except (IndexError, ValueError):
            raise exceptions.ScraperError()

        group = self._folders.get(group_id)
        if group is None:
            group = Folder()
            group._session = self
            group.id = group_id
            self._folders[group_id] = group
        group.title = title
        group.children = []
        group.submissions = []
        self._root_folders.append(group)

    # Get folders
    for folder_el in doc.cssselect(".folder-row"):
        try:
            title = str(
                folder_el.cssselect(".folder-name strong")[0].text_content())
            folder_id = class_id(folder_el, r"folder-([0-9]+)")
            parent_id = class_id(folder_el, r"group-([0-9]+)")
        except (IndexError, ValueError):
            raise exceptions.ScraperError()

        folder = self._folders.get(folder_id)
        if folder is None:
            folder = Folder()
            folder._session = self
            folder.id = folder_id
            self._folders[folder_id] = folder
        folder.title = title
        folder.children = []

        parent = self._folders.get(parent_id)
        if parent is None:
            # No cached group with that id: keep the folder at top level.
            self._root_folders.append(folder)
        else:
            parent.children.append(folder)
def _scan_submission_page(self, url_format):
    """Return submissions found in pages of a base url.

    Pages are fetched through ``self._limited_call`` starting at page 1;
    scanning stops at the first page that yields no submissions.
    Submission objects are cached in ``self._submissions`` by id: a known
    id is updated in place, an unknown id gets a new ``Submission``.

    Args:
        url_format (str): URL, with a %d that holds the page id

    Returns:
        A list of submission objects.

    Raises:
        exceptions.ScraperError: If a gallery element cannot be parsed
            (non-numeric id, missing ``span``/``img`` child, missing
            ``src`` attribute, or no recognised rating/type class).
    """
    # (CSS class, stored value) pairs; the first match wins, so the tuple
    # order is the lookup priority.
    rating_classes = (
        ("r-adult", "adult"),
        ("r-mature", "mature"),
        ("r-general", "general"),
    )
    type_classes = (
        ("t-image", "image"),
        ("t-text", "text"),
        ("t-audio", "audio"),
        ("t-flash", "flash"),
    )

    def label_for(classes, table):
        # Map the element's CSS classes to a stored label.
        for css_class, label in table:
            if css_class in classes:
                return label
        raise exceptions.ScraperError()

    submissions = []
    try:
        page = 1
        while True:
            url = url_format % page
            doc = self._limited_call(self._html_get, url)
            logger.debug("Scanning submissions from %s", url)
            count = 0
            for el in doc.cssselect(".gallery > *"):
                # An element with no id attribute cannot be a submission.
                # Treat it like an empty id and skip it, instead of
                # failing with TypeError on ``None[4:]``.
                el_id = el.get("id") or ""
                if el_id == "no-images":
                    continue
                # Drop the 4-character prefix in front of the numeric id.
                id_str = el_id[4:]
                if id_str == "":
                    continue
                sub_id = int(id_str)

                submission = self._submissions.get(sub_id)
                if submission is None:
                    submission = Submission()
                    submission._session = self
                    submission.id = sub_id
                    self._submissions[sub_id] = submission

                submission.title = str(
                    el.cssselect("span")[0].text_content())
                classes = el.classes
                submission.rating = label_for(classes, rating_classes)
                submission.type = label_for(classes, type_classes)

                src = el.cssselect("img")[0].get("src")
                if src is None:
                    # Report a scrape failure rather than a TypeError
                    # from concatenating str and None.
                    raise exceptions.ScraperError()
                submission.thumbnail_url = "https:" + src

                submissions.append(submission)
                count += 1
            if count == 0:
                break
            logger.debug("Found %d submissions", count)
            page += 1
    except (IndexError, ValueError):
        raise exceptions.ScraperError()
    return submissions
def _load_folders(self):
    """Rebuild the folder tree from the folder-management page.

    Fetches ``/controls/folders/submissions/`` under
    ``constants.FA_ROOT`` through ``self._limited_call``. Every
    ``.group-row`` element becomes a root-level Folder with empty
    ``children`` and ``submissions``. Every ``.folder-row`` element
    becomes a Folder attached to the group named in its ``class``
    attribute, or to ``self._root_folders`` when no such group is cached.
    Folder objects are cached in, and reused from, ``self._folders``.

    Raises:
        exceptions.ScraperError: If a row lacks the expected title
            element or the expected ``group-N`` / ``folder-N`` class.
    """
    logger.debug("Loading folders")
    self._root_folders = []
    url = constants.FA_ROOT + "/controls/folders/submissions/"
    doc = self._limited_call(self._html_get, url)

    def class_id(el, pattern):
        # Pull the numeric id out of the element's class attribute. A
        # missing attribute or an unmatched pattern is a scrape failure,
        # not an AttributeError/TypeError leaking to the caller.
        match = re.search(pattern, el.get("class") or "")
        if match is None:
            raise exceptions.ScraperError()
        return int(match.group(1))

    # NOTE(review): groups and folders share the ``self._folders`` cache
    # keyed by bare numeric id - confirm the two id spaces cannot collide.

    # Get groups
    for group_el in doc.cssselect(".group-row"):
        try:
            title = str(group_el.cssselect("strong")[0].text_content())
            group_id = class_id(group_el, r"group-([0-9]+)")
        except (IndexError, ValueError):
            raise exceptions.ScraperError()

        group = self._folders.get(group_id)
        if group is None:
            group = Folder()
            group._session = self
            group.id = group_id
            self._folders[group_id] = group
        group.title = title
        group.children = []
        group.submissions = []
        self._root_folders.append(group)

    # Get folders
    for folder_el in doc.cssselect(".folder-row"):
        try:
            title = str(
                folder_el.cssselect(".folder-name strong")[0].text_content())
            folder_id = class_id(folder_el, r"folder-([0-9]+)")
            parent_id = class_id(folder_el, r"group-([0-9]+)")
        except (IndexError, ValueError):
            raise exceptions.ScraperError()

        folder = self._folders.get(folder_id)
        if folder is None:
            folder = Folder()
            folder._session = self
            folder.id = folder_id
            self._folders[folder_id] = folder
        folder.title = title
        folder.children = []

        parent = self._folders.get(parent_id)
        if parent is None:
            # No cached group with that id: keep the folder at top level.
            self._root_folders.append(folder)
        else:
            parent.children.append(folder)