def thread_scraper(thread_id, board_name, session, directory):
    """Fetch one 4chan thread and compute a download path for each attached file.

    Args:
        thread_id: Thread number; converted to ``str`` before use.
        board_name: Board short name inserted into the API URL.
        session: Object exposing ``get(url)`` whose response provides
            ``status_code`` and ``text``.
        directory: Base directory under which the thread folder is placed.

    Returns:
        The decoded thread dict.  Every post gets a ``name`` (defaulting to
        ``"Anonymous"``), every post with a file gets ``download_path``, and
        the thread gets ``directory`` when at least one file was found.
        ``None`` is returned when the request answers 404 or when the opening
        post's subject/comment contains an ignored keyword.
    """
    thread_id = str(thread_id)
    link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    thread_master = thread["posts"][0]
    location = "Archive" if "archived" in thread_master else "Catalog"

    # Reject the thread when either the subject or the comment of the opening
    # post mentions an ignored keyword.
    for key in ("sub", "com"):
        if key not in thread_master:
            continue
        title = thread_master[key].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return

    # Prefer the subject as folder title; fall back to the comment.  An opening
    # post may carry neither field, so the fallback must not index blindly.
    if "sub" in thread_master:
        text = thread_master["sub"][:maximum_length]
    else:
        text = thread_master.get("com", "")[:maximum_length]

    # Clean once so every file of the thread is formatted with the same title.
    text = main_helper.clean_text(text)
    # Build the folder directly instead of stripping " - " afterwards, which
    # would also remove that sequence from the caller's base directory.
    if text:
        new_directory = directory+"/"+text+" - "+thread_id+"/"
    else:
        new_directory = directory+"/"+thread_id+"/"

    found = False
    seen = set()
    for post in thread["posts"]:
        if "name" not in post:
            post["name"] = "Anonymous"
        if "filename" not in post:
            continue
        ext = post["ext"].replace(".", "")
        filename = main_helper.clean_text(post["filename"])
        if not filename:
            # Nothing usable left after cleaning: fall back to the post number.
            filename = str(post["no"])
        result = main_helper.rename_duplicates(seen, filename)
        seen = result[0]
        file_name = result[1]
        date_object = datetime.fromtimestamp(post["time"])
        post["download_path"] = main_helper.reformat(
            new_directory, None, None, file_name, text, ext, date_object,
            post["name"], file_directory_format, file_name_format,
            date_format, maximum_length)
        found = True

    if found:
        thread["directory"] = new_directory
    return thread
def thread_scraper(thread_id, board_name, session, directory):
    """Fetch one bbw-chan thread and compute a download path for each file.

    The site returns the opening post's fields at the top level of the thread
    object with the replies under ``"posts"``.  This function moves every
    top-level field except ``"posts"`` into a new first entry of
    ``thread["posts"]`` so the opening post and the replies can be handled by
    the same loop.

    Args:
        thread_id: Thread number; converted to ``str`` before use.
        board_name: Board short name inserted into the URL.
        session: Object exposing ``get(url)`` whose response provides
            ``status_code`` and ``text``.
        directory: Base directory under which the thread folder is placed.

    Returns:
        The restructured thread dict (keys ``posts``, ``download_path`` and,
        when at least one file was found, ``directory``).  Each post's
        ``creation`` is replaced by a timestamp and each file gains ``ext``
        and ``download_path``.  ``None`` is returned on a 404 response or when
        the subject/message contains an ignored keyword.
    """
    thread_id = str(thread_id)
    link = "https://bbw-chan.nl/" + board_name + "/res/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    location = "Archive" if "archived" in thread else "Catalog"

    # The subject wins as folder title; the message is only used when the
    # subject is empty.  Either one can veto the thread via ignored keywords.
    text = ""
    for key in ("subject", "message"):
        value = thread[key]
        if not value:
            continue
        title = value.lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        if not text:
            text = value[:maximum_length]

    # Turn the top-level opening-post fields into the first post of the list.
    opening_post = {key: value for key, value in thread.items() if key != "posts"}
    replies = thread["posts"]
    thread.clear()
    thread["posts"] = [opening_post] + replies
    thread["download_path"] = ""

    # Clean once so every file of the thread is formatted with the same title.
    text = main_helper.clean_text(text)
    # Build the folder directly instead of stripping " - " afterwards, which
    # would also remove that sequence from the caller's base directory.
    if text:
        new_directory = directory+"/"+text+" - "+thread_id+"/"
    else:
        new_directory = directory+"/"+thread_id+"/"

    found = False
    for post in thread["posts"]:
        date_object = datetime.strptime(
            post["creation"], "%Y-%m-%dT%H:%M:%S.%fZ")
        post["creation"] = date_object.timestamp()
        for media in post["files"]:
            # The extension is taken from the MIME subtype, not the file name.
            ext = media["mime"].split("/")[1]
            media["ext"] = ext
            file_name = os.path.splitext(media["originalName"])[0].strip()
            media["download_path"] = main_helper.reformat(
                new_directory, None, None, file_name, text, ext, date_object,
                post["name"], file_directory_format, file_name_format,
                date_format, maximum_length)
            found = True

    if found:
        thread["directory"] = new_directory
    return thread
def __init__(self, option):
    """Initialise the formatting attributes from ``option``.

    ``option`` must provide ``get(key[, default])``.  The fallback values
    (``filename``, ``ext``, ``today``, ``username``, ``format_path``,
    ``date_format`` and ``text_length``) are not defined in this method; they
    are resolved from the surrounding scope.
    """
    fetch = option.get
    self.directory = fetch('directory')
    self.post_id = fetch('post_id', "")
    self.media_id = fetch('media_id', "")
    self.filename = filename
    raw_text = fetch('text', "")
    self.text = main_helper.clean_text(raw_text)
    self.ext = fetch('ext', ext)
    self.date = fetch('postedAt', today)
    self.username = fetch('username', username)
    self.format_path = format_path
    self.date_format = date_format
    # Stored as an int regardless of how the outer value was supplied.
    self.maximum_length = int(text_length)
def media_scraper(link, session, directory, username, api_type):
    """Collect downloadable media entries from one API response.

    Args:
        link: API URL passed to ``json_request``.
        session: Session object passed to ``json_request``.
        directory: Indexable collection where ``directory[-1]`` holds the
            accepted media types, ``directory[0][1]`` is the default target
            directory, and ``directory[1][1]`` / ``directory[2][1]`` are the
            free / paid directories used when ``sort_free_paid_posts`` is set.
        username: Name used when formatting paths; for ``"Mass Messages"`` it
            also filters out messages sent by other users.
        api_type: ``"Highlights"``, ``"Messages"`` and ``"Mass Messages"``
            select a nested list in the response; any other value uses the
            response as is.

    Returns:
        ``[valid, invalid]`` where each element is a list of dicts describing
        one media item; items whose ``size`` is 0 go to ``invalid``.
    """
    media_set = [[], []]
    media_type = directory[-1]
    y = json_request(session, link)
    if not y or "error" in y:
        return media_set
    if api_type == "Highlights":
        y = y["stories"]
    elif api_type in ("Messages", "Mass Messages"):
        y = y["list"]

    # Placeholder meaning "no date was found for this media item".
    unknown_date = "-001-11-30T00:00:00+00:00"
    master_date = "01-01-0001 00:00:00"
    for media_api in y:
        if api_type == "Mass Messages":
            if media_api["fromUser"]["username"] != username:
                continue
        for media in media_api["media"]:
            date = unknown_date
            size = 0
            # Reset per item: otherwise an item without "source"/"src" would
            # silently reuse the previous item's URL (or the API URL itself).
            media_link = ""
            if "source" in media:
                media_link = media["source"]["source"]
                size = media["info"]["preview"]["size"] if "info" in media_api else 1
                date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if "src" in media:
                media_link = media["src"]
                size = media["info"]["preview"]["size"] if "info" in media_api else 1
                date = media_api["createdAt"]
            if not media_link:
                continue

            matches = ["us", "uk", "ca", "ca2", "de"]
            url = urlparse(media_link)
            subdomain = url.hostname.split('.')[0]
            preview_link = media["preview"]
            if any(subdomain in nm for nm in matches):
                # The first label matched a region code; inspect the next one.
                subdomain = url.hostname.split('.')[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    media_link = preview_link
            if media_link == "" and preview_link == "":
                continue

            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["media_id"] = media["id"]
            new_dict["links"] = [xlink for xlink in (media_link, preview_link) if xlink]
            new_dict["price"] = media_api["price"] if "price" in media_api else None

            if date == unknown_date:
                # No date on this item: reuse the most recent date seen so far.
                date_string = master_date
                date_object = datetime.strptime(master_date, "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string

            if media["type"] not in media_type:
                continue
            if "rawText" not in media_api:
                media_api["rawText"] = ""
            text = media_api["rawText"] if media_api["rawText"] else ""
            matches = [s for s in ignored_keywords if s in text]
            if matches:
                print("Matches: ", matches)
                continue
            text = clean_text(text)
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            media_id = new_dict["media_id"]

            # Derive name and extension from the last URL segment, dropping
            # any query string that trails the extension.
            file_name = media_link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            ext = ext.__str__().replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id, file_name,
                                 text, ext, date_object, username, format_path,
                                 date_format, maximum_length)
            new_dict["text"] = text

            new_dict["paid"] = False
            if new_dict["price"]:
                if api_type in ["Messages", "Mass Messages"]:
                    new_dict["paid"] = True
                else:
                    if media["id"] not in media_api["preview"] and media["canView"]:
                        new_dict["paid"] = True

            new_dict["directory"] = os.path.join(directory[0][1])
            if sort_free_paid_posts:
                new_dict["directory"] = os.path.join(directory[1][1])
                if new_dict["paid"]:
                    new_dict["directory"] = os.path.join(directory[2][1])
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
async def reformat_2(self, unformatted: Path):
    """Expand the placeholders of ``unformatted`` into a concrete path.

    Supported placeholders: ``{site_name}``, ``{first_letter}``, ``{post_id}``,
    ``{media_id}``, ``{profile_username}``, ``{model_username}``,
    ``{api_type}``, ``{media_type}``, ``{filename}``, ``{ext}``, ``{value}``,
    ``{date}`` and ``{text}``.  ``{text}`` is filled last and truncated (by
    UTF-8 byte count) to whatever is left of ``self.maximum_length`` after the
    directory and the rest of the path, capped at ``self.text_length``.

    Args:
        unformatted: Path template containing the placeholders.

    Returns:
        ``self.directory`` joined with the expanded template.

    Raises:
        Exception: If ``self.directory`` is empty.
    """
    # https://stackoverflow.com/a/43848928
    def utf8_lead_byte(b: int):
        """A UTF-8 intermediate byte starts with the bits 10xxxxxx."""
        return (b & 0xC0) != 0x80

    def utf8_byte_truncate(text: str, max_bytes: int):
        """If text[max_bytes] is not a lead byte, back up until a lead byte
        is found and truncate before that character."""
        utf8 = text.encode("utf8")
        if len(utf8) <= max_bytes:
            return utf8
        i = max_bytes
        while i > 0 and not utf8_lead_byte(utf8[i]):
            i -= 1
        return utf8[:i]

    post_id = "" if self.post_id is None else str(self.post_id)
    media_id = "" if self.media_id is None else str(self.media_id)
    date = self.date
    text = self.text
    text_length = self.text_length
    unformatted_string = unformatted.as_posix()

    if isinstance(date, str):
        # String dates are re-rendered with the configured format unless they
        # are empty or still equal to the default of format_attributes().
        format_variables2 = format_attributes()
        if date != format_variables2.date and date != "":
            date = datetime.strptime(date, "%d-%m-%Y %H:%M:%S")
            date = date.strftime(self.date_format)
    elif isinstance(date, datetime):
        date = date.strftime(self.date_format)

    has_text = "{text}" in unformatted_string
    extra_count = 0
    if has_text:
        text = main_helper.clean_text(text)
        # The placeholder itself is still in the path when lengths are
        # measured below, so its width is credited back.
        extra_count = len("{text}")

    value = "Free"
    if "{value}" in unformatted_string and self.price and not self.preview:
        value = "Paid"

    directory = self.directory
    if not directory:
        raise Exception("Directory not found")

    # Order matters: a substituted value may itself contain a later placeholder.
    path = unformatted_string.replace("{site_name}", self.site_name)
    path = path.replace("{first_letter}", self.model_username[0].capitalize())
    path = path.replace("{post_id}", post_id)
    path = path.replace("{media_id}", media_id)
    path = path.replace("{profile_username}", self.profile_username)
    path = path.replace("{model_username}", self.model_username)
    path = path.replace("{api_type}", self.api_type)
    path = path.replace("{media_type}", self.media_type)
    path = path.replace("{filename}", self.filename)
    path = path.replace("{ext}", self.ext)
    path = path.replace("{value}", value)
    path = path.replace("{date}", date)

    directory_count = len(str(directory))
    path_count = len(path)
    remaining = self.maximum_length - (directory_count + path_count - extra_count)
    # Clamp at zero: a negative budget would be used as a negative slice index,
    # keeping most of the text and possibly cutting a multi-byte character.
    text_length = max(0, min(text_length, remaining))

    if has_text:
        filtered_text = utf8_byte_truncate(text, text_length).decode("utf8")
        path = path.replace("{text}", filtered_text)
    else:
        path = path.replace("{text}", "")
    x_path = directory.joinpath(path)
    return x_path
def media_scraper(result, sessions, formatted_directories, username, api_type):
    """Collect downloadable media entries from one API response, per location.

    Args:
        result: Mapping with ``"link"`` (API URL) and ``"count"`` (index into
            ``sessions``; a progress line is printed when it is 0).
        sessions: Indexable collection of session objects.
        formatted_directories: Mapping with ``"model_directory"`` and
            ``"locations"``; each location provides ``"sorted_directories"``
            (keys ``unsorted``/``free``/``paid``), ``"media_type"`` and
            ``"alt_media_type"`` (the accepted ``media["type"]`` values).
        username: Name used when formatting paths; for ``"Mass Messages"`` it
            also filters out messages sent by other users.
        api_type: ``"Highlights"``, ``"Messages"`` and ``"Mass Messages"``
            select a nested list in the response; any other value uses the
            response as is.

    Returns:
        A list with one dict per location: ``{"type", "valid", "invalid"}``.
        Items whose ``size`` is 0 go to ``invalid``; the others go to
        ``valid`` and additionally carry the ``session`` used.
    """
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    if api_type == "Highlights":
        y = y["stories"]
    elif api_type in ("Messages", "Mass Messages"):
        y = y["list"]

    # Placeholder meaning "no date was found for this media item".
    unknown_date = "-001-11-30T00:00:00+00:00"
    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            seperator = " | "
            print("Scraping ["+str(seperator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                if media_api["fromUser"]["username"] != username:
                    continue
            for media in media_api["media"]:
                date = unknown_date
                size = 0
                # Reset per item: otherwise an item without "source"/"src"
                # would silently reuse the previous item's URL (or the API
                # URL itself).
                media_link = ""
                if "source" in media:
                    media_link = media["source"]["source"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
                if "src" in media:
                    media_link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not media_link:
                    continue

                matches = ["us", "uk", "ca", "ca2", "de"]
                url = urlparse(media_link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    # The first label matched a region code; inspect the next.
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        media_link = preview_link
                if media_link == "" and preview_link == "":
                    continue

                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                # Keep only the first available link.
                for xlink in media_link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"] if "price" in media_api else None

                if date == unknown_date:
                    # No date on this item: reuse the most recent date seen.
                    date_string = master_date
                    date_object = datetime.strptime(
                        master_date, "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(tzinfo=None).strftime(
                        "%d-%m-%Y %H:%M:%S")
                    master_date = date_string

                if media["type"] not in alt_media_type:
                    continue
                if "rawText" not in media_api:
                    media_api["rawText"] = ""
                text = media_api["rawText"] if media_api["rawText"] else ""
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                text = main_helper.clean_text(text)
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]

                # Derive name and extension from the last URL segment,
                # dropping any query string that trails the extension.
                file_name = media_link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                ext = ext.__str__().replace(".", "").split('?')[0]

                media_directory = os.path.join(
                    model_directory, sorted_directories["unsorted"])
                new_dict["paid"] = False
                if new_dict["price"]:
                    if api_type in ["Messages", "Mass Messages"]:
                        new_dict["paid"] = True
                    else:
                        if media["id"] not in media_api["preview"] and media["canView"]:
                            new_dict["paid"] = True
                if sort_free_paid_posts:
                    media_directory = os.path.join(
                        model_directory, sorted_directories["free"])
                    if new_dict["paid"]:
                        media_directory = os.path.join(
                            model_directory, sorted_directories["paid"])

                file_path = main_helper.reformat(
                    media_directory, post_id, media_id, file_name, text, ext,
                    date_object, username, file_directory_format,
                    file_name_format, date_format, maximum_length)
                new_dict["text"] = text
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set