def thread_scraper(thread_id, board_name, session, directory):
    """Fetch a bbw-chan thread as JSON and prepare it for downloading.

    Skips 404s and threads whose subject/message contains an ignored
    keyword.  Returns the (heavily mutated) thread dict with a
    ``download_path`` computed for every attached file, or ``None`` when
    the thread was skipped.
    """
    thread_id = str(thread_id)
    link = "https://bbw-chan.nl/" + board_name + "/res/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    # NOTE: thread_master aliases thread itself (same dict object), so the
    # key deletions below mutate `thread` too.
    thread_master = thread
    if "archived" in thread_master:
        location = "Archive"
    else:
        location = "Catalog"
    text = ""
    if thread_master["subject"]:
        title = thread_master["subject"].lower()
        # Drop the whole thread when the subject matches an ignored keyword.
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        else:
            text = thread_master["subject"][:maximum_length]
    if thread_master["message"]:
        title = thread_master["message"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        else:
            # Fall back to the message body when there was no usable subject.
            if not text:
                text = thread_master["message"][:maximum_length]
    # Strip every key except "posts" from the thread dict, and keep the
    # original opening post (minus its "posts" key) as the first entry.
    thread_master2 = thread_master.copy()
    for key in thread_master2:
        if "posts" != key:
            del thread_master[key]
    del thread_master2["posts"]
    thread["download_path"] = ""
    thread["posts"] = [thread_master2]+thread_master["posts"]
    found = False
    new_directory = ""
    for post in thread["posts"]:
        # "creation" is an ISO-8601 UTC string; replaced by its epoch timestamp.
        date_object = datetime.strptime(
            post["creation"], "%Y-%m-%dT%H:%M:%S.%fZ")
        post["creation"] = date_object.timestamp()
        for media in post["files"]:
            # Derive the extension from the MIME subtype (e.g. "image/png" -> "png").
            ext = media["mime"].split("/")[1]
            media["ext"] = ext
            file_name = os.path.splitext(media["originalName"])[0].strip()
            text = main_helper.clean_text(text)
            new_directory = directory+"/"+text+" - "+thread_id+"/"
            if not text:
                # No title text: drop the " - " separator from the folder name.
                new_directory = new_directory.replace(" - ", "")
            file_path = main_helper.reformat(new_directory, None, None, file_name,
                                             text, ext, date_object, post["name"],
                                             file_directory_format, file_name_format,
                                             date_format, maximum_length)
            media["download_path"] = file_path
            found = True
    if found:
        thread["directory"] = new_directory
    return thread
def update(filepath):
    """Recompute the canonical location for *filepath* from the module-level
    ``reformat`` template object and move the file there when it differs.

    Returns the (absolute) new path in either case.
    """
    # Serialize the template object to a plain dict via a JSON round-trip.
    template = json.loads(
        json.dumps(reformat, default=lambda o: o.__dict__))
    source_path = os.path.abspath(filepath)
    target_path = os.path.abspath(main_helper.reformat(**template))
    if source_path != target_path:
        shutil.move(source_path, target_path)
    return target_path
def thread_scraper(thread_id, board_name, session, directory):
    """Fetch a 4chan thread as JSON and prepare it for downloading.

    Skips 404s and threads whose subject ("sub") or comment ("com")
    contains an ignored keyword.  Returns the thread dict with a
    ``download_path`` on every post that has a file, or ``None`` when
    the thread was skipped.
    """
    thread_id = str(thread_id)
    link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    # The first post carries the thread metadata.
    thread_master = thread["posts"][0]
    if "archived" in thread_master:
        location = "Archive"
    else:
        location = "Catalog"
    if "sub" in thread_master:
        title = thread_master["sub"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
    if "com" in thread_master:
        title = thread_master["com"].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
    text = ""
    if "sub" in thread_master:
        text = thread_master["sub"][:maximum_length]
    else:
        # NOTE(review): raises KeyError when a thread has neither "sub" nor
        # "com" — confirm the API guarantees at least one of them.
        text = thread_master["com"][:maximum_length]
    found = False
    new_directory = ""
    seen = set()
    for post in thread["posts"]:
        if "name" not in post:
            post["name"] = "Anonymous"
        # Only posts with an attached file carry "filename".
        if "filename" in post:
            ext = post["ext"].replace(".", "")
            filename = main_helper.clean_text(post["filename"])
            if not filename:
                # Cleaning stripped everything; fall back to the post number.
                filename = str(post["no"])
            # Deduplicate filenames within this thread.
            result = main_helper.rename_duplicates(seen, filename)
            seen = result[0]
            file_name = result[1]
            text = main_helper.clean_text(text)
            new_directory = directory+"/"+text+" - "+thread_id+"/"
            if not text:
                new_directory = new_directory.replace(" - ", "")
            # 4chan "time" is a Unix timestamp.
            date_object = datetime.fromtimestamp(post["time"])
            file_path = main_helper.reformat(new_directory, None, None, file_name,
                                             text, ext, date_object, post["name"],
                                             file_directory_format, file_name_format,
                                             date_format, maximum_length)
            post["download_path"] = file_path
            found = True
    if found:
        thread["directory"] = new_directory
    return thread
def reformat(self, unformatted_list) -> list[str]:
    """Resolve each path template in *unformatted_list* against this object.

    For every ``key -> template`` pair the template is expanded via
    ``main_helper.reformat``; the resulting path is then truncated at the
    first component that is still an unresolved ``{placeholder}`` known to
    ``format_variables``.  Returns the resolved paths in iteration order.

    Fixes: removed two leftover bare ``print`` statements (no-op debug
    artifacts) and renamed the inner loop variable that shadowed ``key``.
    """
    resolved = []
    format_variables2 = format_variables()
    for key, unformatted_item in unformatted_list.items():
        if "filename_format" == key:
            # The filename template is rooted under the previously resolved
            # entry at index 1.  NOTE(review): assumes at least two entries
            # precede "filename_format" in the dict — confirm caller ordering.
            unformatted_item = os.path.join(resolved[1], unformatted_item)
        string = main_helper.reformat(self, unformatted_item)
        final_path = []
        for path in string.split(os.sep):
            placeholder = main_helper.find_between(path, "{", "}")
            # Stop at the first component that is still a raw {placeholder}.
            if path == getattr(format_variables2, placeholder, None):
                break
            final_path.append(path)
        resolved.append(os.sep.join(final_path))
    return resolved
def format_directories(directory, site_name, username, unformatted, locations=None, api_type="") -> dict:
    """Build the directory layout (metadata paths, download root, per-location
    sorted folders) for one model/site combination.

    Fixes: replaced the mutable default argument ``locations=[]`` with the
    ``None`` sentinel (same behavior, no shared-state hazard).

    Returns a dict with ``legacy_model_directory``, ``legacy_metadatas``,
    ``metadata_directory``, ``download_directory`` and ``locations`` keys.
    """
    if locations is None:
        locations = []
    x = {}
    option = {}
    option["site_name"] = site_name
    option["username"] = username
    option["directory"] = directory
    option["postedAt"] = datetime.today()
    option["date_format"] = date_format
    option["maximum_length"] = maximum_length
    prepared_format = prepare_reformat(option)
    # Pre-reformat layout: <root>/<site>/<username>
    legacy_model_directory = x["legacy_model_directory"] = os.path.join(
        directory, site_name, username)
    x["legacy_metadatas"] = {}
    x["legacy_metadatas"]["legacy_metadata"] = os.path.join(
        legacy_model_directory, api_type, "Metadata")
    x["legacy_metadatas"]["legacy_metadata2"] = os.path.join(
        legacy_model_directory, "Metadata")
    x["metadata_directory"] = main_helper.reformat(prepared_format, unformatted)
    x["download_directory"] = directory
    x["locations"] = []
    for location in locations:
        directories = {}
        cats = ["Unsorted", "Free", "Paid"]
        for cat in cats:
            cat2 = cat
            if "Unsorted" in cat2:
                # "Unsorted" items go directly under the api_type folder.
                cat2 = ""
            path = os.path.join(api_type, cat2, location[0])
            directories[cat.lower()] = path
        y = {}
        y["sorted_directories"] = directories
        y["media_type"] = location[0]
        y["alt_media_type"] = location[1]
        x["locations"].append(y)
    return x
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    """Walk an OnlyFans API result set and build a download manifest.

    Returns ``{"content": [post dicts with "medias"], "directories": [...]}``.
    Each media entry gets a preferred link, a target directory and filename
    derived from the configured format templates.
    """
    new_set = {}
    new_set["content"] = []
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                # Stories carry no text of their own; reuse the highlight title.
                item["text"] = results["title"]
            results = results["stories"]
    if api_type == "Archived":
        # Bare `print` below is a leftover no-op debug statement (does nothing).
        print
        pass
    if api_type == "Posts":
        print
    if api_type == "Messages":
        pass
    if not results or "error" in results:
        return new_set
    if "result" in results:
        # Wrapped response: unwrap and reuse the session that produced it.
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return new_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        # Sentinel date used until a real post date is seen.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            # Nest archived content under "{api_type}/<parent_type>".
            x = file_directory_format.split(os.sep)
            for y in x:
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        print
        print
        seperator = " | "
        print(f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
        for media_api in results:
            new_post = {}
            new_post["medias"] = []
            rawText = media_api.get("rawText", "")
            text = media_api.get("text", "")
            final_text = rawText if rawText else text
            # if media_api["responseType"] == "post":
            #     if media_api["isArchived"]:
            #         pass
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    # Mass message sent by someone else; skip.
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                # API sentinel for "no date": reuse the last real date seen.
                date_string = master_date
                date_object = datetime.strptime(
                    master_date, "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
            master_date = date_string
            new_post["post_id"] = media_api["id"]
            new_post["text"] = final_text
            new_post["postedAt"] = date_string
            new_post["paid"] = False
            price = new_post["price"] = media_api["price"] if "price" in media_api else None
            if price == None:
                price = 0
            canPurchase = media_api.get("canPurchase", None)
            canViewMedia = media_api.get("canViewMedia", None)
            if price:
                # Priced and either already bought or viewable => counted as paid.
                if not canPurchase or canViewMedia:
                    new_post["paid"] = True
            for media in media_api["media"]:
                media_id = media["id"]
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                preview_link = ""
                if "source" in media:
                    quality_key = "source"
                    source = media[quality_key]
                    link = source[quality_key]
                    if link:
                        if media["type"] == "video":
                            # Prefer the configured video quality when available.
                            qualities = media["videoSources"]
                            qualities = dict(
                                sorted(qualities.items(), reverse=False))
                            qualities[quality_key] = source[quality_key]
                            for quality, quality_link in qualities.items():
                                video_quality_json = json_settings["video_quality"]
                                video_quality_json = video_quality_json.removesuffix(
                                    "p")
                                if quality == video_quality_json:
                                    if link:
                                        link = quality_link
                                    break
                    print
                    print
                    print
                    # NOTE(review): checks "info" in media_api but indexes
                    # media["info"] — looks like a copy/paste slip; confirm.
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                matches = ["us", "uk", "ca", "ca2", "de"]
                if not link:
                    continue
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    # Region-prefixed host: the meaningful label is one level deeper.
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        # Still converting; fall back to the preview asset.
                        link = preview_link
                rules = [link == "", preview_link == ""]
                if all(rules):
                    continue
                new_media = dict()
                new_media["media_id"] = media_id
                new_media["links"] = []
                new_media["media_type"] = media_type
                # Keep only the first non-empty link (full link wins over preview).
                for xlink in link, preview_link:
                    if xlink:
                        new_media["links"].append(xlink)
                        break
                session.links.extend(new_media["links"])
                if media["type"] not in alt_media_type:
                    continue
                matches = [s for s in ignored_keywords if s in final_text]
                if matches:
                    print("Matches: ", matches)
                    continue
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.__str__().replace(".", "").split('?')[0]
                # Build the reformat options from the post plus per-media fields.
                option = {}
                option = option | new_post
                option["site_name"] = "OnlyFans"
                option["media_id"] = media_id
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["text_length"] = text_length
                option["directory"] = download_path
                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_media["directory"] = os.path.join(file_directory)
                new_media["filename"] = os.path.basename(file_path)
                if file_directory not in directories:
                    directories.append(file_directory)
                new_post["medias"].append(new_media)
            new_set["content"].append(new_post)
    new_set["directories"] = directories
    return new_set
def media_scraper(link, session, directory, username, api_type):
    """Scrape one API page into ``[valid_media, invalid_media]``.

    Media whose reported size is 0 go into the second (invalid) list.
    ``directory`` is a list of (label, path) pairs with the accepted media
    types as its last element.
    """
    media_set = [[], []]
    media_type = directory[-1]
    y = json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    if api_type == "Mass Messages":
        y = y["list"]
    # Sentinel date reused until a real post date is seen.
    master_date = "01-01-0001 00:00:00"
    for media_api in y:
        if api_type == "Mass Messages":
            media_user = media_api["fromUser"]
            media_username = media_user["username"]
            if media_username != username:
                continue
        for media in media_api["media"]:
            date = "-001-11-30T00:00:00+00:00"
            size = 0
            if "source" in media:
                source = media["source"]
                link = source["source"]
                # NOTE(review): tests "info" in media_api but indexes
                # media["info"] — confirm against the API payload.
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api[
                    "postedAt"] if "postedAt" in media_api else media_api[
                        "createdAt"]
            if "src" in media:
                link = media["src"]
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api["createdAt"]
            if not link:
                continue
            matches = ["us", "uk", "ca", "ca2", "de"]
            url = urlparse(link)
            subdomain = url.hostname.split('.')[0]
            preview_link = media["preview"]
            if any(subdomain in nm for nm in matches):
                # Region-prefixed host: real label is one level deeper.
                subdomain = url.hostname.split('.')[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    # Still converting; fall back to the preview asset.
                    link = preview_link
            rules = [link == "", preview_link == ""]
            if all(rules):
                continue
            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["media_id"] = media["id"]
            new_dict["links"] = []
            # Unlike sibling variants, this one keeps BOTH links (no break).
            for xlink in link, preview_link:
                if xlink:
                    new_dict["links"].append(xlink)
            new_dict[
                "price"] = media_api["price"] if "price" in media_api else None
            if date == "-001-11-30T00:00:00+00:00":
                # API sentinel for "no date": reuse the last real date seen.
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
            master_date = date_string
            if media["type"] not in media_type:
                x += 1
                continue
            if "rawText" not in media_api:
                media_api["rawText"] = ""
            text = media_api["rawText"] if media_api["rawText"] else ""
            matches = [s for s in ignored_keywords if s in text]
            if matches:
                print("Matches: ", matches)
                continue
            text = clean_text(text)
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            media_id = new_dict["media_id"]
            file_name = link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            ext = ext.__str__().replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id,
                                 file_name, text, ext, date_object, username,
                                 format_path, date_format, maximum_length)
            new_dict["text"] = text
            new_dict["paid"] = False
            if new_dict["price"]:
                if api_type in ["Messages", "Mass Messages"]:
                    # Priced messages are always paid content.
                    new_dict["paid"] = True
                else:
                    if media["id"] not in media_api["preview"] and media[
                            "canView"]:
                        new_dict["paid"] = True
            # Default: unsorted directory; optionally split into free/paid.
            new_dict["directory"] = os.path.join(directory[0][1])
            if sort_free_paid_posts:
                new_dict["directory"] = os.path.join(directory[1][1])
                if new_dict["paid"]:
                    new_dict["directory"] = os.path.join(directory[2][1])
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                # Zero-byte entries are tracked separately as invalid.
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
def media_scraper(result, sessions, formatted_directories, username, api_type):
    """Scrape one result page into per-location ``{"valid": [...], "invalid": [...]}``
    buckets; returns the list of those buckets (one per configured location).
    """
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    y = y["list"] if "list" in y else y
    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        # Sentinel date reused until a real post date is seen.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            # Only the first worker announces progress.
            seperator = " | "
            print("Scraping [" + str(seperator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            # Items without a "media" list are treated as their own media.
            new_api = (media_api["media"] if "media" in media_api else [media_api])
            for media in new_api:
                date = "-001-11-30T00:00:00+00:00"
                size = 1
                src = media["src"]
                link = src["source"]
                date = media_api[
                    "createdAt"] if "createdAt" in media_api else media_api[
                        "postedAt"]
                if not link:
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["links"] = [link]
                if date == "-001-11-30T00:00:00+00:00":
                    # API sentinel for "no date": reuse last real date seen.
                    date_string = master_date
                    date_object = datetime.strptime(master_date,
                                                    "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(
                        tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                    master_date = date_string
                media["mediaType"] = media[
                    "mediaType"] if "mediaType" in media else media["type"]
                if media["mediaType"] not in alt_media_type:
                    x += 1
                    continue
                if "text" not in media_api:
                    media_api["text"] = ""
                new_dict[
                    "text"] = media_api["text"] if media_api["text"] else ""
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                # Only keep integer media ids.
                media_id = media["id"] if "id" in media else None
                media_id = media_id if isinstance(media_id, int) else None
                text = new_dict["text"]
                file_name = link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                ext = ext.__str__().replace(".", "").split('?')[0]
                media_directory = os.path.join(model_directory,
                                               sorted_directories["unsorted"])
                file_path = main_helper.reformat(media_directory, post_id,
                                                 media_id, file_name, text,
                                                 ext, date_object, username,
                                                 file_directory_format,
                                                 file_name_format, date_format,
                                                 maximum_length)
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set
def media_scraper(results, api, formatted_directories, username, api_type, parent_type=""):
    """Walk an API result set into a ``{media_type: {"valid", "invalid"}}``
    mapping plus a ``"directories"`` list of target folders.
    """
    media_set = {}
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                # Stories carry no text of their own; reuse the highlight title.
                item["text"] = results["title"]
            results = results["stories"]
    if api_type == "Archived":
        # Bare `print` below is a leftover no-op debug statement (does nothing).
        print
        pass
    if api_type == "Posts":
        print
    if api_type == "Messages":
        pass
    if not results or "error" in results:
        return media_set
    if "result" in results:
        # Wrapped response: unwrap and reuse the session that produced it.
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return media_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        # Sentinel date reused until a real post date is seen.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            # Nest archived content under "{api_type}/<parent_type>".
            x = file_directory_format.split(os.sep)
            for y in x:
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        print
        print
        seperator = " | "
        print(f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
        media_set2 = {}
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in results:
            # if media_api["responseType"] == "post":
            #     if media_api["isArchived"]:
            #         pass
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                # API sentinel for "no date": reuse the last real date seen.
                date_string = master_date
                date_object = datetime.strptime(
                    master_date, "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            if not media_api["media"] and "rawText" in media_api:
                # Media-less posts still count as "Texts" content.
                if media_type == "Texts":
                    new_dict = dict()
                    new_dict["post_id"] = media_api["id"]
                    new_dict["text"] = media_api["rawText"]
                    new_dict["postedAt"] = date_string
                    media_set2["valid"].append(new_dict)
                print
            print
            for media in media_api["media"]:
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                if "source" in media:
                    source = media["source"]
                    link = source["source"]
                    # NOTE(review): tests "info" in media_api but indexes
                    # media["info"] — confirm against the API payload.
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                matches = ["us", "uk", "ca", "ca2", "de"]
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    # Region-prefixed host: real label is one level deeper.
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        # Still converting; fall back to the preview asset.
                        link = preview_link
                rules = [link == "", preview_link == ""]
                if all(rules):
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                # Keep only the first non-empty link (full link wins over preview).
                for xlink in link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"] if "price" in media_api else None
                if media["type"] not in alt_media_type:
                    continue
                if "rawText" not in media_api:
                    media_api["rawText"] = ""
                text = media_api["rawText"] if media_api["rawText"] else ""
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.__str__().replace(".", "").split('?')[0]
                price = new_dict["price"]
                new_dict["text"] = text
                # Build the reformat options from this media's collected fields.
                option = {}
                option = option | new_dict
                option["site_name"] = "OnlyFans"
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["maximum_length"] = maximum_length
                option["directory"] = download_path
                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(
                    prepared_format, file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(
                    prepared_format, filename_format)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["session"] = session
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                if file_directory not in directories:
                    directories.append(file_directory)
                media_set2["valid"].append(new_dict)
        if media_set2["valid"] or media_set2["invalid"]:
            media_set[media_type] = media_set2
        else:
            print
    media_set["directories"] = directories
    return media_set
def media_scraper(link, session, directory, username, api_type):
    """Scrape one API page into ``[valid_media, invalid_media]``.

    Media whose reported size is 0 go into the second (invalid) list.
    ``directory`` is a list of (label, path) pairs with the accepted media
    types as its last element.
    """
    media_set = [[], []]
    media_type = directory[-1]
    count = 0
    found = False
    y = json_request(session, link)
    # NOTE(review): sibling variants guard with `not y or "error" in y`;
    # this raises TypeError if json_request returns None — confirm.
    if "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    y = y["list"] if "list" in y else y
    # Sentinel date reused until a real post date is seen.
    master_date = "01-01-0001 00:00:00"
    for media_api in y:
        if api_type == "Mass Messages":
            media_user = media_api["fromUser"]
            media_username = media_user["username"]
            if media_username != username:
                continue
        # Items without a "media" list are treated as their own media.
        new_api = (media_api["media"] if "media" in media_api else [media_api])
        for media in new_api:
            date = "-001-11-30T00:00:00+00:00"
            size = 1
            src = media["src"]
            link = src["source"]
            date = media_api[
                "createdAt"] if "createdAt" in media_api else media_api[
                    "postedAt"]
            if not link:
                continue
            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["links"] = [link]
            if date == "-001-11-30T00:00:00+00:00":
                # API sentinel for "no date": reuse the last real date seen.
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string
            media["mediaType"] = media[
                "mediaType"] if "mediaType" in media else media["type"]
            if media["mediaType"] not in media_type:
                x += 1
                continue
            if "text" not in media_api:
                media_api["text"] = ""
            new_dict["text"] = media_api["text"] if media_api["text"] else ""
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            # Only keep integer media ids.
            media_id = media["id"] if "id" in media else None
            media_id = media_id if isinstance(media_id, int) else None
            text = new_dict["text"]
            file_name = link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            ext = ext.__str__().replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id,
                                 file_name, text, ext, date_object, username,
                                 format_path, date_format, maximum_length)
            new_dict["directory"] = directory[0][1]
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
# Script segment: walk every metadata JSON file and recompute/relocate the
# formatted path of each recorded model entry.
folders = os.listdir(metadata_directory)
for metadata_file in folders:
    metadata_filepath = os.path.join(metadata_directory, metadata_file)
    # NOTE(review): file handle from open() is never closed here.
    metadatas = json.load(open(metadata_filepath))[0]["valid"]
    for metadata in metadatas:
        for model in metadata:
            model_folder = model["directory"]
            filename = model["filename"]
            post_id = str(model["post_id"])
            filepath = os.path.join(model_folder, filename)

            # Class is (re)defined on every iteration so that __init__ can
            # close over the current `filepath`/`username` values.
            class prepare_reformat(object):
                def __init__(self, option):
                    self.directory = option.get('directory', "")
                    self.post_id = option.get('post_id', "")
                    self.media_id = option.get('media_id', "")
                    # NOTE(review): splits `filepath` but the result is unused;
                    # self.filename comes from the option dict instead.
                    filename, ext = os.path.splitext(filepath)
                    self.filename = option.get('filename', "")
                    self.username = option.get('username', username)
                    self.text = option.get('text', "")
                    self.postedAt = option.get('postedAt', "")
            # Bare `print` statements below are leftover no-op debug artifacts.
            print
            x = prepare_reformat(model)
            # JSON round-trip turns the object into a plain dict.
            x = json.loads(json.dumps(x, default=lambda o: o.__dict__))
            if os.path.isfile(filepath):
                # NOTE(review): `*x` unpacks the dict's KEYS as positional
                # args — almost certainly meant `**x`; confirm before relying
                # on this path.
                new_format = main_helper.reformat(*x)
                print
            else:
                print
def media_scraper(result, sessions, formatted_directories, username, api_type):
    """Scrape one result page into per-location ``{"valid": [...], "invalid": [...]}``
    buckets with free/paid directory sorting; returns the list of buckets.
    """
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    x = 0
    if api_type == "Highlights":
        y = y["stories"]
    if api_type == "Messages":
        y = y["list"]
    if api_type == "Mass Messages":
        y = y["list"]
    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        # Sentinel date reused until a real post date is seen.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            # Only the first worker announces progress.
            seperator = " | "
            print("Scraping ["+str(seperator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            for media in media_api["media"]:
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                if "source" in media:
                    source = media["source"]
                    link = source["source"]
                    # NOTE(review): tests "info" in media_api but indexes
                    # media["info"] — confirm against the API payload.
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                matches = ["us", "uk", "ca", "ca2", "de"]
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    # Region-prefixed host: real label is one level deeper.
                    subdomain = url.hostname.split('.')[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        # Still converting; fall back to the preview asset.
                        link = preview_link
                rules = [link == "", preview_link == ""]
                if all(rules):
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                # Keep only the first non-empty link (full link wins over preview).
                for xlink in link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"] if "price" in media_api else None
                if date == "-001-11-30T00:00:00+00:00":
                    # API sentinel for "no date": reuse last real date seen.
                    date_string = master_date
                    date_object = datetime.strptime(
                        master_date, "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(tzinfo=None).strftime(
                        "%d-%m-%Y %H:%M:%S")
                    master_date = date_string
                if media["type"] not in alt_media_type:
                    x += 1
                    continue
                if "rawText" not in media_api:
                    media_api["rawText"] = ""
                text = media_api["rawText"] if media_api["rawText"] else ""
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                text = main_helper.clean_text(text)
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]
                file_name = link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                ext = ext.__str__().replace(".", "").split('?')[0]
                media_directory = os.path.join(
                    model_directory, sorted_directories["unsorted"])
                new_dict["paid"] = False
                if new_dict["price"]:
                    if api_type in ["Messages", "Mass Messages"]:
                        # Priced messages are always paid content.
                        new_dict["paid"] = True
                    else:
                        if media["id"] not in media_api["preview"] and media["canView"]:
                            new_dict["paid"] = True
                if sort_free_paid_posts:
                    media_directory = os.path.join(
                        model_directory, sorted_directories["free"])
                    if new_dict["paid"]:
                        media_directory = os.path.join(
                            model_directory, sorted_directories["paid"])
                file_path = main_helper.reformat(media_directory, post_id,
                                                 media_id, file_name, text,
                                                 ext, date_object, username,
                                                 file_directory_format,
                                                 file_name_format, date_format,
                                                 maximum_length)
                new_dict["text"] = text
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set