def __init__(self, api=None, subscription=None) -> None:
    # Queue downloads for every scraped api_type of a subscription and
    # export its metadata when an archive location is known.
    if api:
        username = subscription.username
        download_info = subscription.download_info
        if download_info:
            self.downloaded = True
            metadata_locations = download_info["metadata_locations"]
            directory = download_info["directory"]
            for api_type, value in subscription.scraped:
                if not value or not isinstance(value, create_metadata):
                    continue
                for location, v in value.content:
                    if location == "Texts":
                        continue
                    media_set = v.valid
                    string = "Download Processing\n"
                    string += f"Name: {username} | Type: {api_type} | Count: {len(media_set)} {location} | Directory: {directory}\n"
                    print(string)
                    pool = multiprocessing()
                    pool.starmap(self.download, product(media_set, [api]))
                metadata_path = metadata_locations.get(api_type)
                if metadata_path:
                    value = value.export()
                    if export_metadata:
                        export_archive(value, metadata_path, json_settings)
    else:
        self.downloaded = False
def paid_content_scraper(apis: list[start]):
    # Attach purchased content to each subscription, then scrape and export
    # metadata per api_type (skipping "Archived").
    for api in apis:
        paid_contents = api.get_paid_content()
        authed = api.auth
        for paid_content in paid_contents:
            author = paid_content.get("author")
            author = paid_content.get("fromUser", author)
            subscription = api.get_subscription(
                check=True, identifier=author["id"])
            if not subscription:
                subscription = create_subscription(author)
                authed.subscriptions.append(subscription)
            if paid_content["responseType"] == "post":
                if paid_content["isArchived"]:
                    print(f"Model: {author['username']}")
                    # print(
                    #     "ERROR, PLEASE REPORT THIS AS AN ISSUE AND TELL ME WHICH MODEL YOU'RE SCRAPING, THANKS")
                    # input()
                    # exit()
            api_type = paid_content["responseType"].capitalize()+"s"
            api_media = getattr(subscription.scraped, api_type)
            api_media.append(paid_content)
        count = 0
        max_count = len(authed.subscriptions)
        for subscription in authed.subscriptions:
            string = f"Scraping - {subscription.username} | {count} / {max_count}"
            print(string)
            subscription.sessions = api.sessions
            username = subscription.username
            site_name = "OnlyFans"
            media_type = format_media_types()
            count += 1
            for api_type, paid_content in subscription.scraped:
                if api_type == "Archived":
                    continue
                formatted_directories = format_directories(
                    j_directory, site_name, username,
                    metadata_directory_format, media_type, api_type)
                metadata_directory = formatted_directories["metadata_directory"]
                metadata_path = os.path.join(
                    metadata_directory, api_type+".json")
                new_metadata = media_scraper(
                    paid_content, api, formatted_directories,
                    username, api_type)
                if new_metadata:
                    api_path = os.path.join(api_type, "")
                    new_metadata_object = process_metadata(
                        api, new_metadata, formatted_directories,
                        subscription, api_type, api_path,
                        metadata_path, site_name)
                    new_metadata_set = new_metadata_object.convert()
                    if export_metadata:
                        export_archive(new_metadata_set,
                                       metadata_path, json_settings)
def start_datascraper(api, identifier, site_name, choice_type=None):
    # Resolve the subscription, let the user (or settings) pick which APIs to
    # scrape, then run prepare_scraper for each selection.
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username)
    metadata_directory = formatted_directories["metadata_directory"]
    archive_path = os.path.join(metadata_directory, "Mass Messages.json")
    if subscription.is_me:
        imported = import_archive(archive_path)
        mass_messages = api.get_mass_messages(resume=imported)
        export_archive(mass_messages, archive_path,
                       json_settings, rename=False)
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    print("Name: "+username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: "+api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: "+item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(api, site_name, item)
    if any(x for x in subscription.scraped):
        subscription.download_info["directory"] = j_directory
        subscription.download_info["model_directory"] = os.path.join(
            j_directory, username)
        subscription.download_info["webhook"] = webhook
    print("Scrape Completed"+"\n")
    return [True, info]
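# Hedged usage sketch (not part of the original modules): it only chains
# functions defined in this file -- account_setup and start_datascraper -- to
# show the order they are typically driven in. The `example_run` name and the
# `api`, `site_name` and `usernames` arguments are illustrative assumptions.
def example_run(api, site_name, usernames):
    # Log in and prime the authed session before scraping anything.
    if not account_setup(api):
        return []
    results = []
    for username in usernames:
        # start_datascraper returns [status, info] per the definition above.
        results.append(start_datascraper(api, username, site_name))
    return results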
def paid_content_scraper(apis: list[start]):
    # Variant without the archived-post check: attach purchased content to each
    # subscription, then scrape and export metadata for every api_type.
    for api in apis:
        paid_contents = api.get_paid_content()
        authed = api.auth
        for paid_content in paid_contents:
            author = paid_content.get("author")
            author = paid_content.get("fromUser", author)
            subscription = api.get_subscription(
                check=True, identifier=author["id"])
            if not subscription:
                subscription = create_subscription(author)
                authed.subscriptions.append(subscription)
            api_type = paid_content["responseType"].capitalize() + "s"
            api_media = getattr(subscription.scraped, api_type)
            api_media.append(paid_content)
        count = 0
        max_count = len(authed.subscriptions)
        for subscription in authed.subscriptions:
            string = f"Scraping - {subscription.username} | {count} / {max_count}"
            print(string)
            subscription.sessions = api.sessions
            username = subscription.username
            site_name = "OnlyFans"
            media_type = format_media_types()
            count += 1
            for api_type, paid_content in subscription.scraped:
                formatted_directories = format_directories(
                    j_directory, site_name, username,
                    metadata_directory_format, media_type, api_type)
                metadata_directory = formatted_directories["metadata_directory"]
                metadata_path = os.path.join(
                    metadata_directory, api_type + ".json")
                new_metadata = media_scraper(
                    paid_content, api, formatted_directories,
                    username, api_type)
                if new_metadata:
                    api_path = os.path.join(api_type, "")
                    new_metadata_object = process_metadata(
                        api, new_metadata, formatted_directories,
                        subscription, api_type, api_path,
                        metadata_path, site_name)
                    new_metadata_set = new_metadata_object.convert()
                    if export_metadata:
                        export_archive(new_metadata_set,
                                       metadata_path, json_settings)
def account_setup(api):
    # Log in, resume the performer's mass-message archive if applicable, and
    # warm up the subscription list.
    status = False
    auth = api.login()
    if auth:
        profile_directory = json_global_settings["profile_directories"][0]
        profile_directory = os.path.abspath(profile_directory)
        profile_directory = os.path.join(profile_directory, auth["username"])
        profile_metadata_directory = os.path.join(
            profile_directory, "Metadata")
        metadata_filepath = os.path.join(
            profile_metadata_directory, "Mass Messages.json")
        if auth["isPerformer"]:
            imported = import_archive(metadata_filepath)
            mass_messages = api.get_mass_messages(resume=imported)
            export_archive(mass_messages, metadata_filepath, json_settings)
        # chats = api.get_chats()
        subscriptions = api.get_subscriptions()
        status = True
    return status
def account_setup(api: start, identifier=""):
    # Newer variant: log in, resume the mass-message archive for performers,
    # and fetch subscriptions only when the scrape_names job is enabled.
    status = False
    authed = api.login()
    if isinstance(authed, create_auth):
        jobs = json_settings["jobs"]
        profile_directory = json_global_settings["profile_directories"][0]
        profile_directory = os.path.abspath(profile_directory)
        profile_directory = os.path.join(profile_directory, authed.username)
        profile_metadata_directory = os.path.join(
            profile_directory, "Metadata")
        metadata_filepath = os.path.join(
            profile_metadata_directory, "Mass Messages.json")
        if authed.isPerformer:
            imported = import_archive(metadata_filepath)
            mass_messages = api.get_mass_messages(resume=imported)
            export_archive(mass_messages, metadata_filepath, json_settings)
        # chats = api.get_chats()
        if not identifier and jobs["scrape_names"]:
            # metadata_filepath = os.path.join(
            #     profile_metadata_directory, "Subscriptions.json")
            # imported = import_archive(metadata_filepath)
            subscriptions = api.get_subscriptions()
            # collection = []
            # for subscription in subscriptions:
            #     delattr(subscription, "download_info")
            #     delattr(subscription, "sessions")
            #     delattr(subscription, "scraped")
            #     delattr(subscription, "is_me")
            #     delattr(subscription, "links")
            #     collection.append(subscription)
            # collection = jsonpickle.encode(
            #     collection, unpicklable=False)
            # collection = jsonpickle.decode(collection)
            # export_archive(collection, metadata_filepath,
            #                json_settings)
        status = True
    return status
def paid_content_scraper(api):
    # Build a temporary subscription per purchased item, scrape and export its
    # metadata, then hand the subscription off to download_media.
    paid_contents = api.get_paid_content(refresh=False)
    results = []
    for paid_content in paid_contents:
        metadata_locations = {}
        author = paid_content.get("author")
        author = paid_content.get("fromUser", author)
        subscription = create_subscription(author)
        subscription.sessions = api.sessions
        subscription.download_info["directory"] = j_directory
        username = subscription.username
        model_directory = os.path.join(j_directory, username)
        api_type = paid_content["responseType"].capitalize() + "s"
        subscription.download_info["metadata_locations"] = metadata_locations
        site_name = "OnlyFans"
        media_type = format_media_types()
        formatted_directories = format_directories(
            j_directory, site_name, username,
            metadata_directory_format, media_type, api_type)
        metadata_directory = formatted_directories["metadata_directory"]
        metadata_path = os.path.join(metadata_directory, api_type + ".json")
        metadata_locations[api_type] = metadata_path
        new_metadata = media_scraper([paid_content], api,
                                     formatted_directories,
                                     username, api_type)
        for directory in new_metadata["directories"]:
            os.makedirs(directory, exist_ok=True)
        api_path = os.path.join(api_type, "")
        new_metadata_object = process_metadata(
            api, new_metadata, formatted_directories, subscription,
            api_type, api_path, metadata_path, site_name)
        new_metadata_set = new_metadata_object.convert()
        if export_metadata:
            export_archive(new_metadata_set, metadata_path, json_settings)
        download_media(api, subscription)
    return results
def paid_content_scraper(api):
    # Older variant that merges new metadata with any previously exported
    # archive via compare_metadata before downloading.
    paid_contents = api.get_paid_content(refresh=False)
    results = []
    for paid_content in paid_contents:
        author = paid_content.get("author")
        author = paid_content.get("fromUser", author)
        subscription = create_subscription(author)
        subscription.sessions = api.sessions
        subscription.download_info["directory"] = j_directory
        username = subscription.username
        model_directory = os.path.join(j_directory, username)
        metadata_folder = os.path.join(model_directory, "Metadata")
        api_type = paid_content["responseType"].capitalize()+"s"
        metadata_path = os.path.join(
            metadata_folder, api_type+".json")
        site_name = "OnlyFans"
        media_type = format_media_types()
        formatted_directories = main_helper.format_directories(
            j_directory, site_name, username, media_type, api_type)
        new_item = media_scraper([paid_content], api,
                                 formatted_directories,
                                 username, api_type)
        for directory in new_item["directories"]:
            os.makedirs(directory, exist_ok=True)
        download_metadata = prepare_metadata(new_item).metadata
        subscription.set_scraped(api_type, download_metadata)
        metadata = prepare_metadata(new_item, export=True).metadata
        metadata = jsonpickle.encode(metadata, unpicklable=False)
        new_metadata = jsonpickle.decode(metadata)
        old_metadata = import_archive(metadata_path)
        if old_metadata:
            old_metadata = metadata_fixer(
                directory=metadata_path.replace(".json", ""),
                metadata_types=old_metadata)
            unrefined = compare_metadata(
                new_metadata, old_metadata, new_chain=True)
            unrefined = prepare_metadata(unrefined, export=True).metadata
            new_metadata = jsonpickle.encode(unrefined, unpicklable=False)
            new_metadata = jsonpickle.decode(new_metadata)
        results.append(new_metadata)
        os.makedirs(model_directory, exist_ok=True)
        a = export_archive(new_metadata, metadata_path, json_settings)
        x = download_media(api, subscription)
    return results
def prepare_scraper(session, site_name, only_links, link, locations, directory,
                    api_count, username, api_type, app_token):
    # Build the paginated link set for the chosen api_type, scrape each page in
    # a thread pool, then dedupe/group the results and export metadata.
    seperator = " | "
    user_directory = ""
    metadata_directory = ""
    master_set = []
    media_set = []
    metadata_set = []
    original_link = link
    for location in locations:
        pool = ThreadPool()
        link = original_link
        print("Scraping [" + str(seperator.join(location[1])) +
              "]. Should take less than a minute.")
        array = format_directory(
            j_directory, site_name, username, location[0], api_type)
        user_directory = array[0]
        location_directory = array[2][0][1]
        metadata_directory = array[1]
        directories = array[2] + [location[1]]
        if not master_set:
            if api_type == "Posts":
                num = 100
                link = link.replace("limit=0", "limit=" + str(num))
                original_link = link
                ceil = math.ceil(api_count / num)
                a = list(range(ceil))
                for b in a:
                    b = b * num
                    master_set.append(
                        link.replace("offset=0", "offset=" + str(b)))
            if api_type == "Archived":
                ceil = math.ceil(api_count / 100)
                a = list(range(ceil))
                for b in a:
                    b = b * 100
                    master_set.append(
                        link.replace("offset=0", "offset=" + str(b)))

            def xmessages(link):
                f_offset_count = 0
                while True:
                    y = json_request(session, link)
                    if "list" in y:
                        if y["list"]:
                            master_set.append(link)
                            if y["hasMore"]:
                                f_offset_count2 = f_offset_count + 100
                                f_offset_count = f_offset_count2 - 100
                                link = link.replace(
                                    "offset=" + str(f_offset_count),
                                    "offset=" + str(f_offset_count2))
                                f_offset_count = f_offset_count2
                            else:
                                break
                        else:
                            break
                    else:
                        break

            def process_chats(subscriber):
                fool = subscriber["withUser"]
                fool_id = str(fool["id"])
                link_2 = "https://onlyfans.com/api2/v2/chats/"+fool_id + \
                    "/messages?limit=100&offset=0&order=desc&app-token="+app_token+""
                xmessages(link_2)
            if api_type == "Messages":
                xmessages(link)
            if api_type == "Mass Messages":
                results = []
                max_threads = multiprocessing.cpu_count()
                offset_count = 0
                offset_count2 = max_threads
                while True:
                    def process_messages(link, session):
                        y = json_request(session, link)
                        if y and "error" not in y:
                            return y
                        else:
                            return []
                    link_list = [
                        link.replace("offset=0", "offset=" + str(i * 30))
                        for i in range(offset_count, offset_count2)
                    ]
                    link_list = pool.starmap(process_messages,
                                             product(link_list, [session]))
                    if all(not result for result in link_list):
                        break
                    link_list2 = list(chain(*link_list))
                    results.append(link_list2)
                    offset_count = offset_count2
                    offset_count2 = offset_count * 2
                unsorted_messages = list(chain(*results))
                unsorted_messages.sort(key=lambda x: x["id"])
                messages = unsorted_messages

                def process_mass_messages(message, limit):
                    text = message["textCropped"].replace("&", "")
                    link_2 = "https://onlyfans.com/api2/v2/chats?limit="+limit + \
                        "&offset=0&filter=&order=activity&query=" + \
                        text+"&app-token="+app_token
                    y = json_request(session, link_2)
                    if None == y or "error" in y:
                        return []
                    return y
                limit = "10"
                if len(messages) > 99:
                    limit = "2"
                subscribers = pool.starmap(process_mass_messages,
                                           product(messages, [limit]))
                subscribers = filter(None, subscribers)
                subscribers = [
                    item for sublist in subscribers
                    for item in sublist["list"]
                ]
                seen = set()
                subscribers = [
                    x for x in subscribers
                    if x["withUser"]["id"] not in seen
                    and not seen.add(x["withUser"]["id"])
                ]
                x = pool.starmap(process_chats, product(subscribers))
            if api_type == "Stories":
                master_set.append(link)
            if api_type == "Highlights":
                r = json_request(session, link)
                if "error" in r:
                    break
                for item in r:
                    link2 = "https://onlyfans.com/api2/v2/stories/highlights/" + \
                        str(item["id"])+"?app-token="+app_token+""
                    master_set.append(link2)
        x = pool.starmap(
            media_scraper, product(master_set, [session],
                                   [directories], [username], [api_type]))
        results = format_media_set(location[0], x)
        seen = set()
        results["valid"] = [
            x for x in results["valid"]
            if x["filename"] not in seen and not seen.add(x["filename"])
        ]
        seen = set()
        location_directories = [
            x["directory"] for x in results["valid"]
            if x["directory"] not in seen and not seen.add(x["directory"])
        ]
        if results["valid"]:
            results["valid"] = [
                list(g) for k, g in groupby(
                    results["valid"], key=lambda x: x["post_id"])
            ]
            os.makedirs(directory, exist_ok=True)
            for location_directory in location_directories:
                os.makedirs(location_directory, exist_ok=True)
        if results["invalid"]:
            results["invalid"] = [
                list(g) for k, g in groupby(
                    results["invalid"], key=lambda x: x["post_id"])
            ]
        if sort_free_paid_posts:
            ofsorter.sorter(user_directory, api_type, location[0], results)
        metadata_set.append(results)
        media_set.append(results)
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = os.path.join(
                    user_directory, api_type, "Metadata")
                if delete_legacy_metadata:
                    if os.path.isdir(legacy_metadata):
                        shutil.rmtree(legacy_metadata)
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            export_archive(metadata_set, archive_directory, json_settings)
    return [media_set, directory]
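# Hedged, isolated sketch of the dedupe-and-group idiom used in the function
# above: an order-preserving "seen" set built inside a list comprehension
# (set.add() returns None, so the `not seen.add(...)` clause only fires for
# unseen keys), followed by groupby on post_id. The helper name
# `dedupe_and_group` is an assumption; the explicit sort is added here because
# itertools.groupby only merges adjacent records, whereas the original relies
# on its input already being ordered.
from itertools import groupby

def dedupe_and_group(items):
    seen = set()
    # Keep only the first occurrence of each filename, preserving order.
    unique = [x for x in items
              if x["filename"] not in seen and not seen.add(x["filename"])]
    # Sort so groupby produces one group per post_id.
    unique.sort(key=lambda x: x["post_id"])
    return [list(g) for _, g in groupby(unique, key=lambda x: x["post_id"])]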
def prepare_scraper(sessions, site_name, item):
    # Build paginated links, scrape them with retries until no pages come back
    # faulty, then format the media set and export metadata.
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = multiprocessing()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    if api_type == "Posts":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r["list"]:
            link2 = "https://stars.avn.com/api2/v2/stories/collections/" + \
                str(item["id"])
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = {}
    media_set["set"] = []
    media_set["found"] = False
    count = len(master_set2)
    max_attempts = 100
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: " + str(attempt + 1) + "/" + str(max_attempts))
        media_set2 = pool.starmap(
            media_scraper, product(master_set2, [sessions],
                                   [formatted_directories],
                                   [username], [api_type]))
        media_set["set"].extend(media_set2)
        faulty = [x for x in media_set2 if not x]
        if not faulty:
            print("Found: " + api_type)
            media_set["found"] = True
            break
        else:
            if count < 2:
                break
            num = len(faulty) * 100
            print("Missing " + str(num) + " Posts... Retrying...")
            master_set2 = main_helper.restore_missing_data(
                master_set2, media_set2)
    if not media_set["found"]:
        print("No " + api_type + " Found.")
    media_set = media_set["set"]
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
                if delete_legacy_metadata:
                    if os.path.isdir(legacy_metadata):
                        shutil.rmtree(legacy_metadata)
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(metadata_set, archive_directory,
                                       json_settings)
    return [media_set, directory]
def prepare_scraper(api: start, site_name, item):
    # Newer variant driven by subscription methods: gather the master set per
    # api_type, scrape it in a pool, then process and export the metadata.
    authed = api.auth
    api_type = item["api_type"]
    api_array = item["api_array"]
    subscription = api_array["subscription"]
    media_type = api_array["media_types"]
    username = api_array["username"]
    master_set = []
    pool = multiprocessing()
    formatted_directories = format_directories(
        j_directory, site_name, username,
        metadata_directory_format, media_type, api_type)
    legacy_model_directory = formatted_directories["legacy_model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    download_directory = formatted_directories["download_directory"]
    if api_type == "Profile":
        profile_scraper(api, site_name, api_type, username,
                        maximum_length, download_directory)
        return True
    if api_type == "Stories":
        master_set = subscription.get_stories()
        highlights = subscription.get_highlights()
        valid_highlights = []
        for highlight in highlights:
            if "error" == highlight:
                continue
            highlight = subscription.get_highlights(
                hightlight_id=highlight["id"])
            valid_highlights.append(highlight)
        master_set.extend(valid_highlights)
    if api_type == "Posts":
        master_set = subscription.get_posts()
    if api_type == "Archived":
        master_set = subscription.get_archived(api)
    if api_type == "Messages":
        unrefined_set = subscription.get_messages()
        unrefined_set = process_messages(api, subscription, unrefined_set)
        mass_messages = getattr(authed, "mass_messages")
        if subscription.is_me and mass_messages:
            unrefined_set2 = process_mass_messages(
                api, subscription, metadata_directory, mass_messages)
            unrefined_set += unrefined_set2
        master_set = [unrefined_set]
    master_set2 = master_set
    parent_type = ""
    if "Archived" == api_type:
        unrefined_set = []
        for master_set3 in master_set2:
            if not isinstance(master_set3, dict):
                continue
            parent_type = master_set3["type"]
            results = master_set3["results"]
            unrefined_result = pool.starmap(media_scraper, product(
                results, [api], [formatted_directories],
                [username], [api_type], [parent_type]))
            unrefined_set.append(unrefined_result)
        unrefined_set = list(chain(*unrefined_set))
    else:
        unrefined_set = pool.starmap(media_scraper, product(
            master_set2, [api], [formatted_directories],
            [username], [api_type], [parent_type]))
    unrefined_set = [x for x in unrefined_set]
    new_metadata = main_helper.format_media_set(unrefined_set)
    if not new_metadata:
        print("No "+api_type+" Found.")
        delattr(subscription.scraped, api_type)
    if new_metadata:
        metadata_path = os.path.join(metadata_directory, api_type+".json")
        api_path = os.path.join(api_type, parent_type)
        new_metadata_object = process_metadata(
            api, new_metadata, formatted_directories, subscription,
            api_type, api_path, metadata_path, site_name)
        new_metadata_set = new_metadata_object.export()
        if export_metadata:
            export_archive(new_metadata_set, metadata_path, json_settings)
    return True
def process_mass_messages(api: start, subscription, metadata_directory,
                          mass_messages) -> list:
    # Match each queued mass message against cached or freshly fetched chats,
    # refresh stale matches, and persist both the chats and the mass messages.
    def compare_message(queue_id, remote_messages):
        # Return the remote message that originated from this queue entry.
        for message in remote_messages:
            if "isFromQueue" in message and message["isFromQueue"]:
                if queue_id == message["queueId"]:
                    return message

    global_found = []
    chats = []
    session = api.sessions[0]
    salt = json_global_settings["random_string"]
    encoded = f"{session.ip}{salt}"
    encoded = encoded.encode('utf-8')
    hash = hashlib.md5(encoded).hexdigest()
    profile_directory = json_global_settings["profile_directories"][0]
    profile_directory = os.path.abspath(profile_directory)
    profile_directory = os.path.join(profile_directory, subscription.username)
    profile_metadata_directory = os.path.join(profile_directory, "Metadata")
    mass_message_path = os.path.join(
        profile_metadata_directory, "Mass Messages.json")
    chats_path = os.path.join(profile_metadata_directory, "Chats.json")
    if os.path.exists(chats_path):
        chats = import_archive(chats_path)
    date_object = datetime.today()
    date_string = date_object.strftime("%d-%m-%Y %H:%M:%S")
    for mass_message in mass_messages:
        if "status" not in mass_message:
            mass_message["status"] = ""
        if "found" not in mass_message:
            mass_message["found"] = {}
        if "hashed_ip" not in mass_message:
            mass_message["hashed_ip"] = ""
        mass_message["hashed_ip"] = mass_message.get("hashed_ip", hash)
        mass_message["date_hashed"] = mass_message.get(
            "date_hashed", date_string)
        if mass_message["isCanceled"]:
            continue
        queue_id = mass_message["id"]
        text = mass_message["textCropped"]
        text = html.unescape(text)
        mass_found = mass_message["found"]
        if mass_message["found"] or not mass_message["mediaType"]:
            continue
        identifier = None
        if chats:
            list_chats = chats
            for chat in list_chats:
                identifier = chat["identifier"]
                messages = chat["messages"]["list"]
                mass_found = compare_message(queue_id, messages)
                if mass_found:
                    mass_message["found"] = mass_found
                    mass_message["status"] = True
                    break
        if not mass_found:
            list_chats = subscription.search_messages(text=text, limit=2)
            if not list_chats:
                continue
            for item in list_chats["list"]:
                user = item["withUser"]
                identifier = user["id"]
                messages = []
                print("Getting Messages")
                keep = ["id", "username"]
                list_chats2 = [
                    x for x in chats if x["identifier"] == identifier]
                if list_chats2:
                    chat2 = list_chats2[0]
                    messages = chat2["messages"]["list"]
                    messages = subscription.get_messages(
                        identifier=identifier, resume=messages)
                    for message in messages:
                        message["withUser"] = {
                            k: item["withUser"][k] for k in keep}
                        message["fromUser"] = {
                            k: message["fromUser"][k] for k in keep}
                    mass_found = compare_message(queue_id, messages)
                    if mass_found:
                        mass_message["found"] = mass_found
                        mass_message["status"] = True
                        break
                else:
                    item2 = {}
                    item2["identifier"] = identifier
                    item2["messages"] = subscription.get_messages(
                        identifier=identifier)
                    chats.append(item2)
                    messages = item2["messages"]["list"]
                    for message in messages:
                        message["withUser"] = {
                            k: item["withUser"][k] for k in keep}
                        message["fromUser"] = {
                            k: message["fromUser"][k] for k in keep}
                    mass_found = compare_message(queue_id, messages)
                    if mass_found:
                        mass_message["found"] = mass_found
                        mass_message["status"] = True
                        break
        if not mass_found:
            mass_message["status"] = False
    export_archive(chats, chats_path, json_settings)
    for mass_message in mass_messages:
        found = mass_message["found"]
        if found and found["media"]:
            user = found["withUser"]
            identifier = user["id"]
            date_hashed_object = datetime.strptime(
                mass_message["date_hashed"], "%d-%m-%Y %H:%M:%S")
            next_date_object = date_hashed_object + timedelta(days=1)
            if mass_message["hashed_ip"] != hash or date_object > next_date_object:
                print("Getting Message By ID")
                x = subscription.get_message_by_id(
                    identifier=identifier, identifier2=found["id"], limit=1)
                new_found = x["result"]["list"][0]
                new_found["withUser"] = found["withUser"]
                mass_message["found"] = new_found
                mass_message["hashed_ip"] = hash
                mass_message["date_hashed"] = date_string
            global_found.append(found)
    main_helper.export_archive(
        mass_messages, mass_message_path, json_settings)
    return global_found
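# Hedged, isolated restatement of the staleness check at the end of
# process_mass_messages above: a found mass message is re-fetched when it was
# hashed from a different IP (salted MD5, as the function computes) or when the
# record is more than a day old. The helper name `needs_refresh` and its
# parameters are assumptions for illustration only.
import hashlib
from datetime import datetime, timedelta

def needs_refresh(mass_message, current_ip, salt, now=None):
    # Recompute the salted MD5 of the current session IP.
    current_hash = hashlib.md5(
        f"{current_ip}{salt}".encode("utf-8")).hexdigest()
    now = now or datetime.today()
    hashed_at = datetime.strptime(
        mass_message["date_hashed"], "%d-%m-%Y %H:%M:%S")
    # Refresh on an IP mismatch or once the one-day window has passed.
    return (mass_message["hashed_ip"] != current_hash
            or now > hashed_at + timedelta(days=1))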
def prepare_scraper(session, site_name, only_links, link, locations,
                    directory, api_count, username, api_type):
    # Stars.AVN variant: build paginated links per api_type, scrape them in a
    # thread pool, dedupe the results and optionally export metadata.
    seperator = " | "
    master_set = []
    media_set = []
    original_link = link
    for location in locations:
        pool = ThreadPool()
        link = original_link
        print("Scraping [" + str(seperator.join(location[1])) +
              "]. Should take less than a minute.")
        array = format_directory(
            j_directory, site_name, username, location[0], api_type)
        user_directory = array[0]
        location_directory = array[2][0][1]
        metadata_directory = array[1]
        directories = array[2] + [location[1]]
        if not master_set:
            if api_type == "Posts":
                ceil = math.ceil(api_count / 100)
                a = list(range(ceil))
                for b in a:
                    b = b * 100
                    master_set.append(
                        link.replace("offset=0", "offset=" + str(b)))
            if api_type == "Archived":
                ceil = math.ceil(api_count / 100)
                a = list(range(ceil))
                for b in a:
                    b = b * 100
                    master_set.append(
                        link.replace("offset=0", "offset=" + str(b)))
            # def xmessages(link):
            #     f_offset_count = 0
            #     while True:
            #         y = json_request(session, link)
            #         if "list" in y:
            #             if y["list"]:
            #                 master_set.append(link)
            #                 if y["hasMore"]:
            #                     f_offset_count2 = f_offset_count+100
            #                     f_offset_count = f_offset_count2-100
            #                     link = link.replace(
            #                         "offset=" + str(f_offset_count), "offset=" + str(f_offset_count2))
            #                     f_offset_count = f_offset_count2
            #                 else:
            #                     break
            #             else:
            #                 break
            #         else:
            #             break

            # def process_chats(subscriber):
            #     fool = subscriber["withUser"]
            #     fool_id = str(fool["id"])
            #     link_2 = "https://onlyfans.com/api2/v2/chats/"+fool_id + \
            #         "/messages?limit=100&offset=0&order=desc&app-token="+app_token+""
            #     xmessages(link_2)
            # if api_type == "Messages":
            #     xmessages(link)
            # if api_type == "Mass Messages":
            #     messages = []
            #     offset_count = 0
            #     while True:
            #         y = json_request(session, link)
            #         if y:
            #             messages.append(y)
            #             offset_count2 = offset_count+99
            #             offset_count = offset_count2-99
            #             link = link.replace(
            #                 "offset=" + str(offset_count), "offset=" + str(offset_count2))
            #             offset_count = offset_count2
            #         else:
            #             break
            #     messages = list(chain(*messages))
            #     message_count = 0

            #     def process_mass_messages(message, limit):
            #         text = message["textCropped"].replace("&", "")
            #         link_2 = "https://onlyfans.com/api2/v2/chats?limit="+limit+"&offset=0&filter=&order=activity&query=" + \
            #             text+"&app-token="+app_token
            #         y = json_request(session, link_2)
            #         return y
            #     limit = "10"
            #     if len(messages) > 99:
            #         limit = "2"
            #     subscribers = pool.starmap(process_mass_messages, product(
            #         messages, [limit]))
            #     subscribers = [
            #         item for sublist in subscribers for item in sublist["list"]]
            #     seen = set()
            #     subscribers = [x for x in subscribers if x["withUser"]
            #                    ["id"] not in seen and not seen.add(x["withUser"]["id"])]
            #     x = pool.starmap(process_chats, product(
            #         subscribers))
            if api_type == "Stories":
                master_set.append(link)
            if api_type == "Highlights":
                r = json_request(session, link)
                if "error" in r:
                    break
                for item in r["list"]:
                    link2 = "https://stars.avn.com/api2/v2/stories/collections/" + \
                        str(item["id"])
                    master_set.append(link2)
        x = pool.starmap(
            media_scraper, product(master_set, [session],
                                   [directories], [username], [api_type]))
        results = format_media_set(location[0], x)
        seen = set()
        results["valid"] = [
            x for x in results["valid"]
            if x["filename"] not in seen and not seen.add(x["filename"])
        ]
        if results["valid"]:
            os.makedirs(directory, exist_ok=True)
            os.makedirs(location_directory, exist_ok=True)
            if export_metadata:
                os.makedirs(metadata_directory, exist_ok=True)
                archive_directory = os.path.join(metadata_directory, api_type)
                export_archive([results], archive_directory, json_settings)
        media_set.append(results)
    return [media_set, directory]
def prepare_scraper(sessions, site_name, item):
    # OnlyFans variant with app-token links: build the master link set per
    # api_type, scrape with retries, then format the media set and export
    # filtered metadata.
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    # legacy_metadata = main_helper.legacy_metadata(legacy_metadata_directory)
    if api_type == "Profile":
        profile_scraper(link, sessions[0], directory, username)
        return
    if api_type == "Posts":
        num = 100
        link = link.replace("limit=0", "limit="+str(num))
        original_link = link
        ceil = math.ceil(api_count / num)
        a = list(range(ceil))
        for b in a:
            b = b * num
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))

    def xmessages(link):
        f_offset_count = 0
        while True:
            y = main_helper.json_request(sessions[0], link)
            if not y:
                return
            if "list" in y:
                if y["list"]:
                    master_set.append(link)
                    if y["hasMore"]:
                        f_offset_count2 = f_offset_count+100
                        f_offset_count = f_offset_count2-100
                        link = link.replace(
                            "offset=" + str(f_offset_count),
                            "offset=" + str(f_offset_count2))
                        f_offset_count = f_offset_count2
                    else:
                        break
                else:
                    break
            else:
                break

    def process_chats(subscriber):
        fool = subscriber["withUser"]
        fool_id = str(fool["id"])
        link_2 = f"https://onlyfans.com/api2/v2/chats/{fool_id}/messages?limit=100&offset=0&order=desc&app-token={app_token}"
        xmessages(link_2)
    if api_type == "Messages":
        xmessages(link)
    if api_type == "Mass Messages":
        results = []
        max_threads = multiprocessing.cpu_count()
        offset_count = 0
        offset_count2 = max_threads
        while True:
            def process_messages(link, session):
                y = main_helper.json_request(session, link)
                if y and "error" not in y:
                    return y
                else:
                    return []
            link_list = [link.replace(
                "offset=0", "offset="+str(i*30))
                for i in range(offset_count, offset_count2)]
            link_list = pool.starmap(process_messages, product(
                link_list, [sessions[0]]))
            if all(not result for result in link_list):
                break
            link_list2 = list(chain(*link_list))
            results.append(link_list2)
            offset_count = offset_count2
            offset_count2 = offset_count*2
        unsorted_messages = list(chain(*results))
        unsorted_messages.sort(key=lambda x: x["id"])
        messages = unsorted_messages

        def process_mass_messages(message, limit):
            text = message["textCropped"].replace("&", "")
            link_2 = "https://onlyfans.com/api2/v2/chats?limit="+limit + \
                "&offset=0&filter=&order=activity&query=" + \
                text+"&app-token="+app_token
            y = main_helper.json_request(sessions[0], link_2)
            if None == y or "error" in y:
                return []
            return y
        limit = "10"
        if len(messages) > 99:
            limit = "2"
        subscribers = pool.starmap(process_mass_messages, product(
            messages, [limit]))
        subscribers = filter(None, subscribers)
        subscribers = [
            item for sublist in subscribers for item in sublist["list"]]
        seen = set()
        subscribers = [x for x in subscribers if x["withUser"]
                       ["id"] not in seen and not seen.add(x["withUser"]["id"])]
        x = pool.starmap(process_chats, product(subscribers))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r:
            link2 = f"https://onlyfans.com/api2/v2/stories/highlights/{item['id']}?app-token={app_token}"
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = []
    count = len(master_set2)
    max_attempts = 100
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: "+str(attempt+1)+"/"+str(max_attempts))
        media_set2 = pool.starmap(media_scraper, product(
            master_set2, [sessions], [formatted_directories],
            [username], [api_type]))
        media_set.extend(media_set2)
        if count > 1:
            faulty = [x for x in media_set2 if not x]
            if not faulty:
                print("Found: "+api_type)
                break
            else:
                num = len(faulty)*100
                print("Missing "+str(num)+" Posts... Retrying...")
                master_set2 = main_helper.restore_missing_data(
                    master_set2, media_set2)
        else:
            print("No "+api_type+" Found.")
            break
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(
                metadata_set, archive_directory, json_settings)
    return [media_set, directory]
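# Hedged, standalone sketch of the offset-pagination pattern repeated in the
# "Posts"/"Archived" branches above: raise the limit in the base link, compute
# the page count from the post total, and emit one URL per page by rewriting
# the offset parameter. The helper name `build_offset_links` is an assumption;
# it only uses math.ceil and str.replace, exactly as the originals do.
import math

def build_offset_links(link, api_count, limit=100):
    # Bump the page size in the base link.
    link = link.replace("limit=0", "limit=" + str(limit))
    pages = math.ceil(api_count / limit)
    # One link per page: offsets 0, limit, 2*limit, ...
    return [link.replace("offset=0", "offset=" + str(page * limit))
            for page in range(pages)]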
def prepare_scraper(api, site_name, item):
    # Older subscription-driven variant that reconciles new results with the
    # existing archive via metadata_fixer/compare_metadata before exporting.
    authed = api.auth
    sessions = api.sessions
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    subscription = api_array["subscription"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = multiprocessing()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    archive_directory = os.path.join(metadata_directory, api_type)
    archive_path = archive_directory+".json"
    imported = import_archive(archive_path)
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    if api_type == "Profile":
        profile_scraper(api, directory, username)
        return
    if api_type == "Stories":
        master_set = subscription.get_stories()
        highlights = subscription.get_highlights()
        valid_highlights = []
        for highlight in highlights:
            highlight = subscription.get_highlights(
                hightlight_id=highlight["id"])
            valid_highlights.append(highlight)
        master_set.extend(valid_highlights)
    if api_type == "Posts":
        master_set = subscription.get_posts()
    if api_type == "Archived":
        master_set = subscription.get_archived(api)
    if api_type == "Messages":
        unrefined_set = subscription.get_messages()
        if "list" in unrefined_set:
            unrefined_set = unrefined_set["list"]
        if subscription.is_me:
            mass_messages = authed["mass_messages"]
            unrefined_set2 = process_mass_message(
                api, subscription, metadata_directory, mass_messages)
            unrefined_set += unrefined_set2
        master_set = [unrefined_set]
    master_set2 = master_set
    parent_type = ""
    if "Archived" == api_type:
        unrefined_set = []
        for master_set3 in master_set2:
            parent_type = master_set3["type"]
            results = master_set3["results"]
            unrefined_result = pool.starmap(media_scraper, product(
                results, [api], [formatted_directories],
                [username], [api_type], [parent_type]))
            unrefined_set.append(unrefined_result)
        unrefined_set = list(chain(*unrefined_set))
    else:
        unrefined_set = pool.starmap(media_scraper, product(
            master_set2, [api], [formatted_directories],
            [username], [api_type], [parent_type]))
    unrefined_set = [x for x in unrefined_set]
    metadata_set = main_helper.format_media_set(unrefined_set)
    if not metadata_set:
        print("No "+api_type+" Found.")
        delattr(subscription.scraped, api_type)
    if metadata_set:
        if export_metadata:
            os.makedirs(metadata_directory, exist_ok=True)
            old_metadata = metadata_fixer(archive_directory)
            old_metadata_set = prepare_metadata(old_metadata).metadata
            old_metadata_set2 = jsonpickle.encode(
                old_metadata_set, unpicklable=False)
            old_metadata_set2 = jsonpickle.decode(old_metadata_set2)
            metadata_set = compare_metadata(metadata_set, old_metadata_set2)
            metadata_set = prepare_metadata(metadata_set).metadata
            metadata_set2 = jsonpickle.encode(metadata_set, unpicklable=False)
            metadata_set2 = jsonpickle.decode(metadata_set2)
            metadata_set2 = main_helper.filter_metadata(metadata_set2)
            metadata_set2 = legacy_metadata_fixer(
                legacy_metadata_directory, metadata_set2)
            main_helper.export_archive(
                metadata_set2, archive_directory, json_settings,
                legacy_directory=legacy_metadata_directory)
        else:
            metadata_set = prepare_metadata(metadata_set).metadata
        subscription = api.get_subscription(username)
        subscription.set_scraped(api_type, metadata_set)
    return [subscription.scraped]