def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    info = link_check(sessions[0], identifier)
    # Convert the user dict into attribute-style access.
    user = info["user"]
    user = json.loads(json.dumps(user),
                      object_hook=lambda d: SimpleNamespace(**d))
    if not info["exists"]:
        info["user"] = user
        return [False, info]
    is_me = user.is_me
    post_counts = info["count"]
    post_count = post_counts[0]
    user_id = str(user.id)
    avatar = user.avatar
    username = user.username
    link = user.link
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    if not info["subbed"]:
        print(f"You are not subbed to {user.username}")
        return [False, info]
    print("Name: " + username)
    api_array = scrape_choice(user_id, post_counts, is_me)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: " + api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: " + item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        api_type = item["api_type"]
        results = prepare_scraper(sessions, site_name, item)
        if results:
            for result in results[0]:
                if not only_links:
                    media_set = result
                    if not media_set["valid"]:
                        continue
                    directory = results[1]
                    location = result["type"]
                    info["download"].others.append(
                        [media_set["valid"], sessions, directory, username,
                         post_count, location, api_type])
    # When the profile is done scraping, this function returns True.
    print("Scrape Completed" + "\n")
    return [True, info]
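
# A minimal driver sketch for the session-based variant above. The calling
# convention ([success, info] return, "download" key) is taken from the
# function body; the `run_profiles` name and its arguments are illustrative
# assumptions, not a documented interface.
def run_profiles(api, identifiers, site_name):
    downloads = []
    for identifier in identifiers:
        success, info = start_datascraper(api, identifier, site_name)
        if not success:
            # Profile does not exist, or the account is not subscribed to it.
            continue
        downloads.append(info["download"])
    return downloads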
def start_datascraper(session, board_name, site_name, link_type, choice_type=None):
    print("Scrape Processing")
    info = link_check(session, board_name)
    if not info["exists"]:
        return [False, info]
    print("Board: " + board_name)
    array = scrape_choice(board_name)
    # `multiprocessing` is expected to be a helper that returns a worker pool
    # (not the stdlib module, which is not callable).
    pool = multiprocessing()
    threads = board_scraper(session, array[0], "")
    archive_threads = board_scraper(session, array[1], "archive")
    threads = threads + archive_threads
    original_count = len(threads)
    print("Original Count: " + str(original_count))
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, board_name)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    directory = model_directory
    print("Scraping Threads")
    threads = pool.starmap(thread_scraper, product(
        threads, [board_name], [session], [directory]))
    threads = [x for x in threads if x is not None]
    post_count = len(threads)
    print("Valid Count: " + str(post_count))
    print("Downloading Media")
    # Invalid threads were already filtered out above, so count them by
    # difference rather than rescanning the filtered list.
    print("Invalid Count: " + str(original_count - post_count))
    num = random.randrange(0, 200)
    avatar = f"https://s.4cdn.org/image/title/{num}.png"
    link = info["link"]
    info["download"] = prepare_download.start(
        username=board_name, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    info["download"].others.append([threads, session, directory, board_name])
    # When the board is done scraping, this function returns True.
    return [True, info]
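
# The board variant fans thread scraping out with
# pool.starmap(thread_scraper, product(threads, [board_name], ...)), pairing
# every thread with the same fixed arguments. A self-contained sketch of that
# pattern with a toy worker; the names here are illustrative only.
from itertools import product
from multiprocessing import Pool

def toy_worker(thread_id, board, tag):
    # Stand-in for thread_scraper: returns None for "invalid" threads.
    return f"{board}/{tag}/{thread_id}" if thread_id % 2 == 0 else None

def demo_fanout(thread_ids, board):
    with Pool() as pool:
        results = pool.starmap(toy_worker,
                               product(thread_ids, [board], ["demo"]))
    # Mirror the filtering step above: drop workers that returned None.
    return [x for x in results if x is not None]

# demo_fanout(range(6), "g") -> ['g/demo/0', 'g/demo/2', 'g/demo/4']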
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username)
    metadata_directory = formatted_directories["metadata_directory"]
    archive_path = os.path.join(metadata_directory, "Mass Messages.json")
    if subscription.is_me:
        # Resume mass-message scraping from the previously exported archive.
        imported = import_archive(archive_path)
        mass_messages = api.get_mass_messages(resume=imported)
        export_archive(mass_messages, archive_path,
                       json_settings, rename=False)
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    print("Name: " + username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: " + api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: " + item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(api, site_name, item)
    if any(x for x in subscription.scraped):
        subscription.download_info["directory"] = j_directory
        subscription.download_info["model_directory"] = os.path.join(
            j_directory, username)
        subscription.download_info["webhook"] = webhook
    print("Scrape Completed" + "\n")
    return [True, info]
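
# The mass-message block above resumes from a prior export via
# import_archive/export_archive, which are not defined here. A minimal
# JSON-backed sketch matching how they are called; these are hypothetical
# implementations that ignore json_settings and the rename flag.
import json
import os

def import_archive(archive_path):
    # Return previously archived items, or an empty list on the first run.
    if os.path.exists(archive_path):
        with open(archive_path, "r", encoding="utf-8") as file:
            return json.load(file)
    return []

def export_archive(items, archive_path, json_settings, rename=False):
    # Persist the merged result so the next run can resume from it.
    os.makedirs(os.path.dirname(archive_path), exist_ok=True)
    with open(archive_path, "w", encoding="utf-8") as file:
        json.dump(items, file, indent=2)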
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    print("Name: " + username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: " + api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    metadata_locations = {}
    for item in apis:
        print("Type: " + item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(api, site_name, item)
    print("Scrape Completed" + "\n")
    return [True, info]
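
# All four variants share the same menu convention after format_options: a
# non-zero input index picks a single scraper, while 0 keeps every entry
# after dropping the first one, which appears to be an "All" placeholder
# (an assumption inferred from the code, not confirmed by it). A compact
# restatement of that selection logic:
def select_apis(apis, value):
    if value:
        return [apis[value]]
    return apis[1:]  # drop the assumed "All" placeholder, keep the rest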