def get_info(username):
    """Crawl one profile, save it as JSON and return the extracted data.

    A Chrome driver is started from the module-level ``chrome_options`` and
    ``capabilities`` (both defined outside this block). If
    ``Settings.login_username`` is set, the session logs in first. The
    profile is then extracted with ``extract_information`` and handed to
    ``Datasaver.save_profile_json``.

    Args:
        username: Profile name passed to ``extract_information`` and
            ``Datasaver.save_profile_json``.

    Returns:
        The first value returned by ``extract_information``. An empty list
        is returned when the run is interrupted with Ctrl-C before the
        extraction finished.

    Raises:
        SystemExit: With status 1 when the driver cannot be started or when
            login/extraction fails.
    """
    try:
        browser = init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        # A failed start is an error: exit non-zero so callers can detect it.
        sys.exit(1)

    information = []
    try:
        try:
            if Settings.login_username:
                login(browser, Settings.login_username,
                      Settings.login_password)
            # The second element (commenting users) is not used here.
            information, _ = extract_information(
                browser, username, Settings.limit_amount)
        except Exception as exc:
            # Only real errors are handled here; KeyboardInterrupt must
            # reach the outer handler so Ctrl-C reports "Aborted...".
            print("Error with user " + username)
            print(exc)
            sys.exit(1)

        Datasaver.save_profile_json(username, information)
    except KeyboardInterrupt:
        print('Aborted...')
    finally:
        # Close the browser even if clearing the cookies fails.
        try:
            browser.delete_all_cookies()
        finally:
            browser.close()

    return information
def main():
    """Crawl the post links of one profile and store them in a database.

    Command line arguments:
        ``sys.argv[1]``: profile name to crawl (required).
        ``sys.argv[2]``: database path handed to ``DatabaseAPI`` (required).
        ``sys.argv[3]``: proxy passed to Chrome as ``--proxy-server``
            (optional).

    The profile row is written with ``DatabaseAPI.insert_profile`` and one
    row per collected link with ``DatabaseAPI.insert_post``.

    Raises:
        SystemExit: With a usage message when fewer than two arguments are
            given, and with status 1 when the driver cannot be started or
            the crawl/insert step fails.
    """
    if len(sys.argv) < 3:
        sys.exit('- Please provide profile to crawl and DB path!\n')

    user_name = sys.argv[1]
    db_path = sys.argv[2]
    proxy = sys.argv[3] if len(sys.argv) > 3 else None

    chrome_options = Options()
    if proxy is not None:
        chrome_options.add_argument('--proxy-server={}'.format(proxy))
    chrome_options.add_argument('--dns-prefetch-disable')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--lang=en-US')
    chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option(
        'prefs', {'intl.accept_languages': 'en-US'})

    capabilities = DesiredCapabilities.CHROME

    Settings.sleep_time_between_post_scroll = 3
    Settings.sleep_time_between_comment_loading = 3

    try:
        browser = init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        # A failed start is an error: exit non-zero so callers can detect it.
        sys.exit(1)

    try:
        print('Extracting posts links from ' + user_name)
        try:
            user_info, indexed_links, preview_images = \
                extract_user_posts_links(
                    browser, user_name, Settings.limit_amount)

            db = DatabaseAPI(db_path)
            db.insert_profile(
                user_name,
                user_info['bio'],
                user_info['bio_url'],
                user_info['alias'],
                user_info['num_of_posts'],
                int(user_info['followers']['count']),
                int(user_info['following']['count']),
                1 if user_info['isprivate'] else 0)

            for link, index in indexed_links.items():
                db.insert_post(
                    user_name, link, index, preview_images[link], '', 0, 0)
        except Exception as e:
            print("Error with user '{}': {}".format(user_name, e))
            sys.exit(1)

        print("\nFinished crawling profile links.")
    except KeyboardInterrupt:
        print('Aborted...')
    finally:
        # NOTE(review): the delete_all_cookies() call was commented out in
        # the original; confirm that keeping the cookies is intentional.
        browser.close()
def find_real_fans(target_user='******'):
    """Crawl every follower of ``target_user`` and export a summary table.

    The follower names come from ``grab_followers``. Each follower profile
    is extracted with ``extract_information``, saved with
    ``Datasaver.save_profile_json`` and summarised in one row of a
    DataFrame. The table is written as a tab-separated file named
    ``real_fans_of_<target_user>.csv`` in the current working directory.

    Args:
        target_user: Account whose followers are crawled.

    Returns:
        pandas.DataFrame with the columns ``alias``, ``private``,
        ``num_posts``, ``num_followers`` and ``num_following``, one row per
        follower that was extracted before the run ended.

    Raises:
        SystemExit: With status 1 when the driver cannot be started or when
            extraction fails for a follower.
    """
    columns = ['alias', 'private', 'num_posts', 'num_followers',
               'num_following']

    followers_list = grab_followers(target_user)
    # Pause before opening the crawling session.
    # NOTE(review): presumably rate limiting; confirm the 30 s value.
    sleep(30)
    fan_list = {}

    try:
        browser = init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        # A failed start is an error: exit non-zero so callers can detect it.
        sys.exit(1)

    try:
        login(browser, Settings.login_username, Settings.login_password)

        for user in followers_list:
            print('Extracting information from ' + user)
            try:
                information = extract_information(browser, user)
            except Exception as exc:
                # Catch Exception rather than BaseException so Ctrl-C
                # reaches the outer KeyboardInterrupt handler.
                print("Error with user " + user)
                print(exc)
                sys.exit(1)

            # NOTE(review): other call sites unpack extract_information() as
            # (information, commenters). Accept both shapes so the dict
            # lookups below work; confirm the actual return type.
            if isinstance(information, tuple):
                information = information[0]

            fan_list[user] = information
            Datasaver.save_profile_json(user, information)

        print("\nFinished.\n")
    except KeyboardInterrupt:
        print('Aborted...')
    finally:
        # Close the browser even if clearing the cookies fails.
        try:
            browser.delete_all_cookies()
        finally:
            browser.close()

    # Collect plain rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for idx, (alias, profile) in enumerate(fan_list.items()):
        info = [
            alias,
            profile['isprivate'],
            profile['num_of_posts'],
            profile['followers']['count'],
            profile['following']['count'],
        ]
        rows.append(info)
        print(idx, info)

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('real_fans_of_{}.csv'.format(target_user),
              sep='\t', encoding='utf-8')
    return df
def _create_proxy_browser(proxy_ip=None):
    """Start a headless Chrome driver, optionally routed through a proxy.

    Args:
        proxy_ip: Value for Chrome's ``--proxy-server`` switch. When
            ``None`` no proxy switch is added.

    Returns:
        Whatever ``init_chromedriver`` returns for the configured options
        and ``DesiredCapabilities.CHROME``.

    Raises:
        SystemExit: With status 1 when ``init_chromedriver`` raises; the
            error is printed first.
    """
    chrome_options = Options()
    if proxy_ip is not None:
        chrome_options.add_argument('--proxy-server={}'.format(proxy_ip))
    chrome_options.add_argument('--dns-prefetch-disable')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--lang=en-US')
    chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option(
        'prefs', {'intl.accept_languages': 'en-US'})

    capabilities = DesiredCapabilities.CHROME

    try:
        return init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        # A failed start is an error: exit non-zero so callers can detect it.
        sys.exit(1)
prefs = { 'profile.managed_default_content_settings.images': 2, 'disk-cache-size': 4096 } chromeOptions.add_experimental_option("prefs", prefs) chrome_options.add_argument('--dns-prefetch-disable') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--lang=en-US') chrome_options.add_argument('--headless') chrome_options.add_experimental_option('prefs', {'intl.accept_languages': 'en-US'}) capabilities = DesiredCapabilities.CHROME try: browser = init_chromedriver(chrome_options, capabilities) except Exception as exc: print(exc) sys.exit() try: usernames = get_all_user_names() for username in usernames: print('Extracting information from ' + username) information = [] user_commented_list = [] try: if len(Settings.login_username) != 0: login(browser, Settings.login_username, Settings.login_password)