def collect_user_relatinoships_by_user_ids(call, user_ids_config_filepath, output_folder, config):
    '''
    user_ids_config = {"current_ix": 0, "users": ["2969995619"]}
    '''
    apikeys = list(config['apikeys'].values()).pop()

    user_ids_config = {}
    with open(os.path.abspath(user_ids_config_filepath), 'r') as user_ids_config_rf:
        user_ids_config = json.load(user_ids_config_rf)

    current_ix = user_ids_config['current_ix'] if ('current_ix' in user_ids_config) else 0

    user_ids = user_ids_config['users'][current_ix:]
    total = len(user_ids)

    for user_id in user_ids:
        try:
            twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            twitterCralwer.fetch_user_relationships(call=call, user_id=user_id)
            current_ix += 1  # one at a time... no choice
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            pass

        user_ids_config['current_ix'] = current_ix
        flash_cmd_config(user_ids_config, user_ids_config_filepath, output_folder)

        logger.info('COMPLETED -> (current_ix: [%d/%d])' % (current_ix, total))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
    else:
        logger.info('[%s] ALL COMPLETED' % (call))
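# A minimal usage sketch for collect_user_relatinoships_by_user_ids, assuming this
# module's globals (TwitterCrawler, CLIENT_ARGS, WAIT_TIME, flash_cmd_config, ...) are
# available. The user-ids file follows the {"current_ix": ..., "users": [...]} shape
# from the docstring above; the credential keys are a hypothetical Twython-style set,
# and '/friends/ids' mirrors the call value used in collect_users_all further down.
import json

example_user_ids_config = {"current_ix": 0, "users": ["2969995619"]}
with open('user_ids.json', 'w') as wf:
    json.dump(example_user_ids_config, wf)

example_config = {
    "apikeys": {
        "default": {  # hypothetical credential set; key names are assumptions
            "app_key": "...", "app_secret": "...",
            "oauth_token": "...", "oauth_token_secret": "..."
        }
    }
}

collect_user_relatinoships_by_user_ids(
    call='/friends/ids',  # assumed call value, as in collect_users_all below
    user_ids_config_filepath='user_ids.json',
    output_folder='./output',
    config=example_config)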
def collect_tweets_by_ids(tweet_ids_config_filepath, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    tweet_ids_config = {}
    with open(os.path.abspath(tweet_ids_config_filepath), 'r') as tweet_ids_config_rf:
        tweet_ids_config = json.load(tweet_ids_config_rf)

    max_range = 100  # statuses/lookup accepts at most 100 ids per request
    current_ix = tweet_ids_config['current_ix'] if ('current_ix' in tweet_ids_config) else 0

    total = len(tweet_ids_config['tweet_ids'][current_ix:])
    tweet_id_chuncks = util.chunks(tweet_ids_config['tweet_ids'][current_ix:], max_range)

    for tweet_ids in tweet_id_chuncks:
        try:
            twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            twitterCralwer.lookup_tweets_by_ids(tweet_ids)
            current_ix += len(tweet_ids)
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            # Ctrl+C is not handled; on restart the crawl resumes from the last
            # persisted chunk, so duplicate tweets are possible and accepted.
            pass

        tweet_ids_config['current_ix'] = current_ix
        flash_cmd_config(tweet_ids_config, tweet_ids_config_filepath, output_folder)

        logger.info('COMPLETED -> (current_ix: [%d/%d])' % (current_ix, total))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
    else:
        logger.info('[tweets_by_ids] ALL COMPLETED')
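# util.chunks is not shown in these snippets; the loop above assumes it yields
# successive fixed-size slices (here at most 100 tweet ids, the per-request limit of
# the statuses/lookup endpoint). A minimal stand-in under that assumption:
def chunks(items, size):
    """Yield successive slices of `items` with at most `size` elements each."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# e.g. list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]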
def main():
    g = GoogleNewsCrawler()
    g.crawl_topnews('en')
    #g.crawl_topnews('es')
    t = TwitterCrawler()
    t.crawl_news()
def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
    mysql_config = config['mysql_config']
    logger.info("got mysql_config")
    handler_configs = [mysql_config]

    crawler_id = apikeys['app_key']
    logger.debug('creating a new crawler: %s' % crawler_id)

    if not crawler_proxies:
        crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None

    #logger.info("write_to_handlers in scheduler: " + str(len(write_to_handlers)))
    crawler = TwitterCrawler(node_id, crawler_id, copy.copy(apikeys), handler_configs=handler_configs,
                             redis_config=copy.copy(config['redis_config']), proxies=crawler_proxies)

    if crawler_id in self.crawlers:
        # self.crawlers[crawler_id].clear()
        del self.crawlers[crawler_id]

    self.crawlers[crawler_id] = {
        'apikeys': apikeys,
        'crawler': crawler,
        'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
        'crawler_proxies': crawler_proxies
    }

    logger.info("before starting crawler")
    crawler.start()
def collect_users_all(user_screen_names_ids, deep, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    if user_screen_names_ids.isdigit():
        call = 'user_id'
    else:
        call = 'screen_name'

    try:
        users = [user_screen_names_ids.strip('@')]
        twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS)
        ret = twitterCralwer.fetch_users(call=call, users=users, output_folder=output_folder)
        if ret == 'ok':
            since_id = 1
            ret, since_id, remove = twitterCralwer.fetch_user_timeline(user_screen_names_ids, since_id=since_id)
            if ret == 'ok':
                call = '/friends/ids'
                ret = twitterCralwer.fetch_user_relationships(call=call, user_id=user_screen_names_ids.strip('@'), deep=deep)
    except Exception as exc:
        logger.error(exc)
        return 'error'

    logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
    time.sleep(WAIT_TIME)
    return ret
def collect_tweets_by_search_terms(search_configs_filepath, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    search_configs = {}
    with open(os.path.abspath(search_configs_filepath), 'r') as search_configs_rf:
        search_configs = json.load(search_configs_rf)

    for search_config_id in itertools.cycle(search_configs):
        search_config = search_configs[search_config_id]
        search_terms = [term.lower() for term in search_config['terms']]
        querystring = '%s' % (' OR '.join('(' + term + ')' for term in search_terms))
        since_id = search_config['since_id'] if 'since_id' in search_config else 0
        geocode = tuple(search_config['geocode']) if ('geocode' in search_config and search_config['geocode']) else None

        logger.info('REQUEST -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])' %
                    (util.md5(querystring.encode('utf-8')), since_id, geocode))

        try:
            twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            since_id = twitterCralwer.search_by_query(querystring, geocode=geocode, since_id=since_id)
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            pass

        search_config['since_id'] = since_id
        search_config['querystring'] = querystring
        search_config['geocode'] = geocode

        search_configs[search_config_id] = search_config
        flash_cmd_config(search_configs, search_configs_filepath, output_folder)

        logger.info('COMPLETED -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])' %
                    (util.md5(querystring.encode('utf-8')), since_id, geocode))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
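# Illustration of the query built above: each term is lowercased, parenthesized, and
# joined with OR. The config entry below is hypothetical; since_id tracks the newest
# tweet already seen, and geocode, if present, is passed through as a tuple
# (conventionally latitude, longitude, radius for the Twitter search API).
search_config = {"terms": ["Flu", "Influenza", "#FluShot"], "since_id": 0}
search_terms = [term.lower() for term in search_config['terms']]
querystring = ' OR '.join('(' + term + ')' for term in search_terms)
print(querystring)  # (flu) OR (influenza) OR (#flushot)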
def collect_tweets_by_user_ids(user_id, output_folder, config, since_id=1):
    apikeys = list(config['apikeys'].values()).pop()

    try:
        twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
        ret, since_id, remove = twitterCralwer.fetch_user_timeline(user_id, since_id=since_id)
    except Exception as exc:
        logger.error(exc)
        return 'error'

    logger.info('COMPLETED -> (user_id: [%s]; since_id: [%d]; remove: [%s])' % (user_id, since_id, remove))
    logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME_TREETS)
    time.sleep(WAIT_TIME_TREETS)
    return ret
def collect_user_relatinoships_by_user_ids(call, user_id, deep, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    try:
        twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
        ret = twitterCralwer.fetch_user_relationships(call=call, user_id=user_id.strip('@'), deep=deep)
    except Exception as exc:
        logger.error(exc)
        return 'error'

    logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME_RELATIONS)
    time.sleep(WAIT_TIME_RELATIONS)
    return ret
def collect_tweets_by_user_ids(users_config_filepath, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    users_config = {}
    with open(os.path.abspath(users_config_filepath), 'r') as users_config_rf:
        users_config = json.load(users_config_rf)

    for user_config_id in itertools.cycle(users_config):
        user_config = users_config[user_config_id]

        if ('remove' in user_config and user_config['remove']):
            continue

        user_id = user_config['user_id']
        since_id = user_config['since_id'] if 'since_id' in user_config else 1

        logger.info('REQUEST -> (user_id: [%d]; since_id: [%d])' % (user_id, since_id))

        remove = False
        try:
            twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            since_id, remove = twitterCralwer.fetch_user_timeline(user_id, since_id=since_id)
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            pass

        user_config['since_id'] = since_id
        user_config['remove'] = remove

        users_config[user_config_id] = user_config
        flash_cmd_config(users_config, users_config_filepath, output_folder)

        logger.info('COMPLETED -> (user_id: [%d]; since_id: [%d]; remove: [%s])' % (user_id, since_id, remove))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
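# The users config consumed above is assumed to be a JSON object keyed by an arbitrary
# entry id, where each entry carries a numeric user_id, the last since_id fetched, and
# a remove flag that, once true, makes the loop skip the entry. A hypothetical example
# (the first id comes from the docstring earlier in these snippets, the second is a
# placeholder):
import json

example_users_config = {
    "1": {"user_id": 2969995619, "since_id": 1, "remove": False},
    "2": {"user_id": 1234567890, "since_id": 1, "remove": False}
}
with open('users.json', 'w') as wf:
    json.dump(example_users_config, wf)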
def crawl_twitter(self, consumer_key, consumer_secret, access_token, access_token_secret, users_txt):
    """
    Run the Twitter crawler.

    Args:
        consumer_key (str)
        consumer_secret (str)
        access_token (str)
        access_token_secret (str)
        users_txt (str)
    """
    crawler = TwitterCrawler(consumer_key, consumer_secret, access_token, access_token_secret, self._db_interface)
    accounts_list = self.io.read(users_txt)
    crawler.run(accounts_list)
def collect_users(user_screen_names_ids, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    if user_screen_names_ids.isdigit():
        call = 'user_id'
    else:
        call = 'screen_name'

    try:
        users = [user_screen_names_ids.strip('@')]
        twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS)
        ret = twitterCralwer.fetch_users(call=call, users=users, output_folder=output_folder)
    except Exception as exc:
        logger.error(exc)
        return 'error'

    logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME_COLLECT_USER)
    time.sleep(WAIT_TIME_COLLECT_USER)
    return ret
def main():
    tc = TwitterCrawler()
    user_ids = read_user_ids("user_withId_list.txt")
    #user_ids = user_ids[0:100]
    #print user_ids

    user_profiles = crawl_user_profiles(tc, user_ids)
    output_user_profiles(user_profiles, "user_data/sample_user_profiles.json")

    number_of_tweets_to_crawl = 200
    user_tweets = crawl_user_tweets(tc, user_ids, number_of_tweets_to_crawl)
    output_user_tweets(user_tweets, "user_data/sample_user_tweets.json")
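# read_user_ids is not shown in these snippets; main() above only assumes it returns
# the ids listed in user_withId_list.txt. A minimal sketch, under the assumption of
# one user id per line:
def read_user_ids(path):
    """Return the non-empty, stripped lines of `path` as a list of user ids."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]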
def collect_places(call, places_config_filepath, output_folder, config):
    '''
    query: places_config_filepath = {"current_ix": 0, "places": ["Gainesville, FL", "Shanghai, China"]}
    ip:    places_config_filepath = {"current_ix": 0, "places": ["74.125.19.104"]}
    '''
    apikeys = list(config['apikeys'].values()).pop()

    places_config = {}
    with open(os.path.abspath(places_config_filepath), 'r') as places_config_rf:
        places_config = json.load(places_config_rf)

    current_ix = places_config['current_ix'] if ('current_ix' in places_config) else 0

    places = places_config['places'][current_ix:]
    total = len(places)

    for place in places:
        try:
            # oauth2=False: the geo endpoints expect user-context auth rather than
            # application-only OAuth2.
            twitterCralwer = TwitterCrawler(apikeys=apikeys, oauth2=False, client_args=CLIENT_ARGS, output_folder=output_folder)
            twitterCralwer.geo_search(call=call, query=place)
            current_ix += 1  # one at a time... no choice
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            pass

        places_config['current_ix'] = current_ix
        flash_cmd_config(places_config, places_config_filepath, output_folder)

        logger.info('COMPLETED -> (current_ix: [%d/%d])' % (current_ix, total))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
    else:
        logger.info('collect_places_by_[%s] ALL COMPLETED' % (call))
def main(time_frame, max_request_per_time_frame, mongo_coll, search_params, max_id, termination_function):
    tcs = TwitterCrawler(time_frame=time_frame, max_requests=max_request_per_time_frame)
    tcs.connect(mongo_coll)
    tcs.authenticate("../api_key.json")
    tcs.set_search_arguments(search_args=search_params)
    tcs.search_by_query(wait_for=3, current_max_id=max_id, term_func=termination_function)
    tcs.close()
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import json

from twitter_crawler import TwitterCrawler
from twitter_crawler import TextAnalysis
from twitter_crawler import load_settings

load_settings('dev')

candidates = json.load(open('candidates.json'))
crawler = TwitterCrawler('candidates.json')
tom = TextAnalysis()

for candidate in candidates:
    print(" === current candidate: " + candidate["screen_name"] + " === ")
    crawler.get_candidate_info(candidate)
    print("all candidate info saved")
    crawler.get_tweets(candidate)
    print("all candidate tweets saved")
    for word in candidate["key_words"]:
        crawler.get_hash_tags(word)

print("done")
tom.get_text_analysis()
def main():
    api_config_json = open("api.json", "r")
    api_info = load(api_config_json)
    for v in api_info.values():
        if v == "None":
            print("[!!] A value in the api.json file has not been set")
            print("[**] Please enter all necessary values in the following setup process")
            setup()

    args = parse_args()

    # Credentials are taken from api.json; environment variables are an alternative:
    # api_info["consumer_key"] = environ.get("CONSUMER_KEY")
    # api_info["consumer_secret"] = environ.get("CONSUMER_SECRET")
    # api_info["access_token_key"] = environ.get("ACCESS_TOKEN_KEY")
    # api_info["access_token_secret"] = environ.get("ACCESS_TOKEN_SECRET")
    tw_crawler = TwitterCrawler(api_info["consumer_key"], api_info["consumer_secret"],
                                api_info["access_token_key"], api_info["access_token_secret"])
    #tw_crawler.get_timeline()

    c = args.count
    try:
        if c:
            if args.query:
                q = (" OR ".join(args.query))
                print("\n" + color_out("[++]", "r"), f"Search query: '{q}', Count: {c}\n")
                tweets = tw_crawler.search(count=c, to_query=q)
            else:
                print(
                    "\n" + color_out("[++]", "r"),
                    f"Search query: 'Malware OR Ransomware OR Advanced Persistent Threat OR CVE OR Cyber Threat OR Hacker', Count: {c}\n"
                )
                tweets = tw_crawler.search(count=c)
        else:
            if args.query:
                q = (" OR ".join(args.query))
                print("\n" + color_out("[++]", "r"), f"Search query: '{q}', Count: 200\n")
                tweets = tw_crawler.search(to_query=q)
            else:
                print(
                    "\n" + color_out("[++]", "r"),
                    "Search query: 'Malware OR Ransomware OR Advanced Persistent Threat OR CVE OR Cyber Threat OR Hacker', Count: 200\n"
                )
                tweets = tw_crawler.search()
    except error.TweepError:
        print("Too many requests have been made... Try again in 10-15 minutes")
        exit(1)

    tweet_dates = [t.created_on for t in tweets]
    tweet_locations = [t.location for t in tweets]
    tweet_usernames = [t.username for t in tweets]
    tweet_texts = [t.text for t in tweets]

    if args.console:
        set_option("display.max_rows", len(tweets))
        d = DataFrame({
            "Date (UTC)": tweet_dates,
            "Location": tweet_locations,
            "Username": tweet_usernames,
            "Tweet": tweet_texts
        })
        print()
        print(d)

    if args.auth_file:
        print("\n\n\n" + color_out("[**]", "g"),
              "Uploading results to the specified Google Sheet's Doc...")
        account = service_account(args.auth_file)
        sheet = account.open_by_key(args.url_key).sheet1
        row = len(sheet.col_values(1)) + 2
        for date, loc, user, tweet in zip(tweet_dates, tweet_locations, tweet_usernames, tweet_texts):
            sheet.update_cell(row, 1, str(date))
            sheet.update_cell(row, 2, loc)
            sheet.update_cell(row, 3, user)
            sheet.update_cell(row, 4, tweet)
            row += 1
            sleep(0.5)
        print(color_out("[**]", "p"),
              "Finished uploading search results to Google Sheet's Doc...")
def main(time_frame, max_request_per_time_frame, mongo_coll, search_params, termination_function):
    tcs = TwitterCrawler(time_frame=time_frame, max_requests=max_request_per_time_frame)
    tcs.connect(mongo_coll)
    tcs.authenticate("../api_key.json")
    tcs.set_search_arguments(search_args=search_params)
    tcs.stream_search(delta_t=900, termination_func=termination_function, feedback_time=900)
    tcs.close()
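# A hedged driver for the streaming entry point above. The exact shapes of mongo_coll,
# search_params, and the termination function are not defined in these snippets, so
# the values below are illustrative assumptions: search_params presumably mirrors the
# Twitter Search API, time_frame/max_requests echo the common 180-requests-per-15-minute
# search rate limit, mongo_coll is whatever TwitterCrawler.connect expects, and the
# termination function here deliberately ignores its arguments and never signals a stop.
if __name__ == "__main__":
    search_params = {"q": "#datascience", "result_type": "recent"}  # hypothetical

    def never_stop(*args, **kwargs):
        """Placeholder termination rule: keep crawling indefinitely."""
        return False

    main(time_frame=900, max_request_per_time_frame=180,
         mongo_coll="tweets", search_params=search_params,
         termination_function=never_stop)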