def test_rate_limit(self):
    from tweetf0rm.proxies import proxy_checker

    proxy_list = proxy_checker(self.proxies['proxies'])

    ps = []
    for i, twitter_user in enumerate(self.config['apikeys']):
        apikeys = self.config['apikeys'][twitter_user]
        client_args = {
            "timeout": 300,
            "proxies": {'http': '203.156.207.249:8080'}  # proxy_list[i]['proxy_dict']
        }
        logger.info(client_args)
        p = mp.Process(target=call_user_api, args=(apikeys, client_args))
        ps.append(p)
        p.start()

    for p in ps:
        p.join()

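# The test above spawns one process per API key via call_user_api, which is not
# shown in this excerpt. Below is a minimal, hypothetical sketch of such a
# worker, assuming the crawler talks to Twitter through Twython; the apikeys
# key names, the function name, and the rate-limit call are illustrative
# assumptions, not the project's actual implementation.
from twython import Twython

def call_user_api_sketch(apikeys, client_args):
    # client_args (timeout, proxies) is forwarded by Twython to requests
    twitter = Twython(apikeys['app_key'], apikeys['app_secret'],
                      apikeys['oauth_token'], apikeys['oauth_token_secret'],
                      client_args=client_args)
    # application/rate_limit_status reports the remaining calls per endpoint
    print(twitter.get_application_rate_limit_status())
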
def __init__(self, node_id, config={}, proxies=[]):
    self.node_id = node_id
    self.config = config

    if proxies and len(proxies) > 0:
        self.proxy_list = proxy_checker(proxies)
        logger.info("number of live proxies: %d" % (len(self.proxy_list)))

        # each process only gets one apikey; if there are more proxies than apikeys,
        # each process can get more than one proxy, rotated when one fails
        number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))

        # if there are more proxies than apikeys, each process gets a list of proxies;
        # the process restarts itself if a proxy fails and tries the next available one
        self.proxy_generator = self.split(self.proxy_list, number_of_processes)
    else:
        self.proxy_list = None
        self.proxy_generator = None
        number_of_processes = 1

    logger.info("number of crawlers: %d" % (number_of_processes))

    apikey_list = list(self.config['apikeys'].keys())  # list() so it can be indexed on Python 3
    self.crawlers = {}
    for idx in range(number_of_processes):
        try:
            self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
        except Exception as exc:
            logger.error(exc)

    self.node_coordinator = NodeCoordinator(config['redis_config'])
    self.node_coordinator.add_node(node_id)
    logger.info("number of crawlers: %d created" % (number_of_processes))

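# self.split() is referenced above but its body is not shown in this excerpt.
# A plausible sketch, assuming it simply deals the live proxies round-robin
# into one bucket per crawler process and yields each bucket; in the project it
# would be a method on the crawler manager, and the behavior here is inferred,
# not taken from the project source.
def split(proxy_list, n):
    buckets = [[] for _ in range(n)]
    for i, proxy in enumerate(proxy_list):
        buckets[i % n].append(proxy)
    for bucket in buckets:
        yield bucket
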
def test_proxy(self):
    proxies = proxy_checker(self.proxies['proxies'])
    # logger.info(proxies)
    logger.info('%d good proxies left' % len(proxies))

urls = re.findall(r"<a href='(/en/http-proxy-list/\d+/.*?)'>", html)
urls = set(urls)
urls.add('/en/http-proxy-list/')

proxies = []
for url in urls:
    proxies.extend(crawl_spys_ru(url))

# if there is a proxies.json locally, merge the check results rather than overwrite it
if os.path.exists(os.path.abspath(args.output)):
    with open(os.path.abspath(args.output), 'r') as proxy_f:
        proxies.extend(json.load(proxy_f)['proxies'])

# de-duplicate by IP before re-checking; each entry is a single-key dict
ips = []
proxy_list = []
for proxy in proxies:
    ip, proxy_type = next(iter(proxy.items()))  # works on Python 2 and 3
    if ip not in ips:
        ips.append(ip)
        proxy_list.append({ip: proxy_type})

proxies = [p['proxy'] for p in proxy_checker(proxy_list)]

logger.info("number of proxies that are still alive: %d" % len(proxies))

with open(os.path.abspath(args.output), 'w') as proxy_f:
    json.dump({'proxies': proxies}, proxy_f)

for i in range(5):
    proxies.extend(crawl_spys_ru(i))

# if there is a proxies.json locally, merge the check results rather than overwrite it
if os.path.exists(os.path.abspath(args.output)):
    with open(os.path.abspath(args.output), 'r') as proxy_f:
        proxies.extend(json.load(proxy_f)['proxies'])

# de-duplicate by IP before re-checking; each entry is a single-key dict
ips = []
proxy_list = []
for proxy in proxies:
    ip, proxy_type = next(iter(proxy.items()))  # works on Python 2 and 3
    if ip not in ips:
        ips.append(ip)
        proxy_list.append({ip: proxy_type})

proxies = [p['proxy'] for p in proxy_checker(proxy_list)]

logger.info("number of proxies that are still alive: %d" % len(proxies))

with open(os.path.abspath(args.output), 'w') as proxy_f:
    json.dump({'proxies': proxies}, proxy_f)

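# Worked example of the de-duplication step above. Each proxy entry is assumed
# to be a single-key dict mapping an "ip:port" string to the proxy type
# reported by the source page; the exact key format is an assumption based on
# the call sites in this file.
sample = [{'1.2.3.4:8080': 'HTTP'}, {'1.2.3.4:8080': 'HTTP'}, {'5.6.7.8:3128': 'HTTPS'}]
seen, deduped = set(), []
for entry in sample:
    ip, proxy_type = next(iter(entry.items()))
    if ip not in seen:
        seen.add(ip)
        deduped.append({ip: proxy_type})
# deduped is now [{'1.2.3.4:8080': 'HTTP'}, {'5.6.7.8:3128': 'HTTPS'}]
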
import logging  # assumed to be imported near the top of the original file

logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s-[%(asctime)s][%(module)s][%(funcName)s][%(lineno)d]: %(message)s')
logger = logging.getLogger(__name__)  # logger is used below but not defined in the excerpt

requests_log = logging.getLogger("requests")
requests_log.setLevel(logging.WARNING)

import argparse, pickle, os, json, sys, time

sys.path.append("..")
from tweetf0rm.proxies import proxy_checker

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--proxies', help="define the location of the output;", default="proxies.json")
    args = parser.parse_args()

    with open(os.path.abspath(args.proxies), 'r') as proxy_f:
        proxies = json.load(proxy_f)['proxies']

    # re-check the stored proxies and keep only the ones that still respond
    proxies = [proxy['proxy'] for proxy in proxy_checker(proxies)]
    logger.info('%d live proxies left' % (len(proxies)))

    with open(os.path.abspath(args.proxies), 'w') as proxy_f:
        json.dump({'proxies': proxies}, proxy_f)

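# proxy_checker is imported from tweetf0rm.proxies throughout this file, but
# its body is not shown. Below is an illustrative sketch of what such a checker
# could look like, inferred only from the call sites (each returned item
# carries the original entry under 'proxy' and a requests-style mapping under
# 'proxy_dict'); this is not the project's actual implementation, and the test
# URL and timeout are arbitrary assumptions.
import requests

def proxy_checker_sketch(proxies, test_url='https://api.twitter.com/1.1/', timeout=10):
    live = []
    for proxy in proxies:
        ip = next(iter(proxy))  # entries are single-key dicts keyed by "ip:port"
        proxy_dict = {'http': ip, 'https': ip}
        try:
            requests.get(test_url, proxies=proxy_dict, timeout=timeout)
            live.append({'proxy': proxy, 'proxy_dict': proxy_dict})
        except requests.RequestException:
            continue
    return live
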