def load(self) -> list:
    ls = []
    if self._num is None:
        return ls
    if self._context and self._context.logger:
        self._context.logger.info('SixSixIPProxySpider: loading proxy list.')
    url = SixSixIPProxySpider._POOL_URL.format(self._num)
    # Each entry on the page looks like "1.2.3.4:8080<br />".
    reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
    try:
        res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
        for match in reg.finditer(res.text):
            try:
                # The source lists bare addresses, so register each one for both protocols.
                for protocol in ('http', 'https'):
                    proxy = Proxy()
                    proxy.ip = match.group(1)
                    proxy.port = match.group(2)
                    proxy.protocol = protocol
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
            except Exception:
                pass
        return ls
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('SixSixIPProxySpider: failed to load proxy list.')
        raise
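
# For reference: a standalone check of the extraction regex above against an
# illustrative "ip:port<br />" payload (the sample text is made up, not taken
# from the actual site).
import re

sample = 'Free proxies:<br />58.22.177.10:9999<br />183.146.213.198:80<br />'
reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
for m in reg.finditer(sample):
    print(m.group(1), m.group(2))
# -> 58.22.177.10 9999
# -> 183.146.213.198 80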
def filte(content):
    # Parse proxy rows out of a table: column 2 = IP, 3 = port, 4 = location,
    # 5 = anonymity level, 6 = protocol.
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for row in proxy_list_info:
        td_index = 0
        proxy_tds = row.findAll('td')
        has_get = False
        proxy = Proxy()
        for proxy_td in proxy_tds:
            td_index += 1
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()
        if has_get:
            proxy_list.append(proxy)
    return proxy_list
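
# A minimal sketch of how filte() might be driven, assuming the listing page
# is fetched with requests; PAGE_URL is a placeholder, not the spider's real
# source address.
import requests

PAGE_URL = 'http://example.com/free-proxy-list'

resp = requests.get(PAGE_URL, timeout=10)
resp.encoding = resp.apparent_encoding  # many of these listing pages are not UTF-8
for proxy in filte(resp.text):
    print(proxy.ip, proxy.port, proxy.proxy_type)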
def load(self) -> list:
    ls = []
    if self._context and self._context.logger:
        self._context.logger.info('FatezeroProxySpider: loading proxy list.')
    try:
        res = requests.get(FatezeroProxySpider._POOL_URL,
                           proxies=self._sys_proxy, timeout=self._timeout)
        # The endpoint returns one JSON object per line.
        for text in res.text.split('\n'):
            try:
                p = json.loads(text)
                proxy = Proxy()
                proxy.ip = p['host']
                proxy.port = p['port']
                proxy.protocol = p['type']
                proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                proxy.collect_time = Datetime.now()
                proxy.local = Config.local
                ls.append(proxy)
            except Exception:
                pass
        if self._num is None:
            return ls
        return ls[:self._num]
    except Exception:
        if self._context and self._context.logger:
            self._context.logger.exception('FatezeroProxySpider: failed to load proxy list.')
        raise
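
# The pool endpoint serves one JSON object per line; the parsing step above
# can be exercised with a made-up sample line (field values are illustrative).
import json

sample_line = '{"host": "58.22.177.10", "port": 9999, "type": "http"}'
p = json.loads(sample_line)
print(p['host'], p['port'], p['type'])  # -> 58.22.177.10 9999 http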
async def process_raw_proxy(self, proxy, collector_id):
    self.logger.debug("processing raw proxy \"{}\"".format(proxy))

    try:
        _, auth_data, domain, port = proxy_validator.retrieve(proxy)
    except proxy_validator.ValidationError as ex:
        self.collectors_logger.error(
            "Collector with id \"{}\" returned bad raw proxy \"{}\". "
            "Message: {}".format(collector_id, proxy, ex)
        )
        return

    # don't care about protocol
    try:
        proxy = await db.get(
            Proxy.select().where(
                Proxy.auth_data == auth_data,
                Proxy.domain == domain,
                Proxy.port == port,
            )
        )

        if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
            proxy_short_address = ""
            if auth_data:
                proxy_short_address += auth_data + "@"
            proxy_short_address += "{}:{}".format(domain, port)

            self.logger.debug(
                "skipping proxy \"{}\" from collector \"{}\"".format(
                    proxy_short_address, collector_id)
            )
            return
    except Proxy.DoesNotExist:
        pass

    for raw_protocol in range(len(Proxy.PROTOCOLS)):
        while not self.good_proxies_are_processed:
            # TODO: find a better way
            await asyncio.sleep(0.1)

        new_proxy = Proxy()
        new_proxy.raw_protocol = raw_protocol
        new_proxy.auth_data = auth_data
        new_proxy.domain = domain
        new_proxy.port = port

        await self.add_proxy_to_queue(new_proxy, collector_id)
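
# proxy_validator.retrieve() is used above only through its 4-tuple result
# (protocol, auth_data, domain, port) and its ValidationError. The stand-in
# below is hypothetical, written solely to illustrate that contract; it is
# not the project's actual parser.
import re


class ValidationError(Exception):
    """Raised for raw proxy strings that cannot be parsed."""


_PROXY_RE = re.compile(
    r'^(?:(?P<protocol>[a-z0-9]+)://)?'    # optional scheme, e.g. "socks5://"
    r'(?:(?P<auth_data>[^@\s]+)@)?'        # optional "user:password@"
    r'(?P<domain>[^\s:@/]+):(?P<port>\d{1,5})$'
)


def retrieve(raw_proxy):
    match = _PROXY_RE.match(raw_proxy.strip())
    if match is None:
        raise ValidationError('unparsable raw proxy: {!r}'.format(raw_proxy))
    return (
        match.group('protocol'),
        match.group('auth_data') or '',
        match.group('domain'),
        int(match.group('port')),
    )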
def get_ips(self):
    for pat in pats:
        objs = []
        headers = {
            'content-type': 'application/json',
            'User-Agent': random.choice(agents)
        }
        for i in range(100):
            # Route each request through a random proxy already in the pool.
            ip_obj = self.session.query(Proxy).order_by(func.random()).first()
            proxies = {
                '{type}'.format(type=ip_obj.type):
                    '{type}://{ip}:{port}'.format(type=ip_obj.type, ip=ip_obj.ip, port=ip_obj.port)
            }
            url = '{base_url}{pat}{page}'.format(base_url=base_url, pat=pat, page=i)
            logger.info('Scraping {url}'.format(url=url))
            try:
                response = requests.get(url, headers=headers, proxies=proxies)
                if response.status_code == 200:
                    selector = etree.HTML(response.text)
                    for line in selector.xpath('//table[@id="ip_list"]//tr[@class="odd"]'):
                        proxy_obj = Proxy()
                        proxy_obj.id = str(uuid.uuid1())
                        proxy_obj.ip = line.xpath('td')[1].xpath('text()')[0]
                        proxy_obj.port = line.xpath('td')[2].xpath('text()')[0]
                        proxy_obj.type = str(
                            line.xpath('td')[5].xpath('text()')[0]
                        ).lower().replace('https', 'http')
                        objs.append(proxy_obj)
            except Exception:
                pass
        self._threads_check(objs)
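
# The XPath extraction above can be exercised in isolation; the HTML fragment
# below is a made-up sample mimicking the "ip_list" table layout the spider
# expects (td[1] = IP, td[2] = port, td[5] = protocol).
from lxml import etree

sample_html = '''
<table id="ip_list">
  <tr class="odd">
    <td><img src="flag.png"/></td>
    <td>117.90.1.88</td>
    <td>9000</td>
    <td>Jiangsu</td>
    <td>high anonymity</td>
    <td>HTTPS</td>
  </tr>
</table>
'''

selector = etree.HTML(sample_html)
for line in selector.xpath('//table[@id="ip_list"]//tr[@class="odd"]'):
    ip = line.xpath('td')[1].xpath('text()')[0]
    port = line.xpath('td')[2].xpath('text()')[0]
    ptype = str(line.xpath('td')[5].xpath('text()')[0]).lower().replace('https', 'http')
    print(ip, port, ptype)  # -> 117.90.1.88 9000 http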
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        # Only the third table on the page holds the proxy rows.
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for row in proxy_list_info:
                td_index = 0
                proxy_tds = row.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for proxy_td in proxy_tds:
                    td_index += 1
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            # Map the site's labels to two categories:
                            # '高匿' (high anonymity) vs '透明' (transparent).
                            if proxy_td.text in ('匿名代理', '高度匿名'):
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                            proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
# Tail of the preceding checker (check_anonymous, called in __main__ below);
# the beginning of its body is truncated above.
        return (True, (time.time() - time1) * 1000)
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == '__main__':
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
if len(sys.argv) < 2:
    print("Usage: python3 {} RESULT_FILE".format(
        sys.argv[0] if len(sys.argv) == 1 else "program"))
    exit(1)

last_check_time = int(time.time())

i = 0
with open(sys.argv[1], 'r') as file:
    for line in file:
        try:
            print("line {}".format(i))
            # Each line is a base64-encoded JSON object carrying the proxy's
            # protocol, domain and port.
            json_proxy = json.loads(base64.b64decode(line.encode()).decode())
            # print(json_proxy)
            proxy = Proxy()
            proxy.raw_protocol = Proxy.PROTOCOLS.index(json_proxy['protocol'])
            proxy.auth_data = ""
            proxy.domain = json_proxy['domain']
            proxy.port = json_proxy['port']
            proxy.last_check_time = last_check_time
            last_check_time += 1
            proxy.number_of_bad_checks = settings.REMOVE_ON_N_BAD_CHECKS - 5
            session.add(proxy)
            session.commit()
        except sqlalchemy.exc.IntegrityError:
            session.rollback()
            print("proxy {} exists".format(proxy))
        i += 1
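
# For context, a RESULT_FILE line in the format this script consumes can be
# produced like so (the record values are illustrative).
import base64
import json

record = {'protocol': 'http', 'domain': '58.22.177.10', 'port': 9999}
line = base64.b64encode(json.dumps(record).encode()).decode()
with open('result.txt', 'a') as out:
    out.write(line + '\n')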