def get_proxy(self, url):
    """Scrape a goubanjia listing page and yield parsed proxy tuples.

    :param url: listing-page URL to fetch.
    :return: generator of (ip, port, country, anonymous, http_type,
             from_site, crawl_time) tuples.
    """
    r = requests.get(
        url,
        headers={'User-Agent': random.choice(self.user_agent)},
        timeout=30)
    html = etree.HTML(r.content)
    all_proxy = html.xpath('//table//tr[td]')
    logs.debug('-----all_proxy_num-----%s-----' % len(all_proxy))
    for i in all_proxy:
        # The site obfuscates the address by scattering it across several
        # child nodes of td[1]; join every candidate text node back together.
        ip_port = ''.join(
            i.xpath('./td[1]/span[@style]/text()|'
                    './td[1]/div[@style]/text()|'
                    './td[1]/p[@style]/text()|'
                    './td[1]/text()|'
                    './td[1]/span[@class]/text()'))
        if ip_port and ':' in ip_port:
            # Fix: split(':', 1) tolerates more than one colon in the
            # reassembled text; bare split() would raise ValueError on
            # "a:b:c" since the 2-name unpacking expects exactly 2 parts.
            ip, _ = ip_port.split(':', 1)
            # The displayed port is fake; the real one is encoded in a
            # class attribute (presumably decoded by self.get_port -- its
            # definition is outside this view).
            port_alpha = i.xpath(
                './td[1]/span[starts-with(@class, "port")]/@class')[0]
            port = self.get_port(port_alpha)
            anonymous = i.xpath('./td[2]/a/text()')[0]
            http_type = ''.join(i.xpath('./td[3]/a/text()')) or 'http'
            country = ''.join(i.xpath('./td[4]/a/text()'))
            from_site = 'goubanjia'
            crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            proxy = (ip, port, country, anonymous, http_type,
                     from_site, crawl_time)
            yield proxy
def select(self, sql):
    """Execute a SELECT statement and return every fetched row.

    :param sql: query string to run on the shared cursor.
    :return: the cursor's fetchall() result, or '' when execution fails.
    """
    try:
        self.cursor.execute(sql)
        return self.cursor.fetchall()
    except Exception as e:
        logs.debug('-----select failed-----%s' % e)
        return ''
def insert(self, sql):
    """Execute a write statement and commit it.

    :param sql: statement to run on the shared cursor.
    :return: True when execute+commit succeeded, False otherwise.
    """
    try:
        # Commit stays inside the try so a failing commit also reports False.
        self.cursor.execute(sql)
        self.db.commit()
    except Exception as e:
        logs.debug('-----insert failed-----%s' % e)
        return False
    return True
def start(self):
    """Check all freshly crawled proxies in parallel.

    Uses a process pool because the check is dominated by network I/O.

    NOTE(review): the original (Chinese) docstring warns this fails on
    some Python 2 interpreters -- presumably because bound methods are
    not picklable there; confirm the target interpreter.
    """
    candidates = self.get_proxy()
    workers = Pool()
    workers.map(self.check_proxy, candidates)
    workers.close()
    logs.debug('-----new proxy num-----%s--' % self.proxy_num)
def insert_proxy_to_mysql(self, proxy):
    """Insert one proxy tuple into the ip_t table unless it already exists.

    :param proxy: (ip, port, country, anonymous, http_type, from_site,
                   crawl_time) tuple.
    """
    if not self.check_if_exist(proxy):
        (ip, port, country, anonymous, http_type, from_site, crawl_time) = proxy
        # Hash over ip+port+http_type -- presumably the dedup key used by
        # check_if_exist; confirm against that method (defined elsewhere).
        ip_md5 = transform_md5(ip + port + http_type)
        # SECURITY NOTE(review): the SQL is assembled by string interpolation
        # from scraped, untrusted page content -- SQL-injection risk. Should
        # be migrated to a parameterized query (cursor.execute(sql, params));
        # that requires extending the insert() helper's signature.
        insert_sql = 'insert into ip_t(ip, port, country, anonymous, http_type, from_site, crawl_time, ' \
                     'ip_md5) VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s");' % \
                     (ip, port, country, anonymous, http_type, from_site, crawl_time, ip_md5)
        logs.debug('-----insert_sql-----%s' % insert_sql)
        succeed = self.insert(insert_sql)
        if not succeed:
            logs.debug('-----insert failed-----%s' % insert_sql)
def get_ip(self, url):
    """Scrape a plain-text proxy listing with a regex and yield proxy tuples.

    :param url: listing-page URL; its host becomes the from_site field.
    :return: generator of (ip, port, country, anonymous, http_type,
             from_site, crawl_time) tuples.
    """
    response = requests.get(url, headers=self.headers)
    matches = self.ip_pattern.findall(response.content)
    logs.debug('-----all ip num-----%s-----' % len(matches))
    # Loop-invariant fields hoisted out of the loop; values are unchanged.
    anonymous = 'HighAnonymous' if 'highanon' in url else 'Unknown'
    from_site = url.split('/')[2]
    for pair in matches:
        (ip, port) = pair
        if all((ip, port)):
            crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            yield (ip, port, 'Unknown', anonymous, 'http', from_site, crawl_time)
def parse_html(self, html):
    """Parse the proxydb.net listing table and yield proxy tuples.

    :param html: lxml element tree of a listing page.
    :return: generator of (ip, port, country, anonymous, http_type,
             from_site, crawl_time) tuples.
    """
    all_proxy = html.xpath('//table//tr[td]')
    logs.debug('-----all ip num-----%s-----' % len(all_proxy))
    for i in all_proxy:
        # The address is obfuscated inside an inline <script>: one regex
        # captures a part that is stored reversed ([::-1] undoes it), the
        # other captures the plain remainder.
        ip_port = i.xpath('./td[1]/script/text()')[0]
        ip = self.ip_part_1_p.findall(ip_port)[0][::-1] + self.ip_part_2_p.findall(ip_port)[0]
        # SECURITY NOTE(review): eval() executes an expression taken from
        # the scraped page -- arbitrary-code-execution risk if the site is
        # hostile or changes format. Consider a restricted arithmetic
        # parser (e.g. ast.literal_eval-style) instead.
        port = str(eval(self.port_p.findall(ip_port)[0]))
        http_type = i.xpath('./td[2]/text()')[0].strip()
        country = i.xpath('./td[3]/abbr/text()')[0]
        anonymous = i.xpath('./td[4]/span/text()')[0]
        from_site = 'proxydb.net'
        crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        proxy = (ip, port, country, anonymous, http_type, from_site, crawl_time)
        yield proxy
def get_ip(self, url):
    """Scrape the ip181 listing table and yield proxy tuples.

    :param url: listing-page URL to fetch.
    :return: generator of (ip, port, country, anonymous, http_type,
             from_site, crawl_time) tuples.
    """
    r = requests.get(url, headers=self.headers)
    html = etree.HTML(r.content)
    # Rows with class="active" are excluded -- presumably the header row.
    all_ip = html.xpath('//table//tr[not(@class="active")]')
    logs.debug('-----all ip num-----%s-----' % len(all_ip))
    for i in all_ip:
        ip = i.xpath('./td[1]/text()')[0]
        port = i.xpath('./td[2]/text()')[0]
        country = i.xpath('./td[6]/text()')[0]
        anonymous = i.xpath('./td[3]/text()')[0]
        http_type = i.xpath('./td[4]/text()')[0]
        from_site = 'ip181'
        crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Consistency fix: route the per-row trace through logs.debug like
        # every other crawler in this file, instead of a stray print().
        logs.debug('----- %s %s %s %s %s %s %s =====' % (
            ip, port, country, anonymous, http_type, from_site, crawl_time))
        proxy = (ip, port, country, anonymous, http_type, from_site, crawl_time)
        yield proxy
def get_html(self, url):
    """Download *url* with pycurl and return it parsed as an lxml tree.

    :param url: page to fetch.
    :return: lxml HTML element tree, or None on any error.
    """
    try:
        c = pycurl.Curl()
        # NOTE(review): StringIO as the curl write target implies Python 2;
        # under Python 3 pycurl writes bytes and this would need io.BytesIO
        # -- confirm the target interpreter.
        s = StringIO()
        c.setopt(c.USERAGENT, random.choice(self.user_agent))
        c.setopt(c.URL, url)
        c.setopt(c.CONNECTTIMEOUT, 15)
        c.setopt(c.TIMEOUT, 15)
        c.setopt(c.WRITEDATA, s)
        c.perform()
        # Fixed 3s pause after every fetch -- presumably crawl throttling.
        time.sleep(3)
        http_status = c.getinfo(pycurl.HTTP_CODE)
        logs.debug('-----http_status: %s-----url: %s-----' % (http_status, url))
        c.close()
        body = s.getvalue()
        html = etree.HTML(body)
        return html
    except Exception as e:
        # Best-effort: any failure (network, curl, parse) is logged and
        # collapsed into a None return.
        logs.debug('-----request error-----%s' % e)
        return None
def extract_proxy(self, page_num):
    """Scrape one cool-proxy listing page and yield proxy tuples.

    :param page_num: page number substituted into self.url via {page}.
    :return: generator of (ip, port, country, anonymous, http_type,
             from_site, crawl_time) tuples.
    """
    try:
        r = requests.get(self.url.format(page=page_num), headers=self.headers, timeout=10)
        html = etree.HTML(r.content)
        all_proxy = html.xpath('//table//tr[td]')
        logs.debug('-----all ip num-----%s-----' % len(all_proxy))
        for proxy in all_proxy:
            try:
                # The IP sits in an inline <script>, obfuscated as
                # ROT13-then-Base64 text; decode in reverse order.
                ip_base64 = self.ip_patter.findall(
                    proxy.xpath('./td[1]/script/text()')[0])[0]
                # NOTE(review): on Python 3, b64decode returns bytes, so ip
                # would be bytes while port is str -- confirm this runs on
                # Python 2 or that a later step decodes it.
                ip = base64.b64decode(codecs.decode(ip_base64, 'rot-13'))
                port = proxy.xpath('./td[2]/text()')[0]
                country = proxy.xpath('./td[4]/text()')[0]
                anonymous = proxy.xpath('./td[6]/text()')[0]
                http_type = 'http'
                from_site = 'cool-proxy'
                crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                yield (ip, port, country, anonymous, http_type, from_site, crawl_time)
            except Exception as e:
                # Per-row guard: a malformed row is skipped, not fatal.
                logs.debug('-----no ip-----%s' % e)
    except Exception as e:
        logs.debug('-----request error-----%s' % e)
def check_proxy(self, proxy):
    """Probe a proxy with a real request and persist it if it works.

    :param proxy: (ip, port, country, anonymous, http_type, from_site,
                   crawl_time) tuple; inserted into MySQL when the probe
                   returns HTTP 200 and the proxy is not already stored.
    :return: None; results are written to the database and the log.
    """
    conn = MySQLConn()
    proxy = process_data(proxy)
    (ip, port, country, anonymous, http_type, from_site, crawl_time) = proxy
    if not conn.check_if_exist(proxy):
        c = pycurl.Curl()
        try:
            c.setopt(c.URL, self.url)
            c.setopt(c.USERAGENT, random.choice(self.user_agent))
            c.setopt(
                pycurl.PROXY,
                '{http_type}://{ip}:{port}'.format(http_type=http_type, ip=ip, port=port))
            c.setopt(pycurl.CONNECTTIMEOUT, 10)
            c.setopt(pycurl.TIMEOUT, 10)
            c.perform()
            time.sleep(0.1)
            http_status = c.getinfo(pycurl.HTTP_CODE)
            if http_status == 200:
                self.proxy_num += 1
                conn.insert_proxy_to_mysql(proxy)
                logs.debug('-----proxy is useful-----%s://%s:%s----' % (http_type, ip, port))
            else:
                logs.debug('-----http_status: %s-----proxy: %s--' % (http_status, ip + ':' + port))
        except Exception as e:
            # Dead/unreachable proxies surface here; log and move on.
            logs.debug('-----proxy can not be used-----%s' % e)
        finally:
            # Fix: the curl handle previously leaked whenever perform()
            # raised (close() sat mid-try); release handle and DB
            # connection on every path.
            c.close()
            conn.close()
    else:
        conn.close()
        logs.debug('-----proxy already exists-----%s://%s:%s----' % (http_type, ip, port))
def close(self):
    """Release the cursor and then the underlying DB connection."""
    try:
        # Order matters: cursor first, then its connection.
        for resource in (self.cursor, self.db):
            resource.close()
    except Exception as e:
        logs.debug('-----close failed-----%s' % e)
def update(self, sql):
    """Execute an UPDATE statement and commit it.

    Consistency fix: report success the way the sibling insert() does, so
    callers can detect silent failures. Backward-compatible -- previous
    callers that ignored the (None) return are unaffected.

    :param sql: statement to run on the shared cursor.
    :return: True when execute+commit succeeded, False otherwise.
    """
    try:
        self.cursor.execute(sql)
        self.db.commit()
        return True
    except Exception as e:
        logs.debug('-----update failed-----%s' % e)
        return False