Example #1
def get_proxy(self, url):
    r = requests.get(
        url,
        headers={'User-Agent': random.choice(self.user_agent)},
        timeout=30)
    html = etree.HTML(r.content)

    all_proxy = html.xpath('//table//tr[td]')
    logs.debug('-----all_proxy_num-----%s-----' % len(all_proxy))
    for i in all_proxy:
        # goubanjia scatters the address across differently styled child
        # nodes; collect every candidate text node and join them
        ip_port = ''.join(
            i.xpath('./td[1]/span[@style]/text()|'
                    './td[1]/div[@style]/text()|'
                    './td[1]/p[@style]/text()|'
                    './td[1]/text()|'
                    './td[1]/span[@class]/text()'))
        if ip_port and ':' in ip_port:
            ip, _ = ip_port.split(':', 1)
            # the real port is encoded in the 'port XXXX' class attribute
            # and decoded by get_port(); the visible port text is not used
            port_alpha = i.xpath(
                './td[1]/span[starts-with(@class, "port")]/@class')[0]
            port = self.get_port(port_alpha)
            anonymous = i.xpath('./td[2]/a/text()')[0]
            http_type = ''.join(i.xpath('./td[3]/a/text()')) or 'http'
            country = ''.join(i.xpath('./td[4]/a/text()'))
            from_site = 'goubanjia'
            crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            proxy = (ip, port, country, anonymous, http_type, from_site,
                     crawl_time)
            yield proxy
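
self.get_port is not shown in this excerpt. Crawler write-ups for goubanjia commonly describe the decoding as: map each letter of the 'port XXXX' class through the string "ABCDEFGHIZ", join the resulting digits, then divide by 8. A minimal sketch under that assumption (not the repository's actual helper):

def get_port(port_alpha):
    # e.g. port_alpha == 'port CFACM'; only the letter part matters
    letters = port_alpha.split()[-1]
    # each letter's index in "ABCDEFGHIZ" is one digit of an intermediate
    # number; the real port is that number divided by 8 (assumed scheme)
    digits = ''.join(str('ABCDEFGHIZ'.find(ch)) for ch in letters)
    return str(int(digits) // 8)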
Example #2
def select(self, sql):
    try:
        self.cursor.execute(sql)
        data = self.cursor.fetchall()
        return data
    except Exception as e:
        logs.debug('-----select failed-----%s' % e)
        # falsy, iterable sentinel so callers can still loop over the result
        return ''
Example #3
def insert(self, sql):
    try:
        self.cursor.execute(sql)
        self.db.commit()
        return True
    except Exception as e:
        # roll back the failed transaction so the connection stays usable
        self.db.rollback()
        logs.debug('-----insert failed-----%s' % e)
        return False
Example #4
def start(self):
    """
    Check the proxies in a process pool, since the job is dominated by
    network I/O. Careful: there is a pitfall here. On some Python 2
    builds this raises, because multiprocessing.Pool.map cannot pickle
    the bound method self.check_proxy (a workaround sketch follows this
    example).
    """
    ip_list = self.get_proxy()
    pool = Pool()
    pool.map(self.check_proxy, ip_list)
    pool.close()
    pool.join()
    # note: self.proxy_num is incremented inside the worker processes,
    # so the parent's counter logged here does not see those updates
    logs.debug('-----new proxy num-----%s--' % self.proxy_num)
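
The Python 2 failure comes from pickling: Pool.map serializes its callable, and Python 2 cannot pickle bound methods. A minimal sketch of the usual workaround, routing the call through a module-level function (the names here are assumed, not from the repository):

from multiprocessing import Pool

def _check_proxy(args):
    # a plain module-level function, which Python 2 can pickle
    # (the checker instance itself must still be picklable)
    checker, proxy = args
    return checker.check_proxy(proxy)

def start(self):
    ip_list = self.get_proxy()
    pool = Pool()
    pool.map(_check_proxy, [(self, proxy) for proxy in ip_list])
    pool.close()
    pool.join()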
Example #5
def insert_proxy_to_mysql(self, proxy):
    if not self.check_if_exist(proxy):
        (ip, port, country, anonymous, http_type, from_site, crawl_time) = proxy
        # md5 over ip + port + http_type serves as the dedupe key
        ip_md5 = transform_md5(ip + port + http_type)
        # note: building SQL with % interpolation is fragile and open to
        # injection; a parameterized variant is sketched below
        insert_sql = 'insert into ip_t(ip, port, country, anonymous, http_type, from_site, crawl_time, ' \
                     'ip_md5) VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s");' % \
                     (ip, port, country, anonymous, http_type, from_site, crawl_time, ip_md5)
        logs.debug('-----insert_sql-----%s' % insert_sql)
        succeed = self.insert(insert_sql)
        if not succeed:
            logs.debug('-----insert failed-----%s' % insert_sql)
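
A safer variant of the same insert, assuming self.cursor and self.db are the MySQLdb/pymysql-style DB-API objects MySQLConn uses elsewhere; %s placeholders let the driver handle quoting and escaping:

def insert_proxy_to_mysql(self, proxy):
    if self.check_if_exist(proxy):
        return
    (ip, port, country, anonymous, http_type, from_site, crawl_time) = proxy
    ip_md5 = transform_md5(ip + port + http_type)
    sql = ('insert into ip_t(ip, port, country, anonymous, http_type, '
           'from_site, crawl_time, ip_md5) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s, %s);')
    try:
        # the driver substitutes and escapes the values itself
        self.cursor.execute(sql, proxy + (ip_md5,))
        self.db.commit()
    except Exception as e:
        self.db.rollback()
        logs.debug('-----insert failed-----%s' % e)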
Example #6
def get_ip(self, url):
    r = requests.get(url, headers=self.headers)
    # run the precompiled ip:port regex over the decoded page text
    # (r.text keeps a str pattern working on both Python 2 and 3)
    all_ip_port = self.ip_pattern.findall(r.text)
    logs.debug('-----all ip num-----%s-----' % len(all_ip_port))
    for ip_port in all_ip_port:
        (ip, port) = ip_port
        if all((ip, port)):
            http_type = 'http'
            anonymous = 'HighAnonymous' if 'highanon' in url else 'Unknown'
            country = 'Unknown'
            from_site = url.split('/')[2]  # the host part of the url
            crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            yield (ip, port, country, anonymous, http_type, from_site,
                   crawl_time)
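
self.ip_pattern is defined elsewhere in the class; a plausible definition (an assumption for illustration) that yields (ip, port) pairs would be:

import re

ip_pattern = re.compile(r'(\d{1,3}(?:\.\d{1,3}){3}):(\d{2,5})')
print(ip_pattern.findall('1.2.3.4:8080 text 5.6.7.8:3128'))
# [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]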
Example #7
def parse_html(self, html):
    all_proxy = html.xpath('//table//tr[td]')
    logs.debug('-----all ip num-----%s-----' % len(all_proxy))
    for i in all_proxy:
        # proxydb hides the address inside an inline <script>
        ip_port = i.xpath('./td[1]/script/text()')[0]

        # the first ip fragment is embedded reversed, the second is plain;
        # stitch the two regex captures back together
        ip = self.ip_part_1_p.findall(ip_port)[0][::-1] + self.ip_part_2_p.findall(ip_port)[0]
        # the captured port expression is evaluated to a number
        port = str(eval(self.port_p.findall(ip_port)[0]))

        http_type = i.xpath('./td[2]/text()')[0].strip()
        country = i.xpath('./td[3]/abbr/text()')[0]
        anonymous = i.xpath('./td[4]/span/text()')[0]
        from_site = 'proxydb.net'
        crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        proxy = (ip, port, country, anonymous, http_type, from_site, crawl_time)
        yield proxy
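
Because eval() runs whatever the regex captured, it is worth constraining the input first. A hypothetical illustration (the expression below is invented, not real proxydb markup):

import re

expr = '7*1000+9*100+7*10+0'            # made-up obfuscated port expression
if re.match(r'^[\d+*()\s-]+$', expr):   # allow only digits and arithmetic
    port = str(eval(expr))
    print(port)                         # '7970'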
Example #8
def get_ip(self, url):
    r = requests.get(url, headers=self.headers)
    html = etree.HTML(r.content)
    # keep data rows (drop rows carrying class="active")
    all_ip = html.xpath('//table//tr[not(@class="active")]')
    logs.debug('-----all ip num-----%s-----' % len(all_ip))
    for i in all_ip:
        ip = i.xpath('./td[1]/text()')[0]
        port = i.xpath('./td[2]/text()')[0]
        country = i.xpath('./td[6]/text()')[0]
        anonymous = i.xpath('./td[3]/text()')[0]
        http_type = i.xpath('./td[4]/text()')[0]
        from_site = 'ip181'
        crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        proxy = (ip, port, country, anonymous, http_type, from_site,
                 crawl_time)
        # log instead of a bare print, matching the rest of the module
        logs.debug('-----%s-----' % str(proxy))
        yield proxy
Example #9
def get_html(self, url):
    try:
        c = pycurl.Curl()
        s = StringIO()  # Python 2; under Python 3 use io.BytesIO instead
        c.setopt(c.USERAGENT, random.choice(self.user_agent))
        c.setopt(c.URL, url)
        c.setopt(c.CONNECTTIMEOUT, 15)
        c.setopt(c.TIMEOUT, 15)
        c.setopt(c.WRITEDATA, s)  # collect the response body in the buffer
        c.perform()
        time.sleep(3)  # throttle between requests
        http_status = c.getinfo(pycurl.HTTP_CODE)
        logs.debug('-----http_status: %s-----url: %s-----' % (http_status, url))
        c.close()
        body = s.getvalue()
        html = etree.HTML(body)
        return html
    except Exception as e:
        logs.debug('-----request error-----%s' % e)
        return None
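
Under Python 3 the same fetch needs a bytes buffer, since pycurl writes raw bytes; a minimal sketch:

import pycurl
from io import BytesIO

buf = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://example.com')  # placeholder url
c.setopt(c.WRITEDATA, buf)
c.perform()
c.close()
body = buf.getvalue()  # response body as bytes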
Example #10
def extract_proxy(self, page_num):
    try:
        r = requests.get(self.url.format(page=page_num),
                         headers=self.headers,
                         timeout=10)
        html = etree.HTML(r.content)
        all_proxy = html.xpath('//table//tr[td]')
        logs.debug('-----all ip num-----%s-----' % len(all_proxy))
        for proxy in all_proxy:
            try:
                # the page stores rot13(base64(ip)) inside a <script> tag;
                # undo the rot13 layer first, then the base64 layer
                ip_base64 = self.ip_patter.findall(
                    proxy.xpath('./td[1]/script/text()')[0])[0]
                ip = base64.b64decode(codecs.decode(ip_base64, 'rot-13'))
                port = proxy.xpath('./td[2]/text()')[0]
                country = proxy.xpath('./td[4]/text()')[0]
                anonymous = proxy.xpath('./td[6]/text()')[0]
                http_type = 'http'
                from_site = 'cool-proxy'
                crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                yield (ip, port, country, anonymous, http_type, from_site,
                       crawl_time)
            except Exception as e:
                logs.debug('-----no ip-----%s' % e)
    except Exception as e:
        logs.debug('-----request error-----%s' % e)
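
A quick round-trip of that decoding with an invented sample (not real cool-proxy data):

import base64
import codecs

encoded = codecs.encode(base64.b64encode(b'127.0.0.1').decode(), 'rot-13')
print(encoded)                                             # rot13(base64(ip))
print(base64.b64decode(codecs.decode(encoded, 'rot-13')))  # b'127.0.0.1'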
Example #11
def check_proxy(self, proxy):
    """
    Verify that the proxy works by sending a request through it.
    :param proxy: the proxy tuple to test.
    :return: nothing; working proxies are written to the database.
    """
    conn = MySQLConn()
    proxy = process_data(proxy)
    (ip, port, country, anonymous, http_type, from_site,
     crawl_time) = proxy
    if not conn.check_if_exist(proxy):
        try:
            c = pycurl.Curl()
            c.setopt(c.URL, self.url)
            c.setopt(c.USERAGENT, random.choice(self.user_agent))
            # route the request through the candidate proxy
            c.setopt(
                pycurl.PROXY,
                '{http_type}://{ip}:{port}'.format(http_type=http_type,
                                                   ip=ip,
                                                   port=port))
            c.setopt(pycurl.CONNECTTIMEOUT, 10)
            c.setopt(pycurl.TIMEOUT, 10)
            c.perform()
            time.sleep(0.1)
            http_status = c.getinfo(pycurl.HTTP_CODE)
            c.close()
            if http_status == 200:
                self.proxy_num += 1
                conn.insert_proxy_to_mysql(proxy)
                logs.debug('-----proxy is useful-----%s://%s:%s----' %
                           (http_type, ip, port))
            else:
                logs.debug('-----http_status: %s-----proxy: %s--' %
                           (http_status, ip + ':' + port))
        except Exception as e:
            logs.debug('-----proxy can not be used-----%s' % e)
        finally:
            conn.close()
    else:
        conn.close()
        logs.debug('-----proxy already exists-----%s://%s:%s----' %
                   (http_type, ip, port))
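
transform_md5 and check_if_exist are referenced but not part of these excerpts. A plausible pair consistent with the ip_md5 column used above (assumptions for illustration, not the repository's code):

import hashlib

def transform_md5(s):
    # stable dedupe key over ip + port + http_type (assumed helper)
    return hashlib.md5(s.encode('utf-8')).hexdigest()

def check_if_exist(self, proxy):
    # assumed method on MySQLConn, reusing select() from Example #2
    (ip, port, country, anonymous, http_type,
     from_site, crawl_time) = proxy
    ip_md5 = transform_md5(ip + port + http_type)
    rows = self.select('select 1 from ip_t where ip_md5 = "%s";' % ip_md5)
    return bool(rows)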
Example #12
def close(self):
    try:
        self.cursor.close()
        self.db.close()
    except Exception as e:
        logs.debug('-----close failed-----%s' % e)
Example #13
def update(self, sql):
    try:
        self.cursor.execute(sql)
        self.db.commit()
        return True
    except Exception as e:
        # roll back so a failed statement does not block the connection
        self.db.rollback()
        logs.debug('-----update failed-----%s' % e)
        return False