示例#1
0
    def get_proxies(self):
        """Scrape the proxy listing page and return its HTTP/HTTPS proxies.

        The page is fetched through the parent class's ``get_proxies`` and
        parsed with lxml. Every ``<ul class="l2">`` element is treated as one
        proxy row; rows whose type is ``http`` or ``https`` become ``Proxy``
        objects.

        Returns:
            list: ``Proxy`` objects for the rows that parsed successfully.
            Rows that raise while being parsed are logged at debug level and
            skipped (best effort).
        """
        # Load the logging configuration.
        get_log_config()

        proxy_model_list = []

        print('正在爬取无忧代理……')

        response = super(Data5uSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//ul[@class="l2"]')

        for info in infos:
            try:
                # Query relative to the current row ("./"). An absolute
                # "//ul[...]" path evaluated on a row node still searches the
                # whole document, which forced indexing by row number, cost a
                # full document scan per field, and mis-paired fields when a
                # row lacked a node.
                ip = info.xpath('./span[1]/li/text()')[0]  # ip
                port = info.xpath('./span[2]/li/text()')[0]  # port
                anonymity = info.xpath('./span[3]/li/a/text()')[0]  # anonymity
                http_type = info.xpath('./span[4]/li/a/text()')[0]  # type
                # Area is the province link followed by the city link.
                area = (info.xpath('./span[6]/li/a[1]/text()')[0] +
                        info.xpath('./span[6]/li/a[2]/text()')[0])
                speed = info.xpath('./span[8]/li/text()')[0]  # speed

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | ")

                # Only plain http/https proxies are kept.
                if http_type in ('http', 'https'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_http_type(http_type)
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    # This listing does not provide a survival time.
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                # Best effort: a malformed row must not abort the whole crawl.
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
示例#2
0
    def get_proxies(self):
        """Scrape the proxy listing page and return its HTTP/HTTPS proxies.

        The page is fetched through the parent class's ``get_proxies`` and
        scanned with a regular expression that matches ``<tr>`` rows holding
        seven ``<td>`` cells. The first six cells are used, in order: ip,
        port, anonymity, type, area, speed.

        Returns:
            list: ``Proxy`` objects for the rows whose type is ``HTTP`` or
            ``HTTPS`` (stored lower-cased). Rows that raise while being
            processed are logged at debug level and skipped.
        """
        # Load the logging configuration.
        get_log_config()

        proxy_model_list = []

        print('正在爬取快代理……')

        response = super(KuaidailiSpider, self).get_proxies()

        # Raw strings: "\s" in a normal string literal is an invalid escape
        # sequence and triggers a warning on modern Python. The pattern text
        # itself is unchanged.
        pattern = re.compile(
            r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>('
            r'.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
            re.S)

        infos = pattern.findall(response.text)

        for item in infos:
            try:
                # Each match is a 7-tuple; the seventh cell is not used.
                ip, port, anonymity, http_type, area, speed = item[:6]

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed)

                if http_type in ('HTTP', 'HTTPS'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    # The type is stored in lower case.
                    proxy.set_http_type(http_type.lower())
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    # This listing does not provide a survival time.
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                # Best effort: a bad row must not abort the whole crawl.
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
示例#3
0
    def get_proxies(self):
        """Scrape the proxy listing page and return one ``Proxy`` per row.

        The page is fetched through the parent class's ``get_proxies`` and
        parsed with lxml. Every ``<tr class="odd">`` element is treated as
        one proxy row.

        Returns:
            list: ``Proxy`` objects for the rows that parsed successfully.
            Rows that raise while being parsed are logged at debug level and
            skipped (best effort).
        """
        # Load the logging configuration.
        get_log_config()

        proxy_model_list = []

        print('正在爬取西刺代理……')

        response = super(XiciSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//tr[@class="odd"]')

        for info in infos:
            try:
                ip = info.xpath('./td[2]/text()')[0]  # ip
                port = info.xpath('./td[3]/text()')[0]  # port
                anonymity = info.xpath('./td[5]/text()')[0]  # anonymity
                http_type = info.xpath('./td[6]/text()')[0]  # type
                # Handle an empty area: a row without an area link yields an
                # empty node list, so fall back to '' instead of indexing
                # (which would raise IndexError and drop the whole row).
                area_nodes = info.xpath('./td[4]/a/text()')
                area = area_nodes[0] if area_nodes else ''
                speed = info.xpath('./td[7]/div/@title')[0]  # speed
                survival_time = info.xpath('./td[9]/text()')[0]  # survival time

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | " +
                      survival_time)

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time(survival_time)
                proxy_model_list.append(proxy)

            except Exception as e:
                # Best effort: a malformed row must not abort the whole crawl.
                logging.debug(e)

        logging.debug("抓取 " + self.agent + " 网站共计 " +
                      str(len(proxy_model_list)) + " 个代理")

        return proxy_model_list
示例#4
0
    def get_proxies(self):
        """Scrape the proxy listing page and return one ``Proxy`` per row.

        The page is fetched through the parent class's ``get_proxies`` and
        parsed with lxml. Every ``<tr class="odd">`` element is treated as
        one proxy row.

        Returns:
            list: ``Proxy`` objects for the rows that parsed successfully.
            Rows that raise while being parsed are logged at debug level and
            skipped (best effort).
        """
        get_log_config()

        proxy_model_list = []

        print('正在爬取西刺代理......')

        response = super(XiciSpider, self).get_proxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//tr[@class="odd"]')

        for info in infos:
            try:
                ip = info.xpath('./td[2]/text()')[0]
                port = info.xpath('./td[3]/text()')[0]
                anonymity = info.xpath('./td[5]/text()')[0]
                http_type = info.xpath('./td[6]/text()')[0]
                # A row without an area link yields an empty node list; fall
                # back to "" instead of indexing, which would raise
                # IndexError and drop the whole row.
                area_nodes = info.xpath('./td[4]/a/text()')
                area = area_nodes[0] if area_nodes else ""
                speed = info.xpath('./td[7]/div/@title')[0]
                survival_time = info.xpath('./td[9]/text()')[0]

                print(ip + " | " + port + " | " + anonymity + " | " +
                      http_type + " | " + area + " | " + speed + " | " +
                      survival_time)

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time(survival_time)
                proxy_model_list.append(proxy)
                # Running count of proxies collected so far.
                print(len(proxy_model_list))
            except Exception as e:
                # Best effort: a malformed row must not abort the whole crawl.
                logging.debug(e)

        logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")

        return proxy_model_list
示例#5
0
    def get_proxies(self):
        """Scrape the ip181 listing page and return the proxies it contains.

        The page is fetched through the parent class's ``get_proxies``,
        decoded as gb2312 and parsed with lxml. Each table row under
        ``div.col-md-12`` supplies, from its first six cells: ip, port,
        anonymity, type, speed and area.

        Returns:
            list: ``Proxy`` objects built from the rows. Rows that raise
            while being parsed are logged at debug level and skipped.
        """
        # Load the logging configuration.
        get_log_config()

        proxy_model_list = []

        print('正在爬取ip181……')

        response = super(Ip181Spider, self).get_proxies()
        # This site serves its pages encoded as gb2312.
        response.encoding = 'gb2312'
        document = etree.HTML(response.text)

        # Cell order on the page: ip, port, anonymity, type, speed, area.
        cell_paths = (
            './td[1]/text()',
            './td[2]/text()',
            './td[3]/text()',
            './td[4]/text()',
            './td[5]/text()',
            './td[6]/text()',
        )

        rows = document.xpath('//div[@class="col-md-12"]/table/tbody/tr')
        for index, row in enumerate(rows):
            try:
                ip, port, anonymity, http_type, speed, area = [
                    row.xpath(path)[0] for path in cell_paths
                ]

                # Filter out the title row.
                # NOTE(review): enumerate() is 0-based, so this skips the
                # second matched row -- confirm that is where the header sits.
                if index == 1:
                    continue

                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                # A combined "HTTP,HTTPS" entry is stored as plain http;
                # anything else is stored lower-cased.
                proxy.set_http_type(
                    'http' if http_type == 'HTTP,HTTPS' else http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                # No survival time is read from this listing.
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
            except Exception as e:
                # Best effort: a malformed row must not abort the whole crawl.
                logging.debug(e)

        summary = ("抓取 " + self.agent + " 网站共计 " +
                   str(len(proxy_model_list)) + " 个代理")
        logging.debug(summary)

        return proxy_model_list
示例#6
0
    def get_proxies(self):
        """Scrape the proxy listing page and return its HTTP/HTTPS proxies.

        The page is fetched through the parent class's ``get_proxies`` and
        scanned with a regular expression that matches ``<tr>`` rows holding
        seven ``<td>`` cells. The first six cells are used, in order: ip,
        port, anonymity, type, area, speed.

        Returns:
            list: ``Proxy`` objects for the rows whose type is ``HTTP`` or
            ``HTTPS`` (stored lower-cased). Rows that raise while being
            processed are logged at debug level and skipped.
        """
        get_log_config()

        proxy_model_list = []

        print('正在爬取快代理......')

        response = super(KuaidailiSpider, self).get_proxies()

        # Raw strings: "\s" in a normal string literal is an invalid escape
        # sequence and triggers a warning on modern Python. The pattern text
        # itself is unchanged.
        pattern = re.compile(
            r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>('
            r'.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
            re.S
        )

        infos = pattern.findall(response.text)

        for item in infos:
            try:
                # Each match is a 7-tuple; the seventh cell is not used.
                ip, port, anonymity, http_type, area, speed = item[:6]

                print(ip + " | " + port + " | " + anonymity + " | " + http_type + " | " + area + " | " + speed)

                if http_type in ('HTTP', 'HTTPS'):
                    proxy = Proxy()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    # The type is stored in lower case.
                    proxy.set_http_type(http_type.lower())
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    # This listing does not provide a survival time.
                    proxy.set_survival_time("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                # Best effort: a bad row must not abort the whole crawl.
                logging.debug(e)

        logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")

        return proxy_model_list
示例#7
0
文件: testCase.py 项目: fuzz1/spider
def case_10():
    """Manual check: store one sample proxy, then mark a random proxy as failed.

    Creates the proxy table, inserts a hard-coded sample proxy, asks the DAO
    for a random proxy address, strips its URL scheme and port, and passes
    the remaining IP to ``plus_proxy_faild_time``.
    """
    dao = ProxyDBManager()
    dao.create_proxy_table()

    proxy = Proxy()

    # Sample proxy values.
    ip, port = '125.115.141.6', 8118
    http_type = 'HTTPS'
    anonymity = '高匿'
    area = '浙江宁波'
    speed = '0.148秒'
    agent = 'agent'
    survival_time = '4小时'

    proxy.set_ip(ip)
    proxy.set_port(port)
    proxy.set_type(http_type)
    proxy.set_anonymity(anonymity)
    # Handle an empty area by storing an empty string.
    proxy.set_area('' if area is None else area)
    proxy.set_speed(speed)
    proxy.set_agent(agent)
    proxy.set_survival_time(survival_time)

    dao.insert_proxy_table(proxy)

    proxy_address = dao.select_random_proxy()
    print(proxy_address)

    # Drop the scheme so that only "host:port" remains.
    scheme = 'http://' if 'http://' in proxy_address else 'https://'
    proxy_address = proxy_address.replace(scheme, '')

    # Everything before the first ':' is the IP.
    old_ip, _, _ = proxy_address.partition(':')
    print('old IP : ', old_ip)
    dao.plus_proxy_faild_time(old_ip)