Example #1
    def getProxies(self):
        # Load the logging configuration
        getLogConfig()

        proxy_model_list = []

        response = super(data5uSpider, self).getProxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//ul[@class="l2"]')

        # XPaths relative to each <ul> keep every field from the same row,
        # even when a cell is missing
        for info in infos:
            try:
                ip = info.xpath('./span[1]/li/text()')[0]  # IP
                port = info.xpath('./span[2]/li/text()')[0]  # port
                anonymity = info.xpath('./span[3]/li/a/text()')[0]  # anonymity
                type = info.xpath('./span[4]/li/a/text()')[0]  # type
                area = info.xpath('./span[6]/li/a[1]/text()')[0]  # province
                area = area + info.xpath('./span[6]/li/a[2]/text()')[0]  # city
                speed = info.xpath('./span[8]/li/text()')[0]  # speed

                if type in ('http', 'https'):
                    # print(type + "://" + ip + ":" + port)
                    proxy = proxyModel()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_type(type)
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    proxy.set_survivalTime("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)

        logging.debug("Scraped " + str(len(proxy_model_list)) +
                      " proxies from " + self.agent)

        return proxy_model_list
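
Every example on this page fills a proxyModel through setter calls, but the model itself does not appear here. A minimal sketch of what such a container might look like, with the field names inferred from the setters used above (an assumption, not the project's actual definition):

class proxyModel(object):
    """Plain container for one scraped proxy record (all fields are strings)."""

    def __init__(self):
        self.ip = ''
        self.port = ''
        self.type = ''
        self.anonymity = ''
        self.area = ''
        self.speed = ''
        self.agent = ''
        self.survivalTime = ''

    # One setter per field, mirroring the calls in the examples
    def set_ip(self, ip): self.ip = ip
    def set_port(self, port): self.port = port
    def set_type(self, type): self.type = type
    def set_anonymity(self, anonymity): self.anonymity = anonymity
    def set_area(self, area): self.area = area
    def set_speed(self, speed): self.speed = speed
    def set_agent(self, agent): self.agent = agent
    def set_survivalTime(self, survivalTime): self.survivalTime = survivalTime
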
Example #2
    def getProxies(self):
        # Load the logging configuration
        getLogConfig()

        proxy_model_list = []

        response = super(xiciSpider, self).getProxies()
        selector = etree.HTML(response.text)

        infos = selector.xpath('//tr[@class="odd"]')

        for i, info in enumerate(infos):
            try:
                ip = info.xpath('./td[2]/text()')[0]  # IP
                port = info.xpath('./td[3]/text()')[0]  # port
                anonymity = info.xpath('./td[5]/text()')[0]  # anonymity
                type = info.xpath('./td[6]/text()')[0]  # type
                # The area cell may be empty; fall back to '' instead of skipping the row
                area = (info.xpath('./td[4]/a/text()') or [''])[0]  # area
                speed = info.xpath('./td[7]/div/@title')[0]  # speed
                survivalTime = info.xpath('./td[9]/text()')[0]  # survival time

                print(ip + " | " + port + " | " + anonymity + " | " + type +
                      " | " + area + " | " + speed + " | " + survivalTime)

                proxy = proxyModel()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_type(type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survivalTime(survivalTime)
                proxy_model_list.append(proxy)

            except Exception as e:
                logging.debug(e)

        logging.debug("Scraped " + str(len(proxy_model_list)) +
                      " proxies from " + self.agent)

        return proxy_model_list
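
Every getProxies override begins with super(...).getProxies(), which suggests the shared base class performs the HTTP request and leaves parsing to the subclasses. A minimal sketch of such a base class using the requests library; the class name and attributes here are placeholders, not the project's real ones:

import requests


class BaseSpider(object):
    url = ''    # listing-page URL, set by each subclass
    agent = ''  # site name used in log messages

    def getProxies(self):
        # Fetch the listing page and hand the raw response to the subclass
        headers = {'User-Agent': 'Mozilla/5.0'}
        return requests.get(self.url, headers=headers, timeout=10)
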
Example #3
    def getProxies(self):

        # Load the logging configuration
        getLogConfig()

        proxy_model_list = []

        response = super(ip181Spider, self).getProxies()
        # This site is encoded as gb2312
        response.encoding = 'gb2312'
        selector = etree.HTML(response.text)

        infos = selector.xpath('//div[@class="col-md-12"]/table/tbody/tr')
        for i, info in enumerate(infos):
            try:
                ip = info.xpath('./td[1]/text()')[0]  # IP
                port = info.xpath('./td[2]/text()')[0]  # port
                anonymity = info.xpath('./td[3]/text()')[0]  # anonymity
                type = info.xpath('./td[4]/text()')[0]  # type
                speed = info.xpath('./td[5]/text()')[0]  # speed
                area = info.xpath('./td[6]/text()')[0]  # area
                # print(ip + " | " + port + " | " + anonymity + " | " + type + " | " + speed + " | " + area)

                if i == 1:
                    # Skip the header row
                    continue

                proxy = proxyModel()
                proxy.set_ip(ip)
                proxy.set_port(port)
                if type == 'HTTP,HTTPS':
                    proxy.set_type('http')
                else:
                    proxy.set_type(type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survivalTime("")
                proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)
        logging.debug("Scraped " + str(len(proxy_model_list)) +
                      " proxies from " + self.agent)

        return proxy_model_list
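
Example #3 overrides response.encoding before parsing because requests infers the charset from the HTTP headers and falls back to ISO-8859-1 when none is declared, which garbles gb2312 pages. The pattern in isolation (placeholder URL):

import requests

response = requests.get('http://example.com/gb2312-page')  # placeholder URL
# response.encoding is often 'ISO-8859-1' if the server omits the charset
response.encoding = 'gb2312'  # set this before the first access to .text
html = response.text          # now decoded correctly
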
Example #4
    def __init__(self):

        # Load the logging configuration
        getLogConfig()

        self.__proxyTable = 'proxy'

        self.conn = pymysql.connect(
            host=config.MYSQL_HOST,
            db=config.MYSQL_DBNAME,
            user=config.MYSQL_USER,
            passwd=config.MYSQL_PASSWORD,
            charset='utf8',  # Set the charset explicitly, or Chinese text may come back garbled
            use_unicode=False)

        # Create the cursor directly: a `with self.conn:` block would exit
        # immediately, ending the transaction (and, in pymysql >= 1.0,
        # closing the connection) before the cursor is ever used
        self.cursor = self.conn.cursor()
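
The cursor opened here is presumably used elsewhere to persist scraped proxies. A sketch of a parameterized insert as a method on the same class; the column names and the insert_proxy method are assumptions, only the table-name attribute comes from the code above:

    def insert_proxy(self, proxy):
        # Parameterized query: pymysql escapes the values, avoiding both
        # quoting bugs and SQL injection
        sql = ("INSERT INTO " + self.__proxyTable +
               " (ip, port, type, anonymity, area, speed, agent, survival_time)"
               " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (proxy.ip, proxy.port, proxy.type,
                                  proxy.anonymity, proxy.area, proxy.speed,
                                  proxy.agent, proxy.survivalTime))
        self.conn.commit()
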
Example #5
    def getProxies(self):

        # Load the logging configuration
        getLogConfig()

        proxy_model_list = []

        response = super(kuaidailiSpider, self).getProxies()

        # Raw strings keep \s a regex escape rather than a deprecated
        # Python string escape
        pattern = re.compile(
            r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
            r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
            re.S)

        infos = re.findall(pattern, response.text)

        for item in infos:
            try:
                ip = item[0]  # IP
                port = item[1]  # port
                anonymity = item[2]  # anonymity
                type = item[3]  # type
                area = item[4]  # area
                speed = item[5]  # speed

                if type in ('HTTP', 'HTTPS'):
                    # print(type.lower() + "://" + ip + ":" + port)
                    proxy = proxyModel()
                    proxy.set_ip(ip)
                    proxy.set_port(port)
                    proxy.set_type(type.lower())
                    proxy.set_anonymity(anonymity)
                    proxy.set_area(area)
                    proxy.set_speed(speed)
                    proxy.set_agent(self.agent)
                    proxy.set_survivalTime("")
                    proxy_model_list.append(proxy)
            except Exception as e:
                logging.debug(e)

        logging.debug("Scraped " + str(len(proxy_model_list)) +
                      " proxies from " + self.agent)

        return proxy_model_list
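
The regular expression above relies on the seven <td> cells appearing in a fixed order inside each <tr>. The same technique, self-contained, against a minimal two-cell fragment:

import re

pattern = re.compile(
    r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>', re.S)

sample = '''<tr>
<td>1.2.3.4</td>
<td>8080</td>
</tr>'''

print(re.findall(pattern, sample))  # [('1.2.3.4', '8080')]
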
Example #6
    def start_spider(self):
        '''
        Run multiple crawlers inside the same process.
        '''
        self.process = CrawlerProcess(self.setting)
        # self.process.crawl(DemoSpider)  # pass your Scrapy spider class here
        self.process.start()

    def stop(self):
        '''
        Stop the crawler and the proxy pool.
        '''
        self.isRunning = False
        # Release resources
        getProxyPoolWorker().stopWork()
        # TODO: stop the Scrapy crawler as well?

    def start(self):
        '''
        Start the crawler and the proxy pool.
        '''
        self.start_proxyPool()
        self.start_spider()


if __name__ == '__main__':
    # Load the logging configuration
    getLogConfig()

    manager = SpiderManager()
    manager.start()
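
For reference, CrawlerProcess.crawl expects a spider class (or its registered name), not an instance, which is why the commented-out call above passes the class. A minimal self-contained run with a placeholder spider and URL:

import scrapy
from scrapy.crawler import CrawlerProcess


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://example.com']  # placeholder URL

    def parse(self, response):
        self.logger.info('fetched %s', response.url)


process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(DemoSpider)  # pass the class, not an instance
process.start()            # blocks until the crawl finishes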