Example 1
import urllib.request  # needed by the opener below; not shown in the original snippet


class TestProxy(object):

    def __init__(self):
        self.URL = r'http://www.cctv.com/'
        self.timeout = 3
        self.myLog = LogUtils()  # project-local logging helper
        # self.run()

    # protocol: http        server: http://123.23.45.4:8080
    def link_with_server_port(self, protocol, server) -> bool:
        # self.myLog.info('link_with_server_port currentThreadName=%s' % threading.current_thread().getName())
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol: server}))
        urllib.request.install_opener(opener)
        try:
            response = urllib.request.urlopen(self.URL, timeout=self.timeout)
        except Exception as e:
            self.myLog.warn('proxy %s connect failed, exception=%s' % (server, e))
            return False
        else:
            self.myLog.info('got a response, status code %s' % response.code)
            try:
                readResultStr = response.read().decode()  # response.read() returns bytes, so decode() is needed
            except Exception as e2:
                self.myLog.warn('%s connect response.read() failed, exception=%s' % (server, e2))
                return False
            if response.code == 200:  # compare the int directly instead of str(response.code) == '200'
                self.myLog.info('%s request succeeded' % self.URL)
                return True
            else:
                self.myLog.info('%s request failed' % self.URL)
                return False
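A minimal driver for this checker might look like the sketch below (hypothetical usage, assuming LogUtils is importable in the current module):

# Hypothetical usage sketch for the checker above.
tester = TestProxy()
if tester.link_with_server_port('http', 'http://123.23.45.4:8080'):
    print('proxy answered within the timeout')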
Example 2
import random  # needed by random.choice below; not shown in the original snippet


class RandomProxy(object):
    def __init__(self):
        self.proxys = Resource.get_proxy()  # warms the proxy cache up front
        self.myLog = LogUtils()

    def process_request(self, request, spider):
        if len(Resource.get_proxy()) != 0:
            proxy = random.choice(Resource.get_proxy())
            proxy = proxy.replace('\n', '')
            proxy = proxy.strip()
            self.myLog.debug('random choice proxy = %s' % proxy)
            request.meta['proxy'] = proxy  # route this request through an HTTP proxy
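For Scrapy to invoke this middleware (and the InterceptorProxy in the next example), both must be enabled in the project's settings.py. A sketch, assuming the project package is called getproxy (the real module path is not shown in these examples):

# settings.py -- the getproxy.middlewares path is an assumption
DOWNLOADER_MIDDLEWARES = {
    'getproxy.middlewares.RandomProxy': 543,
    'getproxy.middlewares.InterceptorProxy': 544,
}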
Example 3
class InterceptorProxy(object):
    def __init__(self):
        self.myLog = LogUtils()

    def process_request(self, request, spider):
        self.myLog.debug(
            'InterceptorProxy request  url=%s  headers=%s  cookies=%s  meta=%s'
            % (request.url, request.headers, request.cookies, request.meta))

    def process_response(self, request, response, spider):
        self.myLog.debug('InterceptorProxy response for %s = %s' %
                         (request.url, response))
        # Scrapy requires process_response to return a Response (or a new Request);
        # the original returned None, which raises a middleware error
        return response
Example 4
def parse(self, response):
    tempMyLog = LogUtils()
    tempMyLog.info('proxy360Spider')
    subSelector = response.xpath(
        '//div[@class="proxylistitem" and @name="list_proxy_ip"]')
    items = []
    for sub in subSelector:
        item = GetproxyItem()
        item['ip'] = sub.xpath('.//span[1]/text()').extract()[0]
        item['port'] = sub.xpath('.//span[2]/text()').extract()[0]
        item['type'] = sub.xpath('.//span[3]/text()').extract()[0]
        item['location'] = sub.xpath('.//span[4]/text()').extract()[0]
        item['protocol'] = 'HTTP'
        item['source'] = 'proxy360'
        items.append(item)
    return items
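GetproxyItem itself is not defined in these snippets; a plausible reconstruction matching the six fields used throughout would be:

import scrapy

class GetproxyItem(scrapy.Item):
    # one scraped proxy record; field names match the keys used in the spiders
    ip = scrapy.Field()
    port = scrapy.Field()
    type = scrapy.Field()
    protocol = scrapy.Field()
    location = scrapy.Field()
    source = scrapy.Field()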
Example 5
def __init__(self):
    # an r prefix makes the string a raw literal, so backslashes need no escaping
    print(
        get_root_path()
    )  # e.g. '/Users/imac/MyDir/Project/PyProject/SpiderProject/xiciSpider/xiciSpider'
    self.totalProxyFile = os.path.join(get_root_path(),
                                       'build/proxy.txt')  # all scraped proxies
    self.aliveProxyFile = os.path.join(get_root_path(),
                                       'build/alive.txt')  # verified working proxies
    # self.URL = r'http://www.xicidaili.com/nn/1'  # testing xici proxies against xicidaili itself fails: connection refused
    # self.URL = r'http://www.baidu.com/'
    self.URL = r'http://www.cctv.com/'
    self.threads = 5  # 10
    self.timeout = 3
    self.aliveList = []
    self.myLog = LogUtils()
Example 6
class ToTxtPipeline(object):
    myLog = LogUtils()

    def process_item(self, item, spider):
        fileName = os.path.join(get_root_path(), 'build/proxy.txt')
        # self.myLog.info('start writing to %s, item=%s' % (fileName, item))
        with open(fileName, 'a') as fp:
            fp.write(item['ip'] + '\t')
            fp.write(item['port'] + '\t')
            fp.write(item['protocol'] + '\t')
            fp.write(item['type'] + '\t\t')  # two tabs after the Chinese field to keep columns aligned
            fp.write(item['location'] + '\t\t')
            fp.write(item['source'] + '\n')
        return item
Example 7
class Resource:
    log = LogUtils()

    userAgents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

    __PROXIES = []

    @staticmethod
    def get_proxy():

        if len(Resource.__PROXIES) != 0:
            # Resource.log.info('get_proxy returning %s entries' % len(Resource.__PROXIES))
            return Resource.__PROXIES
        else:
            aliveProxyFile = os.path.join(get_root_path(),
                                          'build/alive.txt')  # verified working proxies
            if not os.path.exists(aliveProxyFile):
                return Resource.__PROXIES
            with open(aliveProxyFile, 'r') as fp:
                lines = fp.readlines()
                Resource.__PROXIES = lines
                # Resource.log.info('get_proxy returning %s entries' % len(Resource.__PROXIES))
                return lines
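Because __PROXIES is a class-level cache, build/alive.txt is read at most once per process; later calls return the same in-memory list:

# Sketch of the caching behaviour of Resource.get_proxy().
first = Resource.get_proxy()   # first call reads build/alive.txt (or returns the empty cache)
second = Resource.get_proxy()  # served from the cache
assert first is second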
Example 8
class ToCsvPipeline(object):
    myLog = LogUtils()

    def process_item(self, item, spider):
        csvFileName = os.path.join(get_root_path(), 'build/proxy.csv')
        # self.myLog.info('start writing to %s, item=%s' % (csvFileName, item))

        # Open in append mode: the original 'w+' truncated the file on every
        # item, so only the last row ever survived. Write the header only once.
        writeHeader = (not os.path.exists(csvFileName)
                       or os.path.getsize(csvFileName) == 0)
        with open(csvFileName, 'a', newline='') as file:
            columns = ['ip', 'port', 'protocol', 'type', 'location', 'source']
            csvfile = csv.DictWriter(file, columns)
            if writeHeader:
                # write the CSV column names
                csvfile.writeheader()
            # write one data row
            csvfile.writerow({
                'ip': item['ip'],
                'port': item['port'],
                'protocol': item['protocol'],
                'type': item['type'],
                'location': item['location'],
                'source': item['source']
            })
        return item
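Like the middlewares, these pipelines only run once they are listed in settings.py; the module path is again an assumption:

# settings.py -- the getproxy.pipelines path is an assumption
ITEM_PIPELINES = {
    'getproxy.pipelines.ToTxtPipeline': 300,
    'getproxy.pipelines.ToCsvPipeline': 301,
}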
Example 9
import os
import threading
import urllib.request


class TestProxy(object):
    def __init__(self):
        # an r prefix makes the string a raw literal, so backslashes need no escaping
        print(
            get_root_path()
        )  # e.g. '/Users/imac/MyDir/Project/PyProject/SpiderProject/xiciSpider/xiciSpider'
        self.totalProxyFile = os.path.join(get_root_path(),
                                           'build/proxy.txt')  # all scraped proxies
        self.aliveProxyFile = os.path.join(get_root_path(),
                                           'build/alive.txt')  # verified working proxies
        # self.URL = r'http://www.xicidaili.com/nn/1'  # testing xici proxies against xicidaili itself fails: connection refused
        # self.URL = r'http://www.baidu.com/'
        self.URL = r'http://www.cctv.com/'
        self.threads = 5  # 10
        self.timeout = 3
        self.aliveList = []
        self.myLog = LogUtils()
        # self.run()

    def run(self):
        with open(self.totalProxyFile, 'r') as fp:
            lines = fp.readlines()

            if len(lines) == 0:
                self.myLog.error('%s is empty, aborting' % self.totalProxyFile)
                return  # bail out instead of spawning threads over an empty list

            modResult = len(lines) % self.threads
            if modResult == 0:
                perThreadDataSize = len(lines) // self.threads
            else:
                perThreadDataSize = len(lines) // self.threads + 1
            self.myLog.info(
                'len(lines) = %s, self.threads = %s, perThreadDataSize=%s' %
                (len(lines), self.threads, perThreadDataSize))

            threadList = []
            for index in range(self.threads):

                startIndex = index * perThreadDataSize
                if (index + 1) * perThreadDataSize > len(lines):
                    stopIndex = len(lines)
                else:
                    stopIndex = (index + 1) * perThreadDataSize

                # self.myLog.info('len(lines) = %s, startIndex=%s, stopIndex=%s' % (len(lines), startIndex, stopIndex))
                subLine = lines[startIndex:stopIndex]
                # self.myLog.info('TestProxy run threadIndex=%d, sliced lineList=%s' % (index, subLine))
                tempThread = threading.Thread(target=self.link_with_proxy,
                                              args=(subLine, ))

                self.myLog.debug('starting thread: threadName=%s' % tempThread.name)
                tempThread.start()
                threadList.append(tempThread)
                # tempThread.join()

            for tt in threadList:
                tt.join()

        self.myLog.info('-- all threads joined, writing alive.txt --')
        if len(self.aliveList) != 0:
            self.myLog.info(
                'self.aliveList is non-empty, writing alive.txt, len(aliveList)=%s' %
                len(self.aliveList))
            with open(self.aliveProxyFile, 'w') as fp:
                for i in range(len(self.aliveList)):
                    fp.write(self.aliveList[i])
                    fp.write('\n')
        else:
            self.myLog.error('self.aliveList is empty, nothing to write to alive.txt')

    # protocol: http        server: http://123.23.45.4:8080
    def link_with_server_port(self, protocol, server) -> bool:
        # self.myLog.info('link_with_server_port currentThreadName=%s' % threading.current_thread().getName())
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({protocol: server}))
        urllib.request.install_opener(opener)
        try:
            response = urllib.request.urlopen(self.URL, timeout=self.timeout)
        except Exception as e:
            self.myLog.warn('proxy %s connect failed, exception=%s' %
                            (server, e))
            return False
        else:
            self.myLog.info('got a response, status code %s' % response.code)
            try:
                readResultStr = response.read().decode(
                )  # response.read() returns bytes, so decode() is needed
            except Exception as e2:
                self.myLog.warn(
                    '%s connect response.read() failed, exception=%s' %
                    (server, e2))
                return False
            if response.code == 200:  # compare the int directly instead of str(response.code) == '200'
                self.myLog.info('%s request succeeded' % self.URL)
                return True
            else:
                self.myLog.info('%s request failed' % self.URL)
                return False

    def link_with_proxy(self, lineList):
        # self.myLog.info('linkWithProxy line=%s' % line)
        for line in lineList:
            if line.find('HTTP') == -1:
                self.myLog.warn('malformed line: %s, threadName=%s' %
                                (line, threading.current_thread().name))
                continue  # skip just this line; the original 'return' abandoned the rest of the chunk
            fields = line.split('\t')  # renamed from lineList, which shadowed the parameter
            protocol = fields[2].lower()
            ip_port = fields[0] + ':' + fields[1]
            server = protocol + r'://' + ip_port  # http://175.42.158.71:9999
            result = self.link_with_server_port(protocol, server)
            if result:
                self.myLog.info('appending to aliveList: %s' % server)
                self.aliveList.append(
                    server)  # list.append() is thread-safe, so no lock is needed
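A plausible entry point (not shown in the original) would validate every proxy in build/proxy.txt and write the survivors to build/alive.txt:

# Hypothetical entry point for the checker above.
if __name__ == '__main__':
    tester = TestProxy()
    tester.run()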
Example 10
def __init__(self):
    self.URL = r'http://www.cctv.com/'
    self.timeout = 3
    self.myLog = LogUtils()
Example 11
def __init__(self):
    self.proxys = Resource.get_proxy()
    self.myLog = LogUtils()
Example 12
import scrapy
from time import sleep
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError


class ProxyXiciSpider(scrapy.Spider):

    name = "proxy_xici_spider"
    allowed_domains = ["xicidaili.com"]
    # nn: domestic elite (high-anonymity)    nt: domestic ordinary    wn: domestic HTTPS    wt: domestic HTTP
    wds = ['nn', 'nt', 'wn', 'wt']
    pages = 20
    sleepTime = 10
    start_urls = []
    myLog = LogUtils()

    for wd in wds:  # renamed from 'type', which shadowed the builtin
        for i in range(1, pages + 1):
            start_urls.append('http://www.xicidaili.com/' + wd + '/' +
                              str(i))

    def start_requests(self):
        # note: this overrides the class-level start_urls with a single page
        self.start_urls = ['http://www.xicidaili.com/nn/1']
        for url in self.start_urls:
            self.myLog.info('requesting url = %s' % url)
            sleep(self.sleepTime)  # blocking sleep to avoid an IP ban; largely redundant once DOWNLOAD_DELAY is set
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def errback_httpbin(self, failure):
        # log all failures
        self.myLog.info('request failed, {}'.format(repr(failure)))

        # format eagerly with %; LogUtils may not support logging-style lazy args
        if failure.check(HttpError):
            response = failure.value.response
            self.myLog.info('HttpError on %s' % response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.myLog.info('DNSLookupError on %s' % request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.myLog.info('TimeoutError on %s' % request.url)

    def parse(self, response):
        self.myLog.info('response OK, url={}'.format(response.url))
        for ele in self.process_data(response):
            yield ele

    def process_data(self, response):
        # self.myLog.debug('ProxyXiciSpider parse response=%s' % (response.text))

        subSelector = response.xpath('//tr[@class=""]|//tr[@class="odd"]')
        items = []
        for sub in subSelector:
            item = GetproxyItem()

            item['ip'] = sub.xpath('.//td[2]/text()').extract()[0]
            item['port'] = sub.xpath('.//td[3]/text()').extract()[0]
            item['type'] = sub.xpath('.//td[5]/text()').extract()[0]
            if sub.xpath('.//td[4]/a/text()'):
                # was '//td[4]/a/text()', which searched the whole page and
                # always returned the first row's location
                item['location'] = sub.xpath('.//td[4]/a/text()').extract()[0]
            else:
                item['location'] = sub.xpath('.//td[4]/text()').extract()[0]
            item['protocol'] = sub.xpath('.//td[6]/text()').extract()[0]
            item['source'] = 'xicidaili'
            items.append(item)
        return items
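With the spider, pipelines, and middlewares registered, the crawl is started from the project root in the usual Scrapy way:

scrapy crawl proxy_xici_spider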
Example 13
def __init__(self):
    self.myLog = LogUtils()