Exemplo n.º 1
0
 def parse_proxy_details(self, response):
     """Turn the first HTML table of the response into one ``ProxyItem``.

     The response text is parsed with BeautifulSoup (``lxml`` parser), the
     first ``<table>`` element is handed to ``derive_table`` and the frame it
     returns is relabelled, given an ``id`` column built as ``ip:port`` and
     reduced to the columns ``id``, ``ip``, ``port``, ``type`` and ``addr``.

     Yields:
         A single ``ProxyItem`` whose ``ip_table`` field holds the reduced
         frame converted with ``to_dict()``.
     """
     all_columns = ['ip', 'port', 'addr', 'hide', 'type', 'len', 'check_time']
     kept_columns = ['id', 'ip', 'port', 'type', 'addr']

     document = bs4.BeautifulSoup(response.text, 'lxml')
     first_table = document.select('table')[0]

     # NOTE(review): derive_table is defined elsewhere; presumably it returns
     # a pandas DataFrame with seven string columns -- confirm.
     frame = derive_table(first_table)
     frame.columns = all_columns
     frame['id'] = frame['ip'] + ':' + frame['port']
     reduced = frame[kept_columns]

     result = ProxyItem()
     result['ip_table'] = reduced.to_dict()
     yield result
Exemplo n.º 2
0
 def start_requests(self):
     """Yield one probe request per stored proxy.

     Every object returned by ``Proxy.objects.all()`` is converted into a
     ``ProxyItem`` built from all of its instance attributes except
     ``_state``, stamped with the current local time under ``request_at``
     and attached to a request for ``https://www.baidu.com`` through
     ``meta['item']``.

     Yields:
         ``Request`` objects whose callback is ``self.parse``.
     """
     for proxy in Proxy.objects.all():
         # vars() returns the instance's own __dict__, so deleting a key from
         # it would strip the attribute from the object itself. Build a
         # filtered copy instead and leave the instance untouched.
         # NOTE(review): ``_state`` looks like Django's per-instance
         # bookkeeping attribute -- confirm Proxy is a Django model.
         fields = {
             name: value
             for name, value in vars(proxy).items()
             if name != '_state'
         }
         proxy_item = ProxyItem(fields)
         proxy_item['request_at'] = datetime.datetime.now()
         # NOTE(review): dont_filter=True is presumably needed because every
         # request targets the same URL -- confirm against the dupe filter.
         yield Request(url='https://www.baidu.com',
                       meta={'item': proxy_item},
                       callback=self.parse,
                       dont_filter=True)
Exemplo n.º 3
0
 def parse_ip3366(self, response):
     """Yield a ``ProxyItem`` for each row of the table inside ``div#list``.

     Rows are selected with ``//div[@id="list"]/table/tbody/tr`` and the text
     nodes of their ``<td>`` children are mapped by position:
     0 -> ip, 1 -> port, 2 -> type, 3 -> schemes, 5 -> addr, 6 -> speed,
     7 -> update (position 4 is not used). Every item is tagged with
     ``ori = 'ip3366'``.

     Rows that provide fewer than eight text cells are skipped, so a single
     short row cannot raise ``IndexError`` and end the generator early.

     NOTE(review): ``td/text()`` yields nothing for an empty cell, which
     would shift the later positions -- confirm against the page markup.
     """
     for row in response.xpath('//div[@id="list"]/table/tbody/tr'):
         cells = row.xpath('./td/text()').extract()
         if len(cells) < 8:
             # Not a full data row; the highest index read below is 7.
             continue
         item = ProxyItem()
         item['ip'] = cells[0]
         item['port'] = cells[1]
         item['type'] = cells[2]
         item['schemes'] = cells[3]
         item['addr'] = cells[5]
         item['speed'] = cells[6]
         item['update'] = cells[7]
         item['ori'] = 'ip3366'  # constant tag naming the source site
         yield item
Exemplo n.º 4
0
 def parse_66ip(self, response):
     """Yield a ``ProxyItem`` for each data row of the 66ip table.

     Rows are selected with
     ``//div[@class="footer"]/div[@align="center"]/table/tr``. The first row
     is always skipped (presumably the header -- confirm). For the remaining
     rows the ``<td>`` text nodes are mapped by position:
     0 -> ip, 1 -> port, 2 -> addr, 3 -> type, 4 -> update. Every item is
     tagged with ``ori = '66ip'``.

     Rows that provide fewer than five text cells are skipped, so a single
     short row cannot raise ``IndexError`` and end the generator early.
     """
     rows = response.xpath(
         '//div[@class="footer"]/div[@align="center"]/table/tr')
     for position, row in enumerate(rows):
         if position == 0:
             continue
         cells = row.xpath('./td/text()').extract()
         if len(cells) < 5:
             # Not a full data row; the highest index read below is 4.
             continue
         item = ProxyItem()
         item['ip'] = cells[0]
         item['port'] = cells[1]
         item['type'] = cells[3]
         item['addr'] = cells[2]
         item['update'] = cells[4]
         item['ori'] = '66ip'  # constant tag naming the source site
         yield item
Exemplo n.º 5
0
 def parse_jiangxianli(self, response):
     """Yield a ``ProxyItem`` for each body row of the jiangxianli table.

     Rows are selected from the table whose ``class`` attribute is exactly
     ``table table-hover table-bordered table-striped``. The ``<td>`` text
     nodes are mapped by position:
     1 -> ip, 2 -> port, 3 -> type, 4 -> schemes, 5 -> addr, 7 -> speed,
     8 -> update (positions 0 and 6 are not used). Every item is tagged with
     ``ori = 'jiangxianli'``.

     Rows that provide fewer than nine text cells are skipped, so a single
     short row cannot raise ``IndexError`` and end the generator early.
     """
     rows = response.xpath(
         '//table[@class="table table-hover table-bordered table-striped"]/tbody/tr'
     )
     for row in rows:
         cells = row.xpath('./td/text()').extract()
         if len(cells) < 9:
             # Not a full data row; the highest index read below is 8.
             continue
         item = ProxyItem()
         item['ip'] = cells[1]
         item['port'] = cells[2]
         item['type'] = cells[3]
         item['schemes'] = cells[4]
         item['addr'] = cells[5]
         item['speed'] = cells[7]
         item['update'] = cells[8]
         item['ori'] = 'jiangxianli'  # constant tag naming the source site
         yield item
Exemplo n.º 6
0
 def area_parse(self, response):
     """Yield a ``ProxyItem`` for each data row matched by ``#footer tr``.

     The first matched row is always skipped (presumably the header --
     confirm). For the remaining rows the first text node of the cells at
     positions 0, 1, 2 and 4 is stored as ``ip``, ``port``, ``area`` and
     ``remark`` respectively (position 3 is not used).

     Rows with fewer than five cells, or where one of the needed cells has
     no text node, are skipped, so a single malformed row cannot raise
     ``IndexError`` and end the generator early.
     """
     needed_positions = (0, 1, 2, 4)
     for position, row in enumerate(response.css('#footer tr')):
         if position == 0:
             continue
         cells = row.css(" td")
         if len(cells) < 5:
             # Not a full data row; the highest cell index read is 4.
             continue
         texts = [cells[i].css("::text").extract() for i in needed_positions]
         if not all(texts):
             # At least one needed cell has no text node at all.
             continue
         ip, port, area, remark = (found[0] for found in texts)
         item = ProxyItem()
         item['ip'] = ip
         item['port'] = port
         item['area'] = area
         item['remark'] = remark
         yield item
Exemplo n.º 7
0
 def parse_89ip(self, response):
     """Yield a ``ProxyItem`` for each body row of the ``layui-table`` table.

     Every ``<td>`` text node has spaces, newlines, tabs and carriage
     returns removed before use. The cleaned texts are mapped by position:
     0 -> ip, 1 -> port, 2 -> addr, 4 -> update (position 3 is not used).
     ``type`` is always the literal ``'unknow'`` and every item is tagged
     with ``ori = '89ip'``.

     Rows that provide fewer than five text cells are skipped, so a single
     short row cannot raise ``IndexError`` and end the generator early.
     """
     # One translation table deletes the same four characters the chained
     # .replace() calls used to remove, in a single pass per string.
     drop_whitespace = str.maketrans('', '', ' \n\t\r')
     for row in response.xpath('//table[@class="layui-table"]/tbody/tr'):
         cells = [
             text.translate(drop_whitespace)
             for text in row.xpath('./td/text()').extract()
         ]
         if len(cells) < 5:
             # Not a full data row; the highest index read below is 4.
             continue
         item = ProxyItem()
         item['ip'] = cells[0]
         item['port'] = cells[1]
         # Spelling kept as-is: the value is runtime data that consumers
         # of the items may already match on.
         item['type'] = 'unknow'
         item['addr'] = cells[2]
         item['update'] = cells[4]
         item['ori'] = '89ip'  # constant tag naming the source site
         yield item
Exemplo n.º 8
0
 def parse_data5u(self, response):
     """Yield a ``ProxyItem`` for each data ``<ul>`` of the data5u list.

     Entries are selected with
     ``//div[@class="wlist"]/ul/li[@style="text-align:center;"]/ul``. The
     first match is always skipped (presumably the header -- confirm). For
     the remaining ones the texts found by ``./span/li/text()`` are mapped by
     position: 0 -> ip, 1 -> port, 2 -> type, 3 -> schemes, 5 -> addr,
     7 -> speed, 8 -> update (positions 4 and 6 are not used). Every item is
     tagged with ``ori = 'data5u'``.

     Entries that provide fewer than nine texts are skipped, so a single
     short entry cannot raise ``IndexError`` and end the generator early.
     """
     entries = response.xpath(
         '//div[@class="wlist"]/ul/li[@style="text-align:center;"]/ul'
     )
     for position, entry in enumerate(entries):
         if position == 0:
             continue
         cells = entry.xpath('./span/li/text()').extract()
         if len(cells) < 9:
             # Not a full data entry; the highest index read below is 8.
             continue
         item = ProxyItem()
         item['ip'] = cells[0]
         item['port'] = cells[1]
         item['type'] = cells[2]
         item['schemes'] = cells[3]
         item['addr'] = cells[5]
         item['speed'] = cells[7]
         item['update'] = cells[8]
         item['ori'] = 'data5u'  # constant tag naming the source site
         yield item
Exemplo n.º 9
0
    def parse_iphai(self, response):
        """Yield a ``ProxyItem`` for each seven-cell row of the iphai table.

        Rows come from ``//div[@class="table-responsive module"]/table/tr``.
        Each ``<td>`` text node has spaces and ``\\r\\n`` sequences removed;
        only rows that end up with exactly seven texts produce an item. The
        texts are stored, in order, as ``ip``, ``port``, ``type``,
        ``schemes``, ``addr``, ``speed`` and ``update``, and every item is
        tagged with ``ori = 'iphai'``.
        """
        field_names = ('ip', 'port', 'type', 'schemes', 'addr', 'speed',
                       'update')
        rows = response.xpath(
            '//div[@class="table-responsive module"]/table/tr')
        for row in rows:
            raw_texts = row.xpath('./td/text()').extract()
            cells = [
                text.replace(' ', '').replace('\r\n', '')
                for text in raw_texts
            ]
            if len(cells) != 7:
                # Anything that is not a full seven-column row is ignored.
                continue

            item = ProxyItem()
            for name, value in zip(field_names, cells):
                item[name] = value
            item['ori'] = 'iphai'  # constant tag naming the source site
            yield item