def parse(self, response):
    """Build one download item per entry linked from ``response``.

    Two page layouts are handled:

    * the kernel.org ``ipvsadm`` directory index, where the text of every
      anchor is taken as an entry name, except for a fixed set of texts
      (parent/tmp links, column headers, ``ChangeLog``) that are skipped;
    * any other page, where the ``href`` of each link inside the
      ``mainContent`` list is taken.

    Returns:
        A list of ``OpensourceSpiderItem`` objects with ``orginname``,
        ``downurl`` and ``filesize`` (always 0 here) filled in.
    """
    index_url = 'https://www.kernel.org/pub/linux/utils/kernel/ipvsadm/'
    # Anchor texts on the index page that are not treated as entries.
    skipped_texts = (
        '../', 'tmp/', 'Name', 'Last modified', 'Description',
        'Parent Directory', 'ChangeLog', 'Size',
    )

    sel = Selector(response)
    if response.url == index_url:
        # The index URL ends with '/', so appending the entry name is enough.
        entries = [
            (text, response.url + text)
            for text in sel.xpath('//a/text()').extract()
            if text not in skipped_texts
        ]
    else:
        hrefs = sel.xpath(
            '//div[@id="mainContent"]/ul/li/a/@href').extract()
        # These are raw hrefs on an arbitrary page: they may be absolute,
        # root-relative, or relative to a page URL that does not end in '/'.
        # Plain string concatenation produces broken URLs in all of those
        # cases, so resolve them against the response URL instead.
        entries = [(href, response.urljoin(href)) for href in hrefs]

    items = []
    for name, url in entries:
        item = OpensourceSpiderItem()
        item['orginname'] = name
        item['downurl'] = url
        item['filesize'] = 0
        items.append(item)
    return items
def parse(self, response):
    """Turn the text of every anchor on the page into a download item.

    Anchors whose text is ``../`` or ``tmp/`` are ignored. For each
    remaining text the item stores the text as ``orginname``, the text
    appended to ``response.url`` as ``downurl``, and 0 as ``filesize``.

    Returns:
        A list of ``OpensourceSpiderItem`` objects, in page order.
    """
    skipped = ('../', 'tmp/')
    link_texts = Selector(response).xpath('//a/text()').extract()

    collected = []
    for text in link_texts:
        if text in skipped:
            continue
        entry = OpensourceSpiderItem()
        entry['orginname'] = text
        entry['downurl'] = response.url + text
        entry['filesize'] = 0
        collected.append(entry)
    return collected
def parse(self, response):
    """Build one download item per anchor ``href`` found on the page.

    Hrefs equal to one of a fixed set of values are skipped; every other
    href becomes an item whose ``orginname`` is the raw href, whose
    ``downurl`` is the href resolved against the response URL, and whose
    ``filesize`` is 0.

    Returns:
        A list of ``OpensourceSpiderItem`` objects, in page order.
    """
    # NOTE(review): apart from '../' and 'tmp/', these values look like
    # link *texts* (column headers) rather than hrefs, so they may never
    # match here -- confirm against the target page before changing them.
    skipped = (
        '../', 'tmp/', 'Name', 'Last modified', 'Description',
        'Parent Directory', 'ChangeLog', 'Size',
    )

    sel = Selector(response)
    items = []
    for href in sel.xpath('//a/@href').extract():
        if href in skipped:
            continue
        item = OpensourceSpiderItem()
        item['orginname'] = href
        # An href may be absolute or root-relative; concatenating it onto
        # the page URL would then yield a malformed address, so resolve it
        # the way a browser would.
        item['downurl'] = response.urljoin(href)
        item['filesize'] = 0
        items.append(item)
    return items
def parse(self, response):
    """Extract download items from anchors found in the raw page text.

    The page text is scanned with a regular expression that matches
    ``<a href="...">`` tags whose href consists only of word characters,
    dots and hyphens. Only the captured href is used: it is stored as
    ``orginname`` and appended to ``response.url`` to form ``downurl``;
    ``filesize`` is set to 0. The remaining capture groups are ignored.

    Returns:
        A list of ``OpensourceSpiderItem`` objects, in match order.
    """
    anchor_re = re.compile(
        r'<a\shref="([\d\w.-]*)">(.*)</a>(\s*)([\d\w-]*)(\s*)([\d:]*)(\s*)'
    )

    found = []
    for groups in anchor_re.findall(response.text):
        href = groups[0]
        entry = OpensourceSpiderItem()
        entry['orginname'] = href
        entry['downurl'] = response.url + href
        entry['filesize'] = 0
        found.append(entry)
    return found
def parse(self, response):
    """Collect download items from ``/downloads/<name>/thankyou.html`` links.

    The raw page text is searched for anchors pointing at
    ``/downloads/<name>/thankyou.html`` that are followed by an
    ``onmouseover=`` attribute. Each captured ``<name>`` is stored as
    ``orginname`` and appended to the fixed files base URL to form
    ``downurl``; ``filesize`` is set to 0.

    Returns:
        A list of ``OpensourceSpiderItem`` objects, in match order.
    """
    files_base = 'http://sphinxsearch.com/files/'
    thankyou_re = re.compile(
        r'<a\shref="/downloads/([\d\w.-]*)/thankyou.html"\sonmouseover=')

    found = []
    for name in thankyou_re.findall(response.text):
        entry = OpensourceSpiderItem()
        entry['orginname'] = name
        entry['downurl'] = files_base + name
        entry['filesize'] = 0
        found.append(entry)
    return found
def parse(self, response):
    """Walk a listing page, recursing into some rows and queueing others.

    Each regex match yields the anchor href (group 0) and a trailing
    token (group 6) that decides what happens to the row:

    * ``'-'``  -> request ``response.url + href`` and parse it with this
      same method;
    * ``''``   -> ignore the row;
    * anything else -> build an item and hand it to ``self.parse_item``.

    Yields:
        ``Request`` objects only; items travel in ``meta['item']``.
    """
    row_re = re.compile(r'<a\shref="(.*?)">(.*?)</a>([\s]*)([\d\w-]*)\s([\d:]*)([\s]*)([-\d\w]*)([\s]*)')

    for row in row_re.findall(response.text):
        href = row[0]
        # Presumably the size column of the listing ('-' for folders) --
        # TODO confirm against the target server's index format.
        trailing = row[6]

        if trailing == '':
            continue

        if trailing == '-':
            yield Request(response.url + href, callback=self.parse)
            continue

        entry = OpensourceSpiderItem()
        entry['orginname'] = href
        entry['downurl'] = response.url + href
        entry['filesize'] = 0
        # The same page URL is requested again with a unique ``t`` query
        # value (presumably so the request is not treated as a duplicate);
        # the item rides along in ``meta`` for ``parse_item``.
        follow_up = response.url + "?t=" + str(uuid.uuid1())
        yield Request(follow_up, meta={'item': entry},
                      callback=self.parse_item)
def parse(self, response):
    """Walk an icon-annotated listing, recursing into directories.

    Rows are recognised in the raw page text as an ``<img ... alt="[X]">``
    tag followed by one whitespace character and an ``<a href="...">``
    tag. The ``alt`` label ``X`` decides what happens to the row:

    * ``DIR``       -> request ``response.url + href`` and parse it with
      this same method;
    * ``PARENTDIR`` -> ignore the row;
    * anything else -> build an item (``orginname`` = href, ``downurl`` =
      ``response.url + href``, ``filesize`` = 0) and pass it to
      ``self.parse_item``.

    Yields:
        ``Request`` objects only; items travel in ``meta['item']``.
    """
    # The href group is lazy like its siblings. Written as a greedy
    # ``(.*)?`` it would run to the last '">' on the line, swallowing
    # several anchors into one bogus href when a line holds more than one.
    pt1 = re.compile(
        r'<img\ssrc="(.*?)"\salt="\[(.*?)\]">\s<a\shref="(.*?)">(.*?)</a>')

    for _src, kind, href, _text in pt1.findall(response.text):
        if kind == 'PARENTDIR':
            continue

        if kind == 'DIR':
            yield Request(response.url + href, callback=self.parse)
            continue

        item = OpensourceSpiderItem()
        item['orginname'] = href
        item['downurl'] = response.url + href
        item['filesize'] = 0
        # The same page URL is requested again with a unique ``t`` query
        # value (presumably so the request is not treated as a duplicate);
        # the item rides along in ``meta`` for ``parse_item``.
        yield Request(response.url + "?t=" + str(uuid.uuid1()),
                      meta={'item': item},
                      callback=self.parse_item)
def parse(self, response):
    """Walk an icon-annotated listing with trailing columns, recursing
    into directories.

    Rows are recognised in the raw page text as an ``<img ... alt="[X]">``
    tag, optional whitespace, an ``<a href="...">`` tag and a run of
    optional trailing tokens. Only the ``alt`` label ``X`` (group 1) and
    the href (group 3) are used:

    * ``DIR``       -> request ``response.url + href`` and parse it with
      this same method;
    * ``PARENTDIR`` -> ignore the row;
    * anything else -> build an item (``orginname`` = href, ``downurl`` =
      ``response.url + href``, ``filesize`` = 0) and pass it to
      ``self.parse_item``.

    Yields:
        ``Request`` objects only; items travel in ``meta['item']``.
    """
    # The href group is lazy like its siblings. Written as a greedy
    # ``(.*)?`` it would run to the last '">' on the line, swallowing
    # several anchors into one bogus href when a line holds more than one.
    pt1 = re.compile(
        r'<img\ssrc="(.*?)"\salt="\[(.*?)\]">([\s]*?)<a\shref="(.*?)">(.*?)</a>([\s]*)([\d-]*)([\s]*)([\d:]*)([\s]*)([\d\w-]*)([\s]*)'
    )

    for row in pt1.findall(response.text):
        kind = row[1]
        href = row[3]

        if kind == 'PARENTDIR':
            continue

        if kind == 'DIR':
            yield Request(response.url + href, callback=self.parse)
            continue

        item = OpensourceSpiderItem()
        item['orginname'] = href
        item['downurl'] = response.url + href
        item['filesize'] = 0
        # The same page URL is requested again with a unique ``t`` query
        # value (presumably so the request is not treated as a duplicate);
        # the item rides along in ``meta`` for ``parse_item``.
        yield Request(response.url + '?t=' + str(uuid.uuid1()),
                      meta={'item': item},
                      callback=self.parse_item)