class Pansoso2Spider(scrapy.Spider):
    """Pansoso spider 02 - extract download links from level-2 pages.

    Reads the level-2 URL list as ``start_urls``, visits each page, takes
    the first non-``.html`` href from every ``<div class="down">`` element
    and appends it to the level-3 file.
    """
    name = "pansoso2"
    allowed_domains = ["www.pansoso.com"]
    # NOTE(review): the statements below run at import time (class-body
    # side effects) — the level-3 file is wiped as soon as the module loads.
    settings = get_project_settings()
    level2_file = settings.get('LEVEL2_FILE')
    level3_file = settings.get('LEVEL3_FILE')
    rm_file(level3_file)
    start_urls = read_file(level2_file)

    def parse(self, response):
        """Append one download link per <div class="down"> to the level-3 file."""
        if response.status == 200:
            selector = scrapy.Selector(response)
            # <div class="down">
            infos = selector.xpath('//div[@class="down"]')
            level3_urls = []
            for info in infos:
                hrefs = [h for h in info.xpath('a/@href').extract()
                         if '.html' not in h]
                # Fix: the original indexed hrefs[0] unconditionally, which
                # raised IndexError whenever a block contained only .html
                # links; skip such blocks instead.
                if hrefs:
                    level3_urls.append(hrefs[0])
            write_file(self.level3_file, level3_urls, mode='append')
def __init__(self, *args, **kwargs):
    # Initialize the spider: wipe the previous level-4 output file and load
    # the level-3 URL list as start_urls.
    # NOTE(review): the super() call targets ``PansosoSpider3`` — confirm this
    # matches the enclosing class name; other blocks in this file use
    # different names (PansosoSpiderThread, PansosoSpider2), so this may be a
    # copy/paste slip that would raise NameError/TypeError at runtime.
    super(PansosoSpider3, self).__init__(*args, **kwargs)
    # NOTE(review): ``settings`` is a bare name here — presumably a
    # module-level settings object (bare names in a method body do not see
    # class attributes); verify it is in scope.
    self.level3_file = settings.get('LEVEL3_FILE')
    self.level4_file = settings.get('LEVEL4_FILE')
    # Destructive: removes any previous level-4 results on every init.
    rm_file(self.level4_file)
    self.start_urls = read_file(self.level3_file)
class PansosoSpiderThread(scrapy.Spider):
    """Pansoso spider 03 - extract Baidu-cloud links from level-3 pages.

    Reads the level-3 URL list as ``start_urls``, visits each page, pulls
    the first ``<p><a href>`` out of every ``<div class="file">`` element
    and appends it to the level-4 file.
    """
    name = "pansoso3"
    allowed_domains = ["www.pansoso.com"]
    # NOTE(review): class-body side effects run at import time — the
    # level-4 file is wiped as soon as the module loads.
    settings = get_project_settings()
    level3_file = settings.get('LEVEL3_FILE')
    level4_file = settings.get('LEVEL4_FILE')
    rm_file(level4_file)
    start_urls = read_file(level3_file)

    def parse(self, response):
        """Append one Baidu-cloud link per <div class="file"> to the level-4 file."""
        # Crude politeness delay. NOTE(review): time.sleep blocks Scrapy's
        # event loop; prefer the DOWNLOAD_DELAY setting.
        time.sleep(0.5)
        if response.status == 200:
            selector = scrapy.Selector(response)
            infos = selector.xpath('//div[@class="file"]')
            level4_urls = []
            for info in infos:
                # Fix: the original did extract()[0], raising IndexError
                # when a file-block had no <p><a href>; extract_first()
                # returns None in that case and we skip the block.
                href = info.xpath('p/a/@href').extract_first()
                if href:
                    print(href)
                    level4_urls.append(href)
            write_file(self.level4_file, level4_urls, mode='append')
def __init__(self, mode='append', *args, **kwargs):
    """Configure the level-2 input and level-3 output files.

    With ``mode='override'`` the previous level-3 output is deleted
    before crawling; any other mode appends to it.
    """
    super(DashengpanSpider2, self).__init__(*args, **kwargs)
    self.mode = mode
    input_path = settings.get('LEVEL2_FILE')
    output_path = settings.get('LEVEL3_FILE')
    self.level2_file = input_path
    self.level3_file = output_path
    if mode == 'override':
        # Start from a clean output file.
        rm_file(output_path)
    self.start_urls = read_file(input_path)
def __init__(self, *args, **kwargs):
    """Initialize the spider: wipe the previous level-3 output file and
    load the level-2 URL list as ``start_urls``.
    """
    super(PansosoSpider2, self).__init__(*args, **kwargs)
    # NOTE(review): ``settings`` is a bare name — presumably a module-level
    # settings object; verify it is in scope.
    self.level2_file = settings.get('LEVEL2_FILE')
    self.level3_file = settings.get('LEVEL3_FILE')
    # Destructive: removes any previous level-3 results on every init.
    rm_file(self.level3_file)
    self.start_urls = read_file(self.level2_file)
    # Fix: removed leftover debug output (banner print, dir(self),
    # dir(self.start_requests) dumps) that polluted stdout on every init.
def __init__(self, mode='append', *args, **kwargs):
    """Configure the level-2 input and result files; optionally start a
    Selenium Chrome browser for pages that need delayed (JS) loading.

    With ``mode='override'`` the previous result file is deleted before
    crawling; any other mode appends to it.
    """
    super(DashengpanSpider2, self).__init__(*args, **kwargs)
    self.mode = mode
    # NOTE(review): ``settings`` is a bare name — presumably a module-level
    # settings object; verify it is in scope.
    self.level2_file = settings.get('LEVEL2_FILE')
    self.result_file = settings.get('RESULT_FILE')
    if self.mode == 'override':
        # Start from a clean result file.
        rm_file(self.result_file)
    # Fix: build the filtered URL list in a single pass instead of
    # assigning start_urls twice (the unfiltered list was immediately
    # discarded by the second assignment).
    self.start_urls = [url for url in read_file(self.level2_file)
                       if url.startswith('http')]
    if settings.get("IS_USE_DELAY_LOAD_URL", False):
        # JS-rendered pages need a real browser to populate content.
        self.browser = webdriver.Chrome()
        self.browser.set_page_load_timeout(30)
        # NOTE(review): the browser is never quit in this block — ensure a
        # closed()/spider_closed handler calls self.browser.quit() so
        # Chrome processes are not leaked.