def parse(self, response):
    """Parse a novel's table-of-contents page and yield one chapter request.

    Reads the novel title from the first <h2>, creates the per-novel temp
    directory, then yields a ``parse_page`` request for every ``<li>/<a>``
    chapter link. Each request carries a ``NovelsItem`` in
    ``request.meta['item']`` with the title, a decorated subtitle, the
    1-based chapter id, and the type tag ``'novels'``.
    """
    sel = Selector(response)
    # Suffix the spider name so same-titled novels from different sites
    # don't collide on disk.
    title = sel.xpath('//h2/text()').extract()[0]
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//li/a')
    # enumerate replaces the manual counter; `chapter_id` avoids shadowing
    # the builtin `id`.
    for chapter_id, d in enumerate(dd, start=1):
        url = d.xpath('@href').extract()[0]
        # FIX: strip the href before joining, consistent with the sibling
        # spiders — some sites pad hrefs with whitespace/newlines, which
        # would otherwise corrupt the joined URL.
        url = response.urljoin(url.strip())
        subtitle = d.xpath('text()').extract()[0]
        subtitle = self.polishString(subtitle)
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page (chapters under <dd>/<a>) and yield one
    ``parse_page`` request per chapter selected by ``polishPages``.

    Within every group of three positions the first and third entries are
    swapped before indexing (1,2,3 -> 3,2,1; 4,5,6 -> 6,5,4; ...) —
    presumably the site lists chapters in a three-column layout.
    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = polishTitle(sel.xpath('//h1/text()').extract()[0], self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//dd/a')
    for page_no in polishPages(title, len(links)):
        # Swap positions 1 and 3 inside each triple of page numbers.
        remainder = page_no % 3
        if remainder == 0:
            position = page_no - 2
        elif remainder == 1:
            position = page_no + 2
        else:
            position = page_no
        link = links[position - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page, taking the title from the ``og:title``
    meta tag, and yield one ``parse_page`` request per chapter link
    (``<li>/<a>``) selected by ``polishPages``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//meta[@property="og:title"]/@content').extract()[0]
    title = polishTitle(raw_title, self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//li/a')
    for page_no in polishPages(title, len(links)):
        link = links[page_no - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse the chapter index (links under ``div[@align="left"]``) and
    yield a ``parse_page`` request for every chapter that is not already
    saved on disk (checked via ``self.isFileExist``).

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//h1/a/text()').extract()[0]
    title = self.polishString("%s-%s" % (raw_title, self.name))
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//div[@align="left"]/a')
    for chapter_id, link in enumerate(links, start=1):
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = self.polishString(link.xpath('text()').extract()[0])
        chapter_name = '\n\n********* ' + chapter_name + ' *********\n\n'
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        # Skip chapters that already exist in the temp directory.
        if not self.isFileExist(title, chapter_id):
            yield request
def parse(self, response):
    """Parse a novel's ``<dl>/<dd>`` chapter index and yield one
    ``parse_page`` request per chapter ``<a>``.

    Chapter ids are remapped by reversing each triple of positions
    (1,2,3 -> 3,2,1; 4,5,6 -> 6,5,4; ...) — presumably the site lists
    chapters three per row in reversed order. ``<dd>`` entries with no
    ``<a>`` child are skipped but still consume a position so the
    mapping stays aligned. Each request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    title = sel.xpath('//h1/text()').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//dl/dd')
    pos = 0
    for d in dd:
        pos += 1
        # BUG FIX: was ((pos-1)/3+1)*3-(pos-1)%3 — true division under
        # Python 3 made item['id'] a float (e.g. 3.0). Floor division
        # keeps it an int and is identical on Python 2 ints.
        nid = ((pos - 1) // 3 + 1) * 3 - (pos - 1) % 3
        a = d.xpath('a')
        if len(a) == 0:
            continue
        url = a.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = a.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = nid
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel page: extract the title from the page <title> (the
    text before the literal suffix "全文阅读"), then yield one
    ``parse_page`` request per chapter link in ``ul.chapterlist``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    ss = sel.xpath('//title/text()').extract()[0]
    # BUG FIX: the original pattern u'([^全文阅读]*)全文阅读' used a negated
    # character CLASS, which excludes each individual character — so any
    # title containing 全, 文, 阅 or 读 failed to match and .group(1)
    # raised AttributeError. Non-greedy '.*?' expresses the real intent:
    # "everything up to the first 全文阅读".
    pattern = re.compile(u'(.*?)全文阅读')
    title = re.match(pattern, ss).group(1)
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//ul[@class="chapterlist"]/li/a')
    # enumerate replaces the manual counter; `chapter_id` avoids shadowing
    # the builtin `id`.
    for chapter_id, d in enumerate(dd, start=1):
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url)
        subtitle = d.xpath('text()').extract()[0]
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a chapter index laid out in ``table.m10`` tables; the second
    such table holds the chapter links (``tr/td/div/a``).

    Yields a ``parse_page`` request per page selected by ``polishPages``,
    with a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = sel.xpath('//meta[@name="keywords"]/@content').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    # FIX: was `sel.response.xpath(...)` — query the selector directly,
    # consistent with every sibling spider, instead of relying on the
    # Selector retaining a `.response` attribute (same result either way
    # when it does, since response.xpath delegates to its selector).
    tables = sel.xpath('//table[@class="m10"]')
    table = tables[1]
    dd = table.xpath('tr/td/div/a')
    pages = polishPages(title, len(dd))
    for i in pages:
        d = dd[i - 1]
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = d.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = i
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page; chapter links live under
    ``div.list > ul > li > a``.

    The chapter subtitle is synthesized from the running chapter number
    (u"第%d章") rather than read from the link text. Each yielded
    ``parse_page`` request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//h1/text()').extract()[0]
    title = self.polishString("%s-%s" % (raw_title, self.name))
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//div[@class="list"]/ul/li/a')
    for chapter_id, link in enumerate(links, start=1):
        url = response.urljoin(link.xpath('@href').extract()[0])
        label = self.polishString(u"第%d章" % chapter_id)
        label = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, label)
        print(url)
        print(label)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = label
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page whose chapter links sit in the last
    ``<ul>`` inside ``div.List2013``, and yield one ``parse_page``
    request per page selected by ``polishPages``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = polishTitle(sel.xpath('//h3/a/text()').extract()[0], self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    chapter_list = sel.xpath('//div[@class="List2013"]/ul')[-1]
    links = chapter_list.xpath('li/a')
    for page_no in polishPages(title, len(links)):
        link = links[page_no - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a paginated novel reader whose pages are ``<prefix>-<n>``.

    Extracts the title (text inside 《…》 in the <h1>) and what follows
    the last colon (presumably the author — currently unused except by
    the disabled updateJSON call), finds the highest page number from
    the last pagination link, then yields a ``parse_page`` request for
    every page 1..maxPage. Unlike the list-based spiders, the subtitle
    is left empty. Each request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    ss = sel.xpath('//h1/text()').extract()[0]
    pattern = re.compile(u'《([^》]*)》')
    title = re.match(pattern, ss).group(1)
    title = polishTitle(title, self.name)
    print(title)
    pattern = re.compile(u'.*:([^:]*)$')
    tt = re.match(pattern, ss).group(1)
    author = "%s" % tt
    # TODO(review): author is computed but unused while updateJSON stays
    # disabled — confirm whether this persistence is still wanted.
    #self.updateJSON(response.url, title, author);
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    # This site provides no per-chapter subtitles.
    subtitle = ''
    # The last pagination link holds the highest page number: ...-<N>/
    lastUrl = sel.xpath('//div[@id="webPage"]/a/@href').extract()[-1]
    # FIX: raw string — '\d' in a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python 3); the pattern bytes are
    # unchanged, so matching behavior is identical.
    pattern = re.compile(r'[^-]*-(\d+)/')
    maxPage = int(re.match(pattern, lastUrl).group(1))
    # Everything before the first '-' in the current URL is the page prefix.
    pattern = re.compile(u'([^-]*)')
    m = re.match(pattern, response.url)
    pageUrlPrefix = "%s-" % (m.group(0))
    # `page_id` avoids shadowing the builtin `id`.
    for page_id in range(1, maxPage + 1):
        url = "%s%d" % (pageUrlPrefix, page_id)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = page_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request