def parse(self, response):
    """Parse a novel's table-of-contents page and yield one chapter request.

    Reads the novel title from the first <h2>, creates the per-novel temp
    directory, then yields a ``parse_page`` request for every ``<li>/<a>``
    chapter link. Each request carries a ``NovelsItem`` in
    ``request.meta['item']`` with the title, a decorated subtitle, the
    1-based chapter id, and the type tag ``'novels'``.
    """
    sel = Selector(response)
    # Suffix the spider name so same-titled novels from different sites
    # don't collide on disk.
    title = sel.xpath('//h2/text()').extract()[0]
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//li/a')
    # enumerate replaces the manual counter; `chapter_id` avoids shadowing
    # the builtin `id`.
    for chapter_id, d in enumerate(dd, start=1):
        url = d.xpath('@href').extract()[0]
        # FIX: strip the href before joining, consistent with the sibling
        # spiders — some sites pad hrefs with whitespace/newlines, which
        # would otherwise corrupt the joined URL.
        url = response.urljoin(url.strip())
        subtitle = d.xpath('text()').extract()[0]
        subtitle = self.polishString(subtitle)
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page (chapters under <dd>/<a>) and yield one
    ``parse_page`` request per chapter selected by ``polishPages``.

    Within every group of three positions the first and third entries are
    swapped before indexing (1,2,3 -> 3,2,1; 4,5,6 -> 6,5,4; ...) —
    presumably the site lists chapters in a three-column layout.
    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = polishTitle(sel.xpath('//h1/text()').extract()[0], self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//dd/a')
    for page_no in polishPages(title, len(links)):
        # Swap positions 1 and 3 inside each triple of page numbers.
        remainder = page_no % 3
        if remainder == 0:
            position = page_no - 2
        elif remainder == 1:
            position = page_no + 2
        else:
            position = page_no
        link = links[position - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page, taking the title from the ``og:title``
    meta tag, and yield one ``parse_page`` request per chapter link
    (``<li>/<a>``) selected by ``polishPages``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//meta[@property="og:title"]/@content').extract()[0]
    title = polishTitle(raw_title, self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//li/a')
    for page_no in polishPages(title, len(links)):
        link = links[page_no - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse the chapter index (links under ``div[@align="left"]``) and
    yield a ``parse_page`` request for every chapter that is not already
    saved on disk (checked via ``self.isFileExist``).

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//h1/a/text()').extract()[0]
    title = self.polishString("%s-%s" % (raw_title, self.name))
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//div[@align="left"]/a')
    for chapter_id, link in enumerate(links, start=1):
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = self.polishString(link.xpath('text()').extract()[0])
        chapter_name = '\n\n********* ' + chapter_name + ' *********\n\n'
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        # Skip chapters that already exist in the temp directory.
        if not self.isFileExist(title, chapter_id):
            yield request
def parse(self, response):
    """Parse a novel's ``<dl>/<dd>`` chapter index and yield one
    ``parse_page`` request per chapter ``<a>``.

    Chapter ids are remapped by reversing each triple of positions
    (1,2,3 -> 3,2,1; 4,5,6 -> 6,5,4; ...) — presumably the site lists
    chapters three per row in reversed order. ``<dd>`` entries with no
    ``<a>`` child are skipped but still consume a position so the
    mapping stays aligned. Each request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    title = sel.xpath('//h1/text()').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//dl/dd')
    pos = 0
    for d in dd:
        pos += 1
        # BUG FIX: was ((pos-1)/3+1)*3-(pos-1)%3 — true division under
        # Python 3 made item['id'] a float (e.g. 3.0). Floor division
        # keeps it an int and is identical on Python 2 ints.
        nid = ((pos - 1) // 3 + 1) * 3 - (pos - 1) % 3
        a = d.xpath('a')
        if len(a) == 0:
            continue
        url = a.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = a.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = nid
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel page: extract the title from the page <title> (the
    text before the literal suffix "全文阅读"), then yield one
    ``parse_page`` request per chapter link in ``ul.chapterlist``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    ss = sel.xpath('//title/text()').extract()[0]
    # BUG FIX: the original pattern u'([^全文阅读]*)全文阅读' used a negated
    # character CLASS, which excludes each individual character — so any
    # title containing 全, 文, 阅 or 读 failed to match and .group(1)
    # raised AttributeError. Non-greedy '.*?' expresses the real intent:
    # "everything up to the first 全文阅读".
    pattern = re.compile(u'(.*?)全文阅读')
    title = re.match(pattern, ss).group(1)
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//ul[@class="chapterlist"]/li/a')
    # enumerate replaces the manual counter; `chapter_id` avoids shadowing
    # the builtin `id`.
    for chapter_id, d in enumerate(dd, start=1):
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url)
        subtitle = d.xpath('text()').extract()[0]
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a chapter index laid out in ``table.m10`` tables; the second
    such table holds the chapter links (``tr/td/div/a``).

    Yields a ``parse_page`` request per page selected by ``polishPages``,
    with a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = sel.xpath('//meta[@name="keywords"]/@content').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    # FIX: was `sel.response.xpath(...)` — query the selector directly,
    # consistent with every sibling spider, instead of relying on the
    # Selector retaining a `.response` attribute (same result either way
    # when it does, since response.xpath delegates to its selector).
    tables = sel.xpath('//table[@class="m10"]')
    table = tables[1]
    dd = table.xpath('tr/td/div/a')
    pages = polishPages(title, len(dd))
    for i in pages:
        d = dd[i - 1]
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = d.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = i
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page; chapter links live under
    ``div.list > ul > li > a``.

    The chapter subtitle is synthesized from the running chapter number
    (u"第%d章") rather than read from the link text. Each yielded
    ``parse_page`` request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    raw_title = sel.xpath('//h1/text()').extract()[0]
    title = self.polishString("%s-%s" % (raw_title, self.name))
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    links = sel.xpath('//div[@class="list"]/ul/li/a')
    for chapter_id, link in enumerate(links, start=1):
        url = response.urljoin(link.xpath('@href').extract()[0])
        label = self.polishString(u"第%d章" % chapter_id)
        label = '\n\n********* [%d] - %s *********\n\n' % (chapter_id, label)
        print(url)
        print(label)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = label
        item['id'] = chapter_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a novel index page whose chapter links sit in the last
    ``<ul>`` inside ``div.List2013``, and yield one ``parse_page``
    request per page selected by ``polishPages``.

    Each request carries a ``NovelsItem`` in ``request.meta['item']``.
    """
    sel = Selector(response)
    title = polishTitle(sel.xpath('//h3/a/text()').extract()[0], self.name)
    print(title)
    novel_dir = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(novel_dir):
        os.makedirs(novel_dir)
    chapter_list = sel.xpath('//div[@class="List2013"]/ul')[-1]
    links = chapter_list.xpath('li/a')
    for page_no in polishPages(title, len(links)):
        link = links[page_no - 1]
        url = response.urljoin(link.xpath('@href').extract()[0].strip())
        chapter_name = polishSubtitle(link.xpath('text()').extract()[0])
        print(url)
        print(chapter_name)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = chapter_name
        item['id'] = page_no
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Parse a paginated novel reader whose pages are ``<prefix>-<n>``.

    Extracts the title (text inside 《…》 in the <h1>) and what follows
    the last colon (presumably the author — currently unused except by
    the disabled updateJSON call), finds the highest page number from
    the last pagination link, then yields a ``parse_page`` request for
    every page 1..maxPage. Unlike the list-based spiders, the subtitle
    is left empty. Each request carries a ``NovelsItem`` in
    ``request.meta['item']``.
    """
    sel = Selector(response)
    ss = sel.xpath('//h1/text()').extract()[0]
    pattern = re.compile(u'《([^》]*)》')
    title = re.match(pattern, ss).group(1)
    title = polishTitle(title, self.name)
    print(title)
    pattern = re.compile(u'.*:([^:]*)$')
    tt = re.match(pattern, ss).group(1)
    author = "%s" % tt
    # TODO(review): author is computed but unused while updateJSON stays
    # disabled — confirm whether this persistence is still wanted.
    #self.updateJSON(response.url, title, author);
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    # This site provides no per-chapter subtitles.
    subtitle = ''
    # The last pagination link holds the highest page number: ...-<N>/
    lastUrl = sel.xpath('//div[@id="webPage"]/a/@href').extract()[-1]
    # FIX: raw string — '\d' in a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python 3); the pattern bytes are
    # unchanged, so matching behavior is identical.
    pattern = re.compile(r'[^-]*-(\d+)/')
    maxPage = int(re.match(pattern, lastUrl).group(1))
    # Everything before the first '-' in the current URL is the page prefix.
    pattern = re.compile(u'([^-]*)')
    m = re.match(pattern, response.url)
    pageUrlPrefix = "%s-" % (m.group(0))
    # `page_id` avoids shadowing the builtin `id`.
    for page_id in range(1, maxPage + 1):
        url = "%s%d" % (pageUrlPrefix, page_id)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = page_id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request