Exemplo n.º 1
0
    def parse_step_1(self, response):
        """Insert one row for this gallery page plus one row per further page.

        The page count is read from the ``#pageNum a.a1`` text with the
        surrounding ``共`` / ``张`` characters stripped.  The current page is
        stored with the image URL found in ``#bigImg``; pages 2..N are stored
        with an empty ``img_path`` and a URL of the form ``<base>_<n>.html``.

        Responses with a status other than 200/301, and the site's
        ``waf_verify.htm`` page, are ignored.
        """
        if response.status not in (200, 301):
            return
        if response.url == 'http://www.59pic.com/waf_verify.htm':
            return

        max_num = response.css("#pageNum a.a1::text").extract_first(
            default="1").lstrip(u'共').rstrip(u'张')
        category = u'美女大全'
        # A default keeps a page without <title> from raising AttributeError
        # on None.split(); 'NAN' is the placeholder used for missing titles.
        title = response.css("title::text").extract_first(
            default='NAN').split('-')[0]
        img_path = response.css("#bigImg::attr(src)").extract_first(
            default='')

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; a quote in the title or URL breaks or alters the
        # statement.  Parameterize once _execute's signature is confirmed.
        rows = [
            "('%s', '%s', '%s', '%s', '%s', 0)" %
            (self._source, category, title, response.url, img_path)
        ]

        # Remove the literal '.html' suffix.  str.rstrip('.html') strips any
        # trailing run of the characters '.', 'h', 't', 'm', 'l' and would
        # also eat the end of the page name itself.
        base_url = response.url
        if base_url.endswith('.html'):
            base_url = base_url[:-len('.html')]

        try:
            for num in range(2, int(max_num) + 1):
                page_url = base_url + '_' + str(num) + '.html'
                rows.append("('%s', '%s', '%s', '%s', '', 0)" %
                            (self._source, category, title, page_url))
        except ValueError:
            # Non-numeric page count (or a unicode formatting error, which is
            # a ValueError subclass): keep whatever rows were built so far.
            print('解释失败')

        _execute(
            "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
            + ",".join(rows))
Exemplo n.º 2
0
    def parse_step_1(self, response):
        """Insert one row for this gallery page plus one row per further page.

        The page count is the text of ``#page > a:nth-last-child(3)``
        (default ``"1"``).  The current page is stored with the image URL
        found under ``#content > a > img``; pages 2..N are stored with an
        empty ``img_path`` and the URL ``<response.url>/<n>``.

        Responses with a status other than 200/301 are ignored.
        """
        if response.status not in (200, 301):
            return

        max_num = response.css(
            "#page > a:nth-last-child(3)::text").extract_first(default="1")
        category = u'妹子图'
        title = response.css(".main > .article > h2::text").extract_first(
            default='NAN')
        img_path = response.css(
            "#content > a > img::attr(src)").extract_first(default='')

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; a quote in the title or URL breaks or alters the
        # statement.  Parameterize once _execute's signature is confirmed.
        rows = [
            "('%s', '%s', '%s', '%s', '%s', 0)" %
            (self._source, category, title, response.url, img_path)
        ]

        try:
            for num in range(2, int(max_num) + 1):
                page_url = response.url + '/' + str(num)
                rows.append("('%s', '%s', '%s', '%s', '', 0)" %
                            (self._source, category, title, page_url))
        except ValueError:
            # Non-numeric page count (or a unicode formatting error, which is
            # a ValueError subclass): keep the rows built so far.  Anything
            # else now propagates instead of being silently swallowed.
            print('解释失败')

        _execute(
            "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
            + ",".join(rows))
Exemplo n.º 3
0
    def parse_item(self, response):
        """Insert a row describing the image shown on this page.

        URLs whose third-from-last path segment is ``fuli`` and that end in
        ``/`` are skipped.  The category and title come from the
        ``#xiaohua_list`` markup; a trailing ``(...)`` part of the title is
        removed.  Nothing is stored when neither a title nor an image URL
        was found.
        """
        params = response.url.split('/')
        if params[-3] == 'fuli' and params[-1] == '':
            return

        category = response.css(
            '#xiaohua_list .left .bt .l > a:nth-child(3)::text'
        ).extract_first(default='NAN')
        title = response.css(
            "#xiaohua_list .left .cont > h1::text"
        ).extract_first(default='NAN')

        if title != 'NAN':
            paren_at = title.rfind('(')
            # Only trim when a '(' actually exists.  rfind() returns -1 when
            # it does not, and the old unconditional slice then cut the last
            # two characters off every title without parentheses.
            if paren_at > 0:
                # Also drops the character right before '(' — presumably a
                # separating space; TODO confirm against the site markup.
                title = title[:paren_at - 1]

        img_path = response.css(
            "#xiaohua_list .left .content img::attr(src)"
        ).extract_first(default='')

        if title == 'NAN' and img_path == '':
            return

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; parameterize once _execute's signature is confirmed.
        sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
              (self._source, category, title, response.url, img_path)
        _execute(sql)
Exemplo n.º 4
0
    def parse_step_2(self, response):
        """Store the ``#bigImg`` image URL on the row whose page_url is this page.

        Does nothing for statuses other than 200/301 or when the page has no
        ``#bigImg`` source.
        """
        if response.status not in (200, 301):
            return

        image_src = response.css("#bigImg::attr(src)").extract_first(
            default='')
        if image_src == "":
            return

        statement = (
            "update mzitu_all set img_path = '%s' where page_url = '%s'"
            % (image_src, response.url))
        _execute(statement)
Exemplo n.º 5
0
    def parse_item(self, response):
        """Insert a ``mzitu_youzi`` row for the image found under ``#picBody``.

        The category falls back to ``美女图片`` when the breadcrumb link is
        missing.  The page is skipped when it yields neither an ``alt`` title
        nor an image URL.
        """
        category = response.css(
            '.warp .articleV4Info > a::text').extract_first(default=u'美女图片')
        caption = response.css(
            "#picBody  img::attr(alt)").extract_first(default='NAN')
        image_src = response.css(
            "#picBody img::attr(src)").extract_first(default='')

        # Store the row as soon as at least one of the two fields was found.
        if caption != 'NAN' or image_src != '':
            row = (self._source, category, caption, response.url, image_src)
            _execute(
                "insert into mzitu_youzi(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)"
                % row)
Exemplo n.º 6
0
    def parse_item(self, response):
        """Insert a ``mzitu_all`` row for the image at ``#picBody > p > a > img``.

        URLs containing ``list_11`` are ignored, as are pages that yield
        neither an ``alt`` title nor an image URL.
        """
        if 'list_11' in response.url:
            return

        category = u'美女图片'
        image = "#picBody > p > a > img"
        caption = response.css(image + "::attr(alt)").extract_first(
            default='NAN')
        image_src = response.css(image + "::attr(src)").extract_first(
            default='')

        nothing_found = caption == 'NAN' and image_src == ''
        if nothing_found:
            return

        insert = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)"
        _execute(insert % (self._source, category, caption, response.url,
                           image_src))
Exemplo n.º 7
0
    def parse_item(self, response):
        """Insert a ``mzitu_all`` row for the image in ``.indexbox .r .tcontent``.

        URLs containing ``index_`` are ignored, as are pages that yield
        neither an ``alt`` title nor an image URL.
        """
        if 'index_' in response.url:
            return

        category = u'妹子图'
        caption = response.css(
            ".indexbox .r .tcontent img::attr(alt)").extract_first(
                default='NAN')
        image_src = response.css(
            ".indexbox .r .tcontent img::attr(src)").extract_first(
                default='')

        if (caption, image_src) == ('NAN', ''):
            # Neither field was present on the page: nothing worth storing.
            return

        fields = (self._source, category, caption, response.url, image_src)
        statement = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % fields
        _execute(statement)
Exemplo n.º 8
0
    def parse_item(self, response):
        """Insert a ``mzitu_all`` row for the image found under ``#big-pic``.

        URLs containing ``list_9`` are ignored.  The category is the third
        breadcrumb link under ``.photo .topmbx`` (default ``美女图片``).  Pages
        that yield neither an ``alt`` title nor an image URL are skipped.
        """
        is_listing = response.url.find('list_9') >= 0
        if is_listing:
            return

        breadcrumb = response.css('.photo .topmbx > a:nth-child(3)::text')
        category = breadcrumb.extract_first(default=u'美女图片')

        alt_nodes = response.css("#big-pic img::attr(alt)")
        caption = alt_nodes.extract_first(default='NAN')

        src_nodes = response.css("#big-pic img::attr(src)")
        image_src = src_nodes.extract_first(default='')

        if not (caption == 'NAN' and image_src == ''):
            _execute(
                "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)"
                % (self._source, category, caption, response.url, image_src))
Exemplo n.º 9
0
    def parse_item(self, response):
        """Insert a ``mzitu_all`` row for the image found under ``#picBody``.

        Only URLs whose text after the last ``.`` is ``html`` and that do not
        contain ``list_`` are processed.  The category falls back to ``妹子图``.
        Pages that yield neither an ``alt`` title nor an image URL are skipped.
        """
        page_url = response.url
        if 'list_' in page_url:
            return
        extension = page_url.rsplit('.', 1)[-1]
        if extension != 'html':
            return

        category = response.css(
            '.warp.mar .warp.oh .articleV4Info > a::text'
        ).extract_first(default=u'妹子图')
        caption = response.css(
            "#picBody img::attr(alt)").extract_first(default='NAN')
        image_src = response.css(
            "#picBody img::attr(src)").extract_first(default='')

        if caption == 'NAN' and image_src == '':
            return

        template = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)"
        _execute(template % (self._source, category, caption, response.url,
                             image_src))
Exemplo n.º 10
0
    def parse_item(self, response):
        """Insert a ``mzitu_mm131`` row for the image in ``.content .content-pic``.

        The category is the second link under ``.place``.  The title is the
        image ``alt`` text with everything from the last ``(`` onward removed.
        Pages that yield neither a title nor an image URL are skipped.
        """
        # The fallback label was stored as mojibake ('MM131ÃÀŮͼƬ', i.e. the
        # GBK bytes of the text below decoded with the wrong codec); restored
        # to the intended characters.
        category = response.css('.place > a:nth-child(2)::text').extract_first(
            default=u'MM131美女图片')
        title = response.css(".content .content-pic img::attr(alt)"
                             ).extract_first(default='NAN')
        img_path = response.css(
            ".content .content-pic img::attr(src)").extract_first(default='')

        try:
            title = title[0:title.rindex(u'(')]
        except ValueError:
            # No '(' in the title: keep it unchanged.
            pass

        if title == 'NAN' and img_path == '':
            return

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; parameterize once _execute's signature is confirmed.
        sql = "insert into mzitu_mm131(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
              (self._source, category, title, response.url, img_path)
        _execute(sql)
Exemplo n.º 11
0
    def parse_item(self, response):
        """Insert one ``mzitu_all`` row per image of an ``archives`` article.

        Only URLs whose second-to-last ``/``-separated segment is
        ``archives`` are processed.  Every ``img`` inside the article content
        becomes its own row sharing the page's category, title and URL.
        Pages without any image are skipped.
        """
        params = response.url.split('/')
        # The length check keeps a slash-less URL from raising IndexError.
        if len(params) < 2 or params[-2] != 'archives':
            return

        category = response.css('section > .content-wrap span.item-3 > a::text'
                                ).extract_first(default='NAN')
        title = response.css(
            "section > .content-wrap header.article-header > h1.article-title::text"
        ).extract_first(default='NAN')

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; parameterize once _execute's signature is confirmed.
        rows = [
            "('%s', '%s', '%s', '%s', '%s', 0)" %
            (self._source, category, title, response.url, img_path)
            for img_path in response.css(
                'section > .content-wrap article.article-content img::attr(src)'
            ).extract()
        ]

        # With no images the statement would end at "values", which is not
        # valid SQL, so there is nothing to execute.
        if not rows:
            return

        _execute(
            "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
            + ",".join(rows))
Exemplo n.º 12
0
    def parse_item(self, response):
        """Insert a ``mzitu_aitaotu`` row for the image found under ``#big-pic``.

        The category is read from ``.photo .tsmaincont-desc`` (default
        ``美女图片``).  Any ``第<digits>张`` fragment is removed from the title.
        Pages that yield neither a title nor an image URL are skipped.
        """
        category = response.css(
            '.photo .tsmaincont-desc > span:nth-child(3) > a::text'
        ).extract_first(default=u'美女图片')
        title = response.css("#big-pic  img::attr(alt)").extract_first(
            default='NAN')
        img_path = response.css("#big-pic img::attr(src)").extract_first(
            default='')

        if title == 'NAN' and img_path == '':
            return

        # The pattern must be a unicode string: under Python 2 a byte-string
        # pattern containing non-ASCII characters is compared byte-by-byte
        # against the unicode title and never matches.  The u'' form is valid
        # on Python 3 as well.
        title = re.sub(u'第\\d+张', u'', title)

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; parameterize once _execute's signature is confirmed.
        sql = "insert into mzitu_aitaotu(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
              (self._source, category, title, response.url, img_path)
        _execute(sql)
Exemplo n.º 13
0
    def parse_item(self, response):
        """Insert a ``mzitu_all`` row for the image at ``#content > img``.

        Only URLs containing ``list`` are processed.  The title is the page
        ``<title>``; when it ends with ``)``, everything from the last ``(``
        onward is removed.  Pages that yield neither a title nor an image URL
        are skipped.
        """
        if 'list' not in response.url:
            return

        category = u'搂妹子'
        title = response.css("title::text").extract_first(default='NAN')

        # Explicit checks replace the former bare ``except: pass``, which hid
        # the empty-title and missing-'(' cases together with any unrelated
        # error.
        if title.endswith(')'):
            paren_at = title.rfind('(')
            if paren_at != -1:
                title = title[:paren_at]

        img_path = response.css("#content > img::attr(src)").extract_first(
            default='')

        if title == 'NAN' and img_path == '':
            return

        # NOTE(review): scraped values are interpolated into the SQL text
        # unescaped; parameterize once _execute's signature is confirmed.
        sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
              (self._source, category, title, response.url, img_path)
        _execute(sql)
Exemplo n.º 14
0
    def item_completed(self, results, item, info):
        """Record per-image outcomes in ``mzitu_aitaotu_n`` and return the item.

        ``results[i][0]`` is read as the success flag for
        ``item['image_paths'][i]``.  Paths whose flag is truthy get
        ``state = 1``, the others ``state = 9``.  When ``item['source']`` is
        ``'mm29'`` the suffix ``/800.jpg`` is appended to each path before it
        is matched against ``img_path``.
        """
        succeeded, failed = [], []
        for index, outcome in enumerate(results):
            bucket = succeeded if outcome[0] else failed
            bucket.append(item['image_paths'][index])

        needs_suffix = item['source'] == 'mm29'
        updates = (
            ("update mzitu_aitaotu_n set state = 1 where img_path in ('%s')",
             succeeded),
            ("update mzitu_aitaotu_n set state = 9 where img_path in ('%s')",
             failed),
        )
        for statement, paths in updates:
            if not paths:
                continue
            if needs_suffix:
                paths = [path + '/800.jpg' for path in paths]
            _execute(statement % "','".join(paths))

        return item