def parse_step_1(self, response):
    """Parse a 59pic gallery page and queue every picture page of the set.

    Inserts one mzitu_all row for page 1 (with its image src) and one row
    per derived paging URL (img_path left empty, backfilled by parse_step_2).
    """
    if response.status != 200 and response.status != 301:
        return
    # Anti-crawler interstitial page -- nothing useful to parse.
    if response.url == 'http://www.59pic.com/waf_verify.htm':
        return
    # Page counter is rendered as u'共N张'; strip the surrounding characters.
    max_num = response.css("#pageNum a.a1::text").extract_first(
        default="1").lstrip(u'共').rstrip(u'张')
    category = u'美女大全'  # renamed from 'type' to avoid shadowing the builtin
    # Guard against a missing <title>: extract_first() without a default
    # returns None, and None.split() used to crash here.
    title = (response.css("title::text").extract_first() or '').split('-')[0]
    img_path = response.css("#bigImg::attr(src)").extract_first(default='')

    def esc(text):
        # Scraped text may contain single quotes; double them so the
        # string-built SQL stays well-formed (no parameterized API here).
        return text.replace("'", "''")

    bulk_sqls = [
        "('%s', '%s', '%s', '%s', '%s', 0)" %
        (self._source, esc(category), esc(title), response.url, esc(img_path))
    ]
    # BUG FIX: rstrip('.html') strips any trailing run of the characters
    # '.', 'h', 't', 'm', 'l' (e.g. '.../123l.html' would also lose its
    # final 'l'); remove the literal suffix instead.
    if response.url.endswith('.html'):
        base_url = response.url[:-len('.html')]
    else:
        base_url = response.url
    try:
        for num in range(2, int(max_num) + 1):
            page_url = base_url + '_' + str(num) + '.html'
            bulk_sqls.append("('%s', '%s', '%s', '%s', '', 0)" %
                             (self._source, esc(category), esc(title), page_url))
    except ValueError:
        # Non-numeric page count: record only page 1.
        print('解释失败')
    _execute(
        "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
        + ",".join(bulk_sqls))
def parse_step_1(self, response):
    """Parse a mzitu album page and queue every picture page of the album.

    Page 1 is recorded with its image src; pages 2..N get an empty img_path
    to be backfilled later.
    """
    if response.status != 200 and response.status != 301:
        return
    max_num = response.css(
        "#page > a:nth-last-child(3)::text").extract_first(default="1")
    category = u'妹子图'  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(".main > .article > h2::text").extract_first(
        default='NAN')
    img_path = response.css(
        "#content > a > img::attr(src)").extract_first(default='')

    def esc(text):
        # Scraped text may contain single quotes; double them so the
        # string-built SQL stays well-formed (no parameterized API here).
        return text.replace("'", "''")

    bulk_sqls = [
        "('%s', '%s', '%s', '%s', '%s', 0)" %
        (self._source, esc(category), esc(title), response.url, esc(img_path))
    ]
    try:
        for num in range(2, int(max_num) + 1):
            page_url = response.url + '/' + str(num)
            bulk_sqls.append("('%s', '%s', '%s', '%s', '', 0)" %
                             (self._source, esc(category), esc(title), page_url))
    except ValueError:
        # max_num was not numeric; record only page 1.
        print('解释失败')
    _execute(
        "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
        + ",".join(bulk_sqls))
def parse_item(self, response):
    """Parse a xiaohua picture page and insert one mzitu_all row."""
    params = response.url.split('/')
    # '/fuli/.../' (trailing slash) is a listing page with no picture.
    if params[-3] == 'fuli' and params[-1] == '':
        return
    category = response.css(
        '#xiaohua_list .left .bt .l > a:nth-child(3)::text').extract_first(
            default='NAN')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(
        "#xiaohua_list .left .cont > h1::text").extract_first(default='NAN')
    if title != 'NAN':
        # BUG FIX: rfind returns -1 when '(' is absent, and the old slice
        # title[0:-2] then silently chopped the last two characters. The
        # former try/except was dead weight -- slicing never raises.
        paren = title.rfind('(')
        if paren != -1:
            title = title[0:paren - 1]
    img_path = response.css(
        "#xiaohua_list .left .content img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_step_2(self, response):
    """Backfill img_path for a picture page recorded earlier with an empty path."""
    if response.status != 200 and response.status != 301:
        return
    img_path = response.css("#bigImg::attr(src)").extract_first(default='')
    if "" != img_path:
        # Single quotes in the scraped src are doubled so the string-built
        # SQL stays well-formed (ideally _execute would take parameters).
        _execute(
            "update mzitu_all set img_path = '%s' where page_url = '%s'" %
            (img_path.replace("'", "''"), response.url))
def parse_item(self, response):
    """Parse a picture page and insert one mzitu_youzi row."""
    category = response.css('.warp .articleV4Info > a::text').extract_first(
        default=u'美女图片')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css("#picBody img::attr(alt)").extract_first(default='NAN')
    img_path = response.css("#picBody img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_youzi(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse a picture detail page into mzitu_all (skips 'list_11' listing pages)."""
    if response.url.find('list_11') != -1:
        return
    category = u'美女图片'  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(
        "#picBody > p > a > img::attr(alt)").extract_first(default='NAN')
    img_path = response.css(
        "#picBody > p > a > img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category, title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse a picture detail page into mzitu_all (skips 'index_' listing pages)."""
    if response.url.find('index_') != -1:
        return
    category = u'妹子图'  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(".indexbox .r .tcontent img::attr(alt)"
                         ).extract_first(default='NAN')
    img_path = response.css(
        ".indexbox .r .tcontent img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category, title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse a picture detail page into mzitu_all (skips 'list_9' listing pages)."""
    if response.url.find('list_9') != -1:
        return
    category = response.css('.photo .topmbx > a:nth-child(3)::text'
                            ).extract_first(default=u'美女图片')
    title = response.css("#big-pic img::attr(alt)").extract_first(default='NAN')
    img_path = response.css("#big-pic img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse a picture detail page into mzitu_all (only '*.html' detail URLs)."""
    if response.url.find('list_') != -1:
        return
    # Only detail pages end in '.html'.
    if response.url.split('.')[-1] != 'html':
        return
    category = response.css(
        '.warp.mar .warp.oh .articleV4Info > a::text').extract_first(
            default=u'妹子图')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css("#picBody img::attr(alt)").extract_first(default='NAN')
    img_path = response.css("#picBody img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse an mm131 picture page and insert one mzitu_mm131 row."""
    # BUG FIX (encoding): the old default 'MM131ÃÀŮͼƬ' was the GBK bytes of
    # the intended text mis-decoded as Latin-1; restore the readable form
    # used by the sibling parsers. TODO confirm against the live site label.
    category = response.css('.place > a:nth-child(2)::text').extract_first(
        default=u'MM131美女图片')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(".content .content-pic img::attr(alt)"
                         ).extract_first(default='NAN')
    img_path = response.css(
        ".content .content-pic img::attr(src)").extract_first(default='')
    # Trim the trailing fullwidth '(N)' page marker when present.
    try:
        title = title[0:title.rindex(u'（')]
    except ValueError:
        # No u'（' in the title -- keep it unchanged.
        pass
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_mm131(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse an '/archives/<id>' post and insert one mzitu_all row per embedded image."""
    params = response.url.split('/')
    if params[-2] != 'archives':
        return
    category = response.css(
        'section > .content-wrap span.item-3 > a::text').extract_first(
            default='NAN')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css(
        "section > .content-wrap header.article-header > h1.article-title::text"
    ).extract_first(default='NAN')

    def esc(text):
        # Scraped text may contain single quotes; double them so the
        # string-built SQL stays well-formed (no parameterized API here).
        return text.replace("'", "''")

    bulk_sqls = [
        "('%s', '%s', '%s', '%s', '%s', 0)" %
        (self._source, esc(category), esc(title), response.url, esc(img_path))
        for img_path in response.css(
            'section > .content-wrap article.article-content img::attr(src)'
        ).extract()
    ]
    # BUG FIX: a post with no images used to emit 'insert ... values' with
    # an empty value list, which is malformed SQL.
    if not bulk_sqls:
        return
    _execute(
        "insert into mzitu_all(source, type, title, page_url, img_path, state) values"
        + ",".join(bulk_sqls))
def parse_item(self, response):
    """Parse an aitaotu picture page and insert one mzitu_aitaotu row."""
    category = response.css(
        '.photo .tsmaincont-desc > span:nth-child(3) > a::text'
    ).extract_first(default=u'美女图片')  # renamed from 'type' to avoid shadowing the builtin
    title = response.css("#big-pic img::attr(alt)").extract_first(default='NAN')
    img_path = response.css("#big-pic img::attr(src)").extract_first(default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Drop the u'第N张' ("picture N of the set") marker from the title.
    # re.sub cannot raise here, so the old bare try/except was dead code.
    title = re.sub(r'第\d+张', '', title)
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_aitaotu(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category.replace("'", "''"), title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def parse_item(self, response):
    """Parse a picture page (URLs containing 'list') and insert one mzitu_all row."""
    if response.url.find('list') == -1:
        return
    category = u'搂妹子'  # renamed from 'type' to avoid shadowing the builtin
    title = response.css("title::text").extract_first(default='NAN')
    # Titles end in a '(N)' page marker; trim it when present.
    try:
        if title[-1] == ')':
            title = title[0:title.rindex('(')]
    except (IndexError, ValueError):
        # Empty title, or ')' without a matching '(' -- keep title as-is.
        pass
    img_path = response.css("#content > img::attr(src)").extract_first(
        default='')
    # Nothing usable on the page.
    if 'NAN' == title and img_path == '':
        return
    # Single quotes in scraped text are doubled to keep the SQL well-formed.
    sql = "insert into mzitu_all(source, type, title, page_url, img_path, state) values('%s', '%s', '%s', '%s', '%s', 0)" % \
        (self._source, category, title.replace("'", "''"),
         response.url, img_path.replace("'", "''"))
    _execute(sql)
def item_completed(self, results, item, info):
    """Scrapy images-pipeline hook: mark each image row done (state=1) or failed (state=9).

    `results` is a list of (ok, info) pairs aligned index-for-index with
    item['image_paths']. Rows from the 'mm29' source were stored in the DB
    with a trailing '/800.jpg' segment, so that suffix is re-appended before
    matching. Returns the item unchanged so the pipeline continues.
    """
    ok_paths = []
    err_paths = []
    # Pair each download result with its path instead of indexing by range().
    for (ok, _), path in zip(results, item['image_paths']):
        (ok_paths if ok else err_paths).append(path)

    if item['source'] == 'mm29':
        # DB rows for this source carry the extra '/800.jpg' suffix.
        ok_paths = [path + '/800.jpg' for path in ok_paths]
        err_paths = [path + '/800.jpg' for path in err_paths]

    if ok_paths:
        _execute(
            "update mzitu_aitaotu_n set state = 1 where img_path in ('%s')" %
            ("','".join(ok_paths)))
    if err_paths:
        _execute(
            "update mzitu_aitaotu_n set state = 9 where img_path in ('%s')" %
            ("','".join(err_paths)))
    return item