def parse(self, response):
    """Parse a collection page from www.wzmuseum.cn and yield a GginfoItem.

    Skips the page when the title, picture, or description is missing.
    """
    # Scrapy's response.status is an int, so the original comparison with
    # the string '404' never matched; compare with the integer instead.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 55
    name = response.css('body > div > div.row-middle2 > div.newcontainer > div.newstitle-box > div.newstitle::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('body > div > div.row-middle2 > div.newcontainer > div.newstxt > div:nth-child(1) > img::attr(src)').extract()
    if not pic:
        return
    base_url = "http://www.wzmuseum.cn"
    # The original copied the src character by character into a second
    # string; plain concatenation is equivalent.
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css('body > div > div.row-middle2 > div.newcontainer > div.newstxt > div:nth-child(4) *::text').extract()
    if not text:
        Items['text'] = ""
    else:
        # Strip fragments and drop ideographic spaces and NBSPs.
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.wmhg.com.cn and yield a GginfoItem.

    NOTE(review): the text extraction was commented out in the original,
    so the yielded item carries no 'text' field — confirm downstream
    pipelines tolerate the missing key.
    """
    # response.status is an int; comparing with the string '404' was a
    # dead guard.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 37
    name = response.css(
        'body > table:nth-child(3) > tr > td > table > tr > td:nth-child(2) > table > tr:nth-child(2) > td > div > table > tr > td > table > tr > td > table:nth-child(1) > tr:nth-child(2) > td > font > b::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        'body > table:nth-child(3) > tr > td > table > tr > td:nth-child(2) > table > tr:nth-child(2) > td > div > table > tr > td > table > tr > td > table:nth-child(1) > tr:nth-child(3) > td > div > p > img::attr(src)'
    ).extract()
    if not pic:
        return
    base_url = "https://www.wmhg.com.cn"
    Items['pic'] = base_url + str(pic[0]).strip()
    yield Items
def parse(self, response):
    """Parse a collection detail page (spider id 78) and yield a GginfoItem."""
    # response.status is an int; the string comparison never matched.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 78
    name = response.css('#list > div.r > div > h4::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('img::attr(src)').extract()
    # The third <img> of the page is used; the original indexed pic[2]
    # after only checking for a non-empty list, which could raise
    # IndexError on pages with fewer images.
    if len(pic) < 3:
        return
    # base_url is intentionally empty: pic[2] is presumably already an
    # absolute URL — TODO confirm against live pages.
    base_url = ""
    Items['pic'] = base_url + str(pic[2]).strip()
    text = response.css(
        '#list > div.r > div > p:nth-child(4) *::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.chinasilkmuseum.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 52
    name = response.css('body > div.about > div > div.about_body > div > div.detail_head > div.detail_h::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('body > div.about > div > div.about_body > div > div.detail_text > a > img::attr(src)').extract()
    if not pic:
        return
    base_url = "http://www.chinasilkmuseum.com"
    # The original copied pic[0] character by character but bounded the
    # loop with len(pic[-1]), which truncated or raised IndexError
    # whenever the two srcs differed in length — almost certainly a
    # copy-paste slip; use pic[0] in full.
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css('div.detail_text::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.hnzzmuseum.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 75
    name = response.css(
        'body > div.n_collection_box > ul > div.n_collection_con_box > li.n_collection_con_box_t > span::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        'body > div.n_collection_box > ul > div.p_solid > ul > li > img::attr(src)'
    ).extract()
    if not pic:
        return
    base_url = "http://www.hnzzmuseum.com"
    # Character-by-character copy of pic[0] replaced by direct use.
    Items['pic'] = base_url + str(pic[0]).strip()
    # This selector was commented out in the original while the code below
    # still referenced `text`, raising NameError on every call; restored.
    text = response.css(
        'body > div.n_collection_box > ul > div.n_collection_con_box > li.n_collection_con_box_c *::text'
    ).extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.njmuseumadmin.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 49
    name = response.css('#parametertitle::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#ban_pic1 > ul > li:nth-child(1) > img::attr(src)').extract()
    if not pic:
        return
    base_url = "http://www.njmuseumadmin.com"
    # The original copied characters 2..end of pic[0] (bounded, apparently
    # by mistake, by len(pic[-1])); a slice expresses the intent directly.
    # NOTE(review): dropping the first two characters presumably strips a
    # leading '..' from a relative path — confirm against live pages.
    Items['pic'] = base_url + str(pic[0]).strip()[2:]
    text = response.css(
        '#DB_gallery > div.basicrightcon > div.gundongtiao > p::text'
    ).extract()
    if not text:
        Items['text'] = ""
    else:
        # This site's text only needs NBSPs removed.
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.wmhg.com.cn and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 32
    name = response.css(
        'body > div.x-container > div > div.t_bannar.r > div > div::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('div.img> img::attr(src)').extract()
    if not pic:
        return
    base_url = "https://www.wmhg.com.cn"
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css(
        'body > div.x-container > div > div.section1 > div > div.slick-cont > div *::text'
    ).extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    if Items['text'] == "":
        return
    yield Items
def detailparse(self, response):
    """Parse a detail page from www.cyjng.net and yield a GginfoItem.

    Unlike most siblings, an empty description is still yielded.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 42
    name = response.css(
        '#dnn_ctr504_ArticleDetails_ctl00_lblTitle::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#dnn_ctr504_ArticleDetails_ctl00_imgArticleImage::attr(src)'
    ).extract()
    if not pic:
        return
    base_url = "http://www.cyjng.net"
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css(
        '#dnn_ctr504_ArticleDetails_ctl00_lblArticle > p::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    yield Items
def detailparse(self, response):
    """Parse a collection detail page (spider id 64) and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 64
    name = response.css(
        'body > div.innercont > div > div > div.maindetail > h1::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        'body > div.innercont > div > div > div.maindetail > div.cont > p:nth-child(1) > img::attr(src)'
    ).extract()
    if not pic:
        return
    # base_url is intentionally empty: the src is presumably already an
    # absolute URL — TODO confirm.  The original's character-by-character
    # copy of pic[0] was an identity operation.
    base_url = ""
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css(
        'body > div.innercont > div > div > div.maindetail > div.cont *::text'
    ).extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    yield Items
def parse(self, response):
    """Parse a collection page (spider id 68) and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 68
    name = response.css('body > div.n_dc > div.wccp_n > div > h3::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('body > div.n_dc > div.wccp_n > div > div.wrapper > div > div.stage > div > ul > li:nth-child(1) > img::attr(src)').extract()
    if not pic:
        return
    # base_url is intentionally empty: the src is presumably already an
    # absolute URL — TODO confirm.  The original's char-copy loop was an
    # identity operation.
    base_url = ""
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css('body > div.n_dc > div.wccp_n > div > div.dc_nr > div > p::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection detail page (spider id 32) and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 32
    name = response.css(
        '#collection-detail > div.cp-info-name::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('img::attr(src)').extract()
    if not pic:
        return
    # The last <img> on the page is the exhibit photo; the first two
    # characters of its src are dropped (presumably a leading '..' in a
    # relative path — TODO confirm).  base_url is intentionally empty.
    base_url = ""
    Items['pic'] = base_url + str(pic[-1]).strip()[2:]
    text = response.css(
        '#collection-detail > div.cp-info-description::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page from www.lvshunmuseum.org and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 29
    name = response.css(
        '#FrontProducts_detail02-0012_subm > div.content > div.pro-module > ul.basic > li::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#FrontProducts_detail02-0012_bigImg::attr(src)').extract()
    if not pic:
        return
    base_url = "http://www.lvshunmuseum.org"
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css(
        '#FrontProducts_detail02-0012_cont_1::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    yield Items
def parse(self, response):
    """Parse a treasure page from www.yzmuseum.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 47
    name = response.css(
        '#content_body > div.tresure_detail_head::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('#content_cover::attr(src)').extract()
    if not pic:
        return
    base_url = "https://www.yzmuseum.com"
    # The original copied characters 2..end of pic[0] (bounded, apparently
    # by mistake, by len(pic[-1])); a slice expresses the intent directly.
    # NOTE(review): the first two characters are presumably a leading '..'
    # in a relative path — confirm against live pages.
    Items['pic'] = base_url + str(pic[0]).strip()[2:]
    text = response.css('#content_text *::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a collection page (spider id 67) and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 67
    name = response.css('#module12 > div.formMiddle.formMiddle12 > div.formMiddleCenter.formMiddleCenter12 > div > div > h1::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('#module12 > div.formMiddle.formMiddle12 > div.formMiddleCenter.formMiddleCenter12 > div > div > div.richContent.richContent0 > p > span > img::attr(src)').extract()
    if not pic:
        return
    # The src is protocol-relative (starts with '//'), so only the scheme
    # is prepended.  The original's char-copy loop was an identity op.
    base_url = "http:"
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css('#module12 > div.formMiddle.formMiddle12 > div.formMiddleCenter.formMiddleCenter12 > div > div > div.richContent.richContent0 *::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def parse(self, response):
    """Parse a showcase page from www.lvshunmuseum.org and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 28
    name = response.css(
        '#showcasescontent > div > div.ps_text > h1::text').extract()
    if not name:
        return
    # The original stored name[0] unstripped — every sibling spider strips
    # the title; made consistent.
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#showcasescontent > div > div.pictureshow > li > img::attr(src)'
    ).extract()
    if not pic:
        return
    base_url = "http://www.lvshunmuseum.org"
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css(
        '#showcasescontent > div > div.textshow > p::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    yield Items
def parse(self, response):
    """Parse a collection page from www.wmhg.com.cn and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 35
    name = response.css('#ContentPlaceHolder1_title::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#ContentPlaceHolder1_content > p > img::attr(src)').extract()
    if not pic:
        return
    base_url = "https://www.wmhg.com.cn"
    Items['pic'] = base_url + str(pic[0]).strip()
    # The original selector lacked the leading '#' ('ContentPlaceHolder1_
    # content' is an element id, cf. the pic selector above), so it never
    # matched and text was always empty; fixed.
    text = response.css(
        '#ContentPlaceHolder1_content > p *::text').extract()
    if not text:
        Items['text'] = ""
    else:
        Items['text'] = "".join(
            str(item).strip().replace('\xa0', '') for item in text)
    yield Items
def parse(self, response):
    """Parse a collection page from www.jgsgmbwg.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 63
    name = response.css('#top > div.subBody > div.OneOfTwo > div > h1::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('#top > div.subBody > div.OneOfTwo > div > div.showimg > a > img::attr(src)').extract()
    if not pic:
        return
    base_url = "http://www.jgsgmbwg.com"
    # The original's character-by-character copy was an identity operation.
    Items['pic'] = base_url + str(pic[0]).strip()
    text = response.css('#textarea > span::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items
def detailparse(self, response):
    """Parse a detail page and yield a GginfoItem with placeholder text.

    The item id comes from the request meta set by the listing callback.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    name = response.css('#ctl00_ContentPlaceHolder1_lb_Title::text').extract()
    pic = response.css('#ctl00_ContentPlaceHolder1_lb_Content > img::attr(src)').extract()
    # The original indexed extract()[0] unconditionally, raising
    # IndexError on pages missing either field; skip those pages instead.
    if not name or not pic:
        return
    Items['name'] = name[0]
    Items['pic'] = pic[0]
    Items['text'] = "无详细内容"
    yield Items
def parse(self, response):
    """Parse a detail page (id from request meta) and yield a GginfoItem.

    The item is deliberately yielded without a picture URL.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    name = response.css('body > div.x-container > div > div.col1 > div > div.r > div.t28::text').extract()
    # The original indexed extract()[0] unconditionally; skip pages
    # missing the title instead of raising IndexError.
    if not name:
        return
    Items['name'] = name[0]
    # Concatenate every stripped text fragment of the right-hand column.
    fragments = response.css('body > div.x-container > div > div.col1 > div > div.r *::text').extract()
    Items['text'] = "".join(str(item).strip() for item in fragments)
    # (The original also assigned an unused local picurl = "www.gmc.org.cn".)
    Items['pic'] = ""
    yield Items
def parse(self, response):
    """Yield one GginfoItem per product image on a www.bmnh.org.cn list page."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    pic = response.css('body > div.content_normal > div.content_shop > div.content_shopr > div.row.shopList.col_list > div > div > a > img::attr(src)').extract()
    name = response.css('body > div.content_normal > div.content_shop > div.content_shopr > div.row.shopList.col_list > div > div > a > img::attr(alt)').extract()
    # The original mutated and re-yielded a single item instance, which
    # corrupts earlier rows whenever a consumer holds references, and
    # indexed name[i] without checking its length.  Build a fresh item per
    # row; zip truncates safely if the lists differ in length.
    for src, alt in zip(pic, name):
        Items = items.GginfoItem()
        Items['id'] = response.meta['id']
        Items['text'] = "无"
        Items['name'] = alt
        Items['pic'] = "http://www.bmnh.org.cn" + src
        yield Items
def parse(self, response):
    """Parse a collection page (Palace Museum image host) and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    Items['name'] = response.css('h3 span::text').extract()[0]
    t = response.css('div.content_edit *::text').extract()
    picurl = "https://img.dpm.org.cn/"
    # The last text fragment is deliberately dropped (the original looped
    # range(len(t) - 1)) — presumably trailing boilerplate; TODO confirm.
    # Fragments are not stripped here, only ideographic spaces removed,
    # matching the original.
    Items['text'] = "".join(str(part).replace('\u3000', '') for part in t[:-1])
    Items['pic'] = picurl + str(
        response.css('div.pic img::attr(src)').extract()[0])
    yield Items
def parse(self, response):
    """Parse a page from www.luxunmuseum.com.cn and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    Items['name'] = response.css('body > div.main > div.main_r.r > div.main_content > div.content_title::text').extract()[0]
    t = response.css('body > div.main > div.main_r.r > div.main_content > div:nth-child(3) > div:nth-child(2) *::text').extract()
    # NOTE(review): the prefix has no scheme ('www.luxunmuseum.com.cn/'),
    # matching the original — confirm downstream consumers expect that.
    picurl = "www.luxunmuseum.com.cn/"
    Items['text'] = "".join(
        str(item).strip().replace('\u3000', '') for item in t)
    Items['pic'] = picurl + str(response.css('body > div.main > div.main_r.r > div.main_content > div:nth-child(3) > div:nth-child(1) > img::attr(src)').extract()[0])
    yield Items
def parse(self, response):
    """Parse a page from www.balujun.cn and yield a GginfoItem.

    Only the title and picture are scraped; text is a fixed placeholder.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    name = response.css('#home > div > div > div > div.content > div.content-title > h3::text').extract()
    if not name:
        return
    Items['name'] = name[0]
    # The image may sit under either of two container layouts; try the
    # <div> wrapper first, then the <p> wrapper.
    s = response.css('#vsb_content > div > div > img::attr(src)').extract()
    if not s:
        s = response.css('#vsb_content > div > p > img::attr(src)').extract()
    if not s:
        return
    picurl = "http://www.balujun.cn"
    Items['text'] = "无详细介绍"
    Items['pic'] = picurl + str(s[0])
    yield Items
def parse(self, response):
    """Parse a collection page whose images lack a stable selector.

    Image srcs are pulled from the raw HTML with a regex; the second
    match is taken as the exhibit photo.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    Items['name'] = str(response.css('body > div > div.main > div > div > div.collD.clearfix > div.collD_con > div.collD_cc > div.collD_h > h3::text').extract()[0]).strip()
    t = response.css('body > div > div.main > div > div > div.collD.clearfix > div.collD_con > div.collD_cc > div.d_con *::text').extract()
    html = str(response.body_as_unicode())
    # Collect once (the original ran findall twice and printed each match
    # for debugging).
    imgs = re.compile('<img.*?src="(.*?)"').findall(html)
    # Index 1 is presumably the exhibit photo (index 0 being page chrome)
    # — TODO confirm; the original indexed it unconditionally.
    if len(imgs) < 2:
        return
    Items['text'] = "".join(str(part).strip() for part in t)
    Items['pic'] = imgs[1]
    yield Items
def parse(self, response):
    """Parse a collection page from www.shanximuseum.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    name = response.css(
        'body > div.x-container > div.collection_xx2 > div > div.x-tit.center > div.h18::text'
    ).extract()
    if not name:
        return
    # The original stored the whole extracted LIST in 'name'; every
    # sibling spider stores the first match, stripped — made consistent.
    Items['name'] = str(name[0]).strip()
    t = response.css(
        'body > div.x-container > div.collection_xx2 > div > div.padd > div.cont > div *::text'
    ).extract()
    picurl = "http://www.shanximuseum.com"
    html = str(response.body_as_unicode())
    imgs = re.compile('<img.*?src="(.*?)"').findall(html)
    # Index 3 is presumably the exhibit photo after three chrome images —
    # TODO confirm; the original indexed it unconditionally.
    if len(imgs) < 4:
        return
    Items['text'] = "".join(str(i).strip() for i in t)
    Items['pic'] = picurl + imgs[3]
    yield Items
def parse(self, response):
    """Parse a table-based page from www.pgm.org.cn and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = response.meta['id']
    s = response.css(
        'body > div > div > table > tr> td:nth-child(2)::text').extract()
    if not s:
        return
    Items['name'] = s[0]
    t = response.css(
        'body > div > div > table >tr:nth-child(3) > td:nth-child(2)::text'
    ).extract()[0]
    picurl = "http://www.pgm.org.cn"
    Items['text'] = str(t).strip()
    Items['pic'] = picurl + str(
        response.css(
            'body > div > div > table > tr:nth-child(1) > td:nth-child(3) > img::attr(src)'
        ).extract()[0])
    yield Items
def parse(self, response):
    """Parse a Baidu Baike table page and yield one GginfoItem per entry.

    Table cells alternate: a short cell (1-8 chars) starts a new entry
    (its name); longer cells accumulate into the preceding entry's
    description.  Link hrefs supply the picture pages.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    data = response.css('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table > tr > td::text').extract()
    name = []
    text = []
    first = True
    buf = ""
    # The first three cells are table headers and are skipped.
    for raw in data[3:]:
        s = str(raw).strip()
        if 0 < len(s) <= 8:
            if first:
                first = False
            else:
                # A new short cell closes the previous entry's description.
                text.append(buf)
                buf = ""
            name.append(s)
        elif len(s) > 8:
            buf += s
    picurl = response.css('body > div > div > div > div > table > tr > td > div > div > a::attr(href)').extract()
    # NOTE(review): the +3 href offset (bumped to +4 after row index 4)
    # looks empirically tuned to this page's link layout — verify against
    # the live page before changing.  (Debug prints removed.)
    exp = 3
    for i in range(-1 + min(len(text), len(name), len(picurl)) - 1):
        # Build a fresh item per row — the original mutated and re-yielded
        # one instance, which corrupts earlier rows for buffering consumers.
        Items = items.GginfoItem()
        Items['id'] = response.meta['id']
        Items['name'] = name[i + 1]
        Items['text'] = text[i + 1]
        Items['pic'] = "https://baike.baidu.com" + picurl[i + exp]
        if i == 4:
            exp = 4
        yield Items
def parse(self, response):
    """Parse a table-layout page from www.lnmuseum.com.cn and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 26
    name = response.css('body > table:nth-child(3) > tr:nth-child(2) > td > table > tr > td:nth-child(4) > table > tr:nth-child(3) > td > table > tr:nth-child(4) > td > p:nth-child(2)::text').extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip().replace('\xa0', '')
    if Items['name'] == "":
        return
    t = response.css('body > table > tr > td > table > tr > td > table > tr > td > table > tr > td *::text').extract()
    html = str(response.body_as_unicode())
    picurl = "http://www.lnmuseum.com.cn"
    if not t:
        Items['text'] = "无详细介绍"
    else:
        # NOTE(review): fragments 27..len-7 empirically bracket the body
        # text between the page chrome — confirm if the template changes.
        Items['text'] = "".join(
            str(part).strip().replace('\xa0', '') for part in t[27:len(t) - 6])
    imgs = re.compile('<img.*?src="(.*?)"').findall(html)
    # The 4th-from-last <img> src is the exhibit photo; the original
    # indexed it unconditionally.
    if len(imgs) < 4:
        return
    s1 = imgs[-4]
    # Keep everything from the first '/' onward — the site-relative path.
    # (The original scanned characters until the first '/' and copied the
    # rest; find + slice is equivalent, "" when no '/' is present.)
    idx = s1.find('/')
    s2 = s1[idx:] if idx != -1 else ""
    Items['pic'] = picurl + s2
    yield Items
def detailparse(self, response):
    """Parse a detail page (spider id 59); the picture URL is rebuilt
    relative to the directory of the current page URL.
    """
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 59
    name = response.css(
        'body > div.con.pad_t25 > div > div.xl > div.xl_top > h1::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css('img::attr(src)').extract()
    # The 8th image from the end is used; the original indexed pic[-8]
    # after only checking for a non-empty list (IndexError risk).
    if len(pic) < 8:
        return
    # Directory of the current page URL: everything before the last '/'.
    # (Matches the original's character-copy loops, including yielding ""
    # when the URL somehow contains no '/'.)
    base_url = response.url
    pos = max(base_url.rfind('/'), 0)
    # Drop the src's leading character — presumably the '.' of a
    # './relative' path; TODO confirm against live pages.
    Items['pic'] = base_url[:pos] + str(pic[-8]).strip()[1:]
    text = response.css(
        '#fontzoom > div > div:nth-child(2) > font::text').extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    yield Items
def parse(self, response):
    """Parse a collection page from www.hzmuseum.com and yield a GginfoItem."""
    # response.status is an int; the '404' string comparison was dead code.
    if response.status == 404:
        return
    Items = items.GginfoItem()
    Items['id'] = 54
    name = response.css(
        '#form1 > div.ff > div.ffrg.rg > div.ffrg2 > div > div.biaoti::text'
    ).extract()
    if not name:
        return
    Items['name'] = str(name[0]).strip()
    pic = response.css(
        '#form1 > div.ff > div.ffrg.rg > div.ffrg2 > div > p:nth-child(3) > img::attr(src)'
    ).extract()
    if not pic:
        return
    base_url = "http://www.hzmuseum.com"
    # The original copied characters 2..end of pic[0]; a slice expresses
    # the intent directly.  NOTE(review): the dropped prefix is presumably
    # a leading '..' in a relative path — confirm against live pages.
    Items['pic'] = base_url + str(pic[0]).strip()[2:]
    text = response.css(
        '#form1 > div.ff > div.ffrg.rg > div.ffrg2 > div > p:nth-child(10) > span::text'
    ).extract()
    if not text:
        Items['text'] = ""
    else:
        s = "".join(str(item).strip().replace('\u3000', '') for item in text)
        Items['text'] = s.replace('\xa0', '')
    if Items['text'] == "":
        return
    yield Items