def parse(self, response):
    # scrapy crawl collection145
    """Wenzhou Museum list pages: print each entry's name and image link.

    Pagination is two-phase: first `self.url % page_num` for pages <= 6,
    then the six extra category index URLs walked via `self.cnt`.
    """
    url1 = 'http://www.wzmuseum.cn/Col/Col29/Index.aspx'
    url2 = 'http://www.wzmuseum.cn/Col/Col30/Index.aspx'
    url3 = 'http://www.wzmuseum.cn/Col/Col31/Index.aspx'
    url4 = 'http://www.wzmuseum.cn/Col/Col32/Index.aspx'
    url5 = 'http://www.wzmuseum.cn/Col/Col33/Index.aspx'
    url6 = 'http://www.wzmuseum.cn/Col/Col34/Index.aspx'
    url7 = 'http://www.wzmuseum.cn/Col/Col29/Index_2.aspx'
    # Index 0 is a placeholder so the 1-based self.cnt indexes directly.
    category_urls = ('1', url1, url2, url3, url4, url5, url6, url7)
    for li in response.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li'):
        coll_name = li.xpath('.//span/text()').extract_first()
        coll_img = li.xpath('./a/@href').extract_first()
        print(coll_name)
        print(coll_img)
    if self.page_num <= 6:
        new_url = self.url % self.page_num
        print(new_url)
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
    elif self.cnt <= 7:
        new_url = category_urls[self.cnt]
        print(new_url)
        self.cnt += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """19371213.com.cn list page: queue a detail request per item with
    museumID and image pre-filled, then paginate up to page 17."""
    for li in response.xpath('/html/body/div'):
        item = collectionItem()
        # src/href come back as "./xxx" or "../xxx"; removing the first
        # two dots turns them into a path appendable to the site root.
        # NOTE(review): this also eats a dot from e.g. "./a.b/x" — assumed
        # the site never emits such paths; verify.
        img_new = li.xpath('./section/div[1]/div/a/img/@src').extract_first()
        img_new = img_new.replace(".", '', 2)
        img = "http://www.19371213.com.cn/collection" + img_new
        print(img)
        url_new = li.xpath('./section/div[1]/div/a/@href').extract_first()
        url_new = url_new.replace(".", '', 2)
        detail_url = "http://www.19371213.com.cn/collection" + url_new
        item['museumID'] = 11
        item['collectionImage'] = img
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 17:
        new_url = self.url % self.page_num
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    # gthyjng.com list pages. self.cnt counts pages visited and switches
    # self.urll (the base used to absolutize relative links) as crawling
    # moves into each new category section.
    item = collectionItem()
    self.cnt += 1
    if (self.cnt == 5):
        self.urll = 'http://www.gthyjng.com/gcww/wwjs/krzzsq/'
    if (self.cnt == 6):
        self.urll = 'http://www.gthyjng.com/gcww/wwjs/jfzzsq/'
    if (self.cnt == 7):
        self.urll = 'http://www.gthyjng.com/gcww/wwjs/gjdww/'
    # Hard-coded crawl order; entries 0-1 are placeholders so index
    # self.cnt + 1 lines up with the page sequence below.
    urlll = ('1', '1',
             'http://www.gthyjng.com/gcww/wwjs/tdgmsq/index_2.htm',
             'http://www.gthyjng.com/gcww/wwjs/tdgmsq',
             'http://www.gthyjng.com/gcww/wwjs/tdgmsq/index_3.htm',
             'http://www.gthyjng.com/gcww/wwjs/krzzsq/',
             'http://www.gthyjng.com/gcww/wwjs/jfzzsq/',
             'http://www.gthyjng.com/gcww/wwjs/jfzzsq/index_1.htm',
             'http://www.gthyjng.com/gcww/wwjs/gjdww/')
    x = response.xpath('/html/body/div[4]/div/div[2]/div[2]/ul/li')
    for li in x:
        # Links are relative like "./img.jpg": drop the leading "." and
        # prepend the current section base URL.
        l1 = li.xpath('.//img/@src').extract_first()
        l1 = l1[1:len(l1)]
        coll_img = self.urll + l1
        print(coll_img)
        l1 = li.xpath('./a/@href').extract_first()
        l1 = l1[1:len(l1)]
        detail_url = self.urll + l1
        print(detail_url)
        print(self.cnt)
        # NOTE(review): the same `item` instance is shared by every
        # request yielded here — confirm parse_detail copies it.
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    if (self.cnt <= 7):
        new_url = urlll[self.cnt + 1]
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Yantai Museum list page: print each entry's name and image,
    request its detail page, then advance pagination while page_num <= 8."""
    item = collectionItem()
    entries = response.xpath('/html/body/div[2]/div[3]/div[2]/div[2]/ul/li')
    for entry in entries:
        name = entry.xpath('./div[1]/span/text()').extract_first()
        print(name)
        image = 'http:' + entry.xpath(
            './div[2]/div[1]/a/img/@src').extract_first()
        print(image)
        link = 'http://www.ytmuseum.com' + entry.xpath(
            './div[2]/div[1]/a/@href').extract_first()
        yield scrapy.Request(link,
                             callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 8:
        next_page = self.url % self.page_num
        self.page_num += 1
        yield scrapy.Request(next_page, callback=self.parse)
def parse2(self, response):
    # 81-china.com list page: only items whose cleaned description text
    # is longer than 100 chars get a detail-page request.
    item = collectionItem()
    a = (18, 1, 5, 1)  # NOTE(review): unused here — page counts? confirm.
    div_list = response.xpath(
        '/html/body/div[1]/div[5]/div[3]/div[2]/ul/li')
    for li in div_list:
        coll_name = li.xpath('./div[2]/h3/a/text()').extract_first()
        #print(coll_name)
        x = li.xpath('./div[2]/p//text()').extract()
        # `switch` is a project helper; presumably joins/cleans the
        # extracted text fragments — confirm against its definition.
        x = switch(x)
        #x=ge(x)
        #print(x)
        coll_desc = x
        if (len(x) > 100):
            detail_url = 'http://www.81-china.com' + li.xpath(
                './div[2]/h3/a/@href').extract_first()
            print(detail_url)
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
        coll_img = 'http://www.81-china.com' + li.xpath(
            './/img/@src').extract_first()
        print(coll_name)
        #print(coll_desc)
        print(coll_img)
def parse(self, response):
    """Zhejiang Museum collection list: emit one detail request per item
    (name / museumID / image pre-filled) and paginate up to page 291."""
    coll_list = response.xpath(
        '//*[@id="app"]/div/div/div/div/main/ul/li[@class="col-list-i"]')
    for li in coll_list:
        item = collectionItem()
        coll_name = li.xpath('./a/h3/text()').extract_first()
        print(coll_name)
        detail_url = 'http://www.zhejiangmuseum.com' + li.xpath(
            './a/@href').extract_first()
        img = li.xpath('./a/figure/img/@src').extract_first()
        # Absolutize site-relative image paths; startswith also copes
        # with an empty src (the original img[0] raised IndexError).
        if img.startswith('/'):
            img = 'http://www.zhejiangmuseum.com' + img
        print(img)
        self.deep_urls.append(detail_url)
        item['collectionName'] = coll_name
        item['museumID'] = 5
        item['collectionImage'] = img
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 291:
        new_url = self.url % self.page_num
        self.page_num += 1
        self.new_urls.append(new_url)
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    # scrapy crawl collection148
    """China Silk Museum: three paginated category endpoints.

    a[cnt - 1] holds the page count of the current category; once it is
    exhausted, self.url switches to the next entry of category_urls and
    page_num restarts.
    """
    item = collectionItem()
    a = (126, 35, 5, 3)
    url1 = 'https://www.chinasilkmuseum.com/zgxd/list_22.aspx?page=%d'
    url2 = 'https://www.chinasilkmuseum.com/xf/list_23.aspx?page=%d'
    url3 = 'https://www.chinasilkmuseum.com/mzx/list_24.aspx?page=%d'
    # Index 0 is a placeholder so the 1-based self.cnt indexes directly.
    category_urls = ('1', url1, url2, url3)
    for li in response.xpath('/html/body/div[1]/div/div[8]/div/ul/li'):
        coll_name = li.xpath('./p/a/text()').extract_first().strip()
        coll_img = ('https://www.chinasilkmuseum.com'
                    + li.xpath('./a/img/@src').extract_first())
        print(coll_name)
        print(coll_img)
        detail_url = ('https://www.chinasilkmuseum.com'
                      + li.xpath('./a/@href').extract_first())
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= a[self.cnt - 1]:
        new_url = self.url % self.page_num
        print(new_url)
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
    elif self.cnt <= 3:
        self.url = category_urls[self.cnt]
        self.cnt += 1
        new_url = self.url % 1
        print(new_url)
        self.page_num = 2
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """chnmus.net list page: queue detail requests with name / museumID /
    image pre-filled; paginate up to page 10."""
    coll_list = response.xpath('//*[@id="articleListTable"]/ul/li')
    for li in coll_list:
        item = collectionItem()
        coll_name = li.xpath('./a/h5/text()').extract_first()
        print(coll_name)
        detail_url = 'http://www.chnmus.net' + li.xpath(
            './a/@href').extract_first()
        coll_img = 'http://www.chnmus.net' + li.xpath(
            './a/img/@src').extract_first()
        item['collectionName'] = coll_name
        item['museumID'] = 9
        item['collectionImage'] = coll_img
        print(coll_img)
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 10:
        new_url = self.url % self.page_num
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    # Table-layout collection page: iterate the <td> cells of the first
    # row; each cell holds a name, a thumbnail and a detail link.
    item = collectionItem()
    coll_list = response.xpath(
        '/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td'
    )
    # First cell's name, printed once before the per-cell loop repeats it.
    coll_name = response.xpath(
        '/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td[1]/span/a/span/text()'
    ).extract_first()
    print(coll_name)
    for div in coll_list:
        coll_name = div.xpath('./span/a/span/text()').extract_first()
        print(coll_name)
        coll_img = div.xpath(
            './table/tbody/tr/td/a/img/@src').extract_first()
        print(coll_img)
        detail_url = div.xpath(
            './table/tbody/tr/td/a/@href').extract_first()
        # NOTE(review): the same `item` instance is shared by every
        # request yielded here — confirm parse_detail copies it.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 4:
        new_url = (self.url % self.page_num)
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Extract and print the first collection name from the deeply nested
    table layout; the scattered text nodes are joined into one string."""
    coll_name = response.xpath(
        '/html/body/table[3]/tbody/tr/td[3]/table/tbody/tr/td/table/tbody/tr[3]/td/ul/li[1]/a[1]/p//text()'
    ).extract()
    coll_name = ''.join(coll_name)
    print(coll_name)
def parse(self, response):
    """JSON API: data.recordsList entries carry name / introduce /
    picUrl; print each triple."""
    coll_list = json.loads(response.text)["data"]["recordsList"]
    for i in coll_list:
        collectionName = i["name"]
        collectionDescription = i["introduce"]
        collectionImageUrl = i["picUrl"]
        print((collectionName, collectionDescription, collectionImageUrl))
def parse_content(self, response):
    """Detail page: absolutize the image URL via response.urljoin and
    print (name, image, whitespace-collapsed description)."""
    collectionImageUrl = response.urljoin(response.xpath(
        "//div[@class='collectdetail clearfix ']//img/@src").get())
    collectionName = response.xpath("//h1/text()").get()
    # join + split + join collapses every whitespace run (incl. \xa0)
    collectionDescription = "".join("".join(response.xpath(
        "//div[@class='cont']/p/text()").getall()).split())
    print((collectionName, collectionImageUrl, collectionDescription))
def parse(self, response):
    """JSON API: body.list entries carry title / description / litPic;
    print each triple."""
    coll_list = json.loads(response.text)["body"]["list"]
    for i in coll_list:
        collectionName = i["title"]
        collectionDescription = i["description"]
        collectionImageUrl = i["litPic"]
        print((collectionName, collectionDescription, collectionImageUrl))
def parse(self, response):
    """gzchenjiaci.com JSON API: data entries carry title / texture and a
    site-relative imgurl which is absolutized here."""
    coll_list = json.loads(response.text)["data"]
    for i in coll_list:
        collectionName = i["title"]
        collectionDescription = i["texture"]
        collectionImageUrl = "https://www.gzchenjiaci.com" + i["imgurl"]
        print((collectionName, collectionDescription, collectionImageUrl))
def parse(self, response):
    """JSON API: data.records entries carry exhibitName / description.
    This endpoint exposes no image field."""
    coll_list = json.loads(response.text)["data"]["records"]
    for i in coll_list:
        coll_name = i["exhibitName"]
        coll_desc = i["description"]
        print((coll_name, coll_desc))
def parse_content(self, response):
    """Detail page: print (name, image, whitespace-collapsed description).

    NOTE(review): //h1[2] without /text() makes .get() return the raw
    <h1> element markup, not its text — confirm this is intentional.
    """
    collectionName = response.xpath("//h1[2]").get()
    collectionDescription = "".join("".join(
        response.xpath('//span[@style]/text()').getall()).split())
    collectionImageUrl = response.xpath(
        '//div[@class="newsxx_nr"]//img/@src').get()
    print((collectionName, collectionImageUrl, collectionDescription))
def parse_content(self, response):
    """gxmuseum.cn detail page: print (name, absolute image URL,
    whitespace-collapsed description)."""
    collectionName = response.xpath("//h2/text()").get()
    collectionDescription = "".join("".join(response.xpath(
        '//div[@class="neirong"]/p/text()').getall()).split())
    collectionImageUrl = "http://www.gxmuseum.cn" + response.xpath(
        '//div[@class="neirong"]//img/@src').get()
    print((collectionName, collectionImageUrl, collectionDescription))
def parse(self, response):
    """JSON API: Rows entries carry Title / Contents; no image field on
    this endpoint."""
    coll_list = json.loads(response.text)["Rows"]
    for i in coll_list:
        coll_name = i["Title"]
        coll_desc = i["Contents"]
        print(coll_name)
        print(coll_desc)
def parse(self, response):
    """JSON API: data entries carry name / introduce / imgPath.

    Values are extracted but not yet forwarded anywhere — the original
    left its prints commented out; this parser is still a stub.
    """
    li_list = json.loads(response.text)['data']
    for li in li_list:
        collectionName = li['name']
        collectionIntroduction = li['introduce']
        collectionImage = li['imgPath']
def parse_content(self, response):
    """sunyat-sen.org detail page: print (name, absolute image URL,
    whitespace-collapsed description)."""
    collectionImageUrl = "http://www.sunyat-sen.org" + response.xpath(
        "//div[@class='zwpic']/img/@src").get()
    collectionName = response.xpath("//h3/text()").get()
    collectionDescription = "".join("".join(response.xpath(
        "//div[@class='contentBox']//text()").getall()).split())
    print((collectionName, collectionImageUrl, collectionDescription))
def parse_content(self, response):
    """Detail page whose image URL was carried in request meta['img'];
    print (name, image, whitespace-collapsed description)."""
    collectionImageUrl = response.meta['img']
    collectionName = response.xpath(
        "//div[@class='titleBox']/p/text()").get()
    collectionDescription = "".join("".join(response.xpath(
        "//div[@class='textBox']//text()").getall()).split())
    print((collectionName, collectionImageUrl, collectionDescription))
def parse(self, response):
    """JSON API: data entries carry collectionName / picUrl; print both."""
    coll_list = json.loads(response.text)["data"]
    for i in coll_list:
        coll_name = i["collectionName"]
        coll_img = i["picUrl"]
        print(coll_name)
        print(coll_img)
def parse(self, response):
    """qdyzyzmuseum.com list page: request every entry that actually has
    a link (some divs carry no anchor)."""
    item = collectionItem()
    for div in response.xpath('/html/body/div[5]/div/div/div[2]/div'):
        if div.xpath('./a/@href'):
            detail_url = ('http://www.qdyzyzmuseum.com'
                          + div.xpath('./a/@href').extract_first())
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
def parse_content(self, response):
    """jc-museum.cn detail page: print (name, absolute image URL,
    whitespace-collapsed box text)."""
    collectionImageUrl = "https://www.jc-museum.cn" + response.xpath(
        "//div[@class='box2 wf100']/img/@src").get()
    collectionName = response.xpath(
        "//div[@class='box1 wf100']/span/text()").get()
    description = "".join("".join(response.xpath(
        "//div[@class='box2 wf100']//text()").getall()).split())
    print((collectionName, collectionImageUrl, description))
def parse(self, response):
    """Hainan Museum JSON API: data.data entries. The description is a
    space-joined summary of era / category / size / condition and the
    image is the first `pics` entry rendered as a site PNG."""
    coll_list = json.loads(response.text)["data"]["data"]
    for i in coll_list:
        collectionName = i["mingchen"]
        collectionDescription = " ".join(
            (i["niandai"], i["leibie"], i["chicun"], i["baocun_zhuangtai"]))
        collectionImageUrl = ("http://www.hainanmuseum.org/cms/1/image/"
                              "public/wenwu/" + i["pics"][0] + ".png")
        print((collectionName, collectionDescription, collectionImageUrl))
def content_parse(self, response):
    """hylae.com exhibit detail: print (name, absolute image URL,
    description with every whitespace run — incl. \\xa0 — removed)."""
    doc = response.xpath("//div[@class='zhanlan-pic']")
    collectionName = doc.xpath(
        "./div[@class='list-right-bt']/text()").get()
    collectionImageUrl = "http://www.hylae.com" + doc.xpath(
        ".//img/@src").get()
    # join/split/join strips \xa0 characters along with other whitespace
    collectionDescription = "".join("".join(
        doc.xpath(".//p//text()").getall()).split())
    print((collectionName, collectionImageUrl, collectionDescription))
def parse(self, response):
    """JSON API whose Contents field is an HTML fragment: parse it with
    lxml.etree to pull the paragraph text and the first image URL."""
    coll_list = json.loads(response.text)["data"]
    for i in coll_list:
        collectionName = i["Title"]
        # Contents is raw HTML, not plain text
        html_fragment = etree.HTML(i["Contents"])
        collectionDescription = "".join(html_fragment.xpath("//p/text()"))
        collectionImageUrl = html_fragment.xpath("//img/@src")[0]
        print((collectionName, collectionDescription, collectionImageUrl))
def parse_content(self, response):
    """ynmuseum.org detail page: the description is the collapsed
    paragraph text; the name is recovered as its leading 《...》 segment."""
    collectionImageUrl = "http://www.ynmuseum.org" + response.xpath(
        "//div[@class='yc_info']/img/@src").get()
    collectionDescription = "".join("".join(response.xpath(
        "//div[@class='yc_infoCon']/p//text()").getall()).split())
    # Re-attach the closing 》 that split() consumed from the title part.
    collectionName = collectionDescription.split("》")[0] + "》"
    print((collectionName, collectionImageUrl, collectionDescription))
def parse(self, response):
    """ynmuseum.org list page: request every product detail link, then
    follow the "next" pagination link back into this parser."""
    url_list = response.xpath(
        "//ul[@class='prod_list cf']/li/a/@href").getall()
    for href in url_list:
        yield scrapy.Request("http://www.ynmuseum.org" + href,
                             callback=self.parse_content)
    next_page = response.xpath(
        "//div[@class='page_w']/a[@class='next']/@href").get()
    if next_page is not None:
        # No explicit callback: scrapy routes it back to this parse().
        yield scrapy.Request("http://www.ynmuseum.org" + next_page)
def parse(self, response):
    """hylae.com education-activity page: print (name, absolute image URL
    with a stock fallback, whitespace-collapsed description)."""
    doc = response.xpath("//div[@class='list-right']")
    educationName = doc.xpath(
        ".//div[@class='list-right-bt']/text()").get()
    # Fall back to a known placeholder image when the entry has none.
    educationImageUrl = "http://www.hylae.com" + doc.xpath(
        ".//img/@src").get(
        default="/upfile/2019/07/20190731171111_475.jpg")
    # join/split/join strips \xa0 characters along with other whitespace
    educationDescription = "".join("".join(
        doc.xpath(".//p//text()").getall()).split())
    print((educationName, educationImageUrl, educationDescription))