def start_requests(self): file = open("inputs/rrc_titles.txt", "r") for t in file.readlines(): self.tlts.append(strHelper.format(t)) file.close() file = open("inputs/rrclinks.txt", "r") for link in file.readlines(): if "http" not in link: continue base_url = strHelper.format(link) request = scrapy.Request(base_url, callback=self.parseCars, dont_filter=True) yield request
def start_requests(self): # url = "http://product.auto.163.com/series/config1/3148.html" # yield scrapy.Request(url,self.parse) titleFile = open("inputs/titles.txt", "r") for line in titleFile.readlines(): self.extractedTitle.append(sh.format(line)) titleFile.close() base_url = "http://product.auto.163.com/series/config1/" file = open("F:/ids.txt", "r") # i=0 for line in file.readlines(): # if i<2: # i+=2 # else: # break id = sh.format(line) url = base_url + id + ".html" request = scrapy.Request(url, self.parse) yield request file.close()
def start_requests(self): file = open("F:/163links.txt") for link in file.readlines(): link = link.split(",") url = link[0] comment = link[1] re = scrapy.Request(url, callback=self.parseOnePage, dont_filter=True, meta={"comment": strHelper.format(comment)}) yield re file.close()
def start_requests(self): # url = "https://www.autohome.com.cn/use/201711/909485.html#pvareaid=102624" # base_url = "http://auto.sina.com.cn/service/?page=" # yield scrapy.Request(url,self.parseOnePage) # base_url = "http://auto.sina.com.cn/j_kandian.d.html?docid=fyremfz2599182" file = open("F:/sinalinks.txt", "r") for link in file.readlines(): re = scrapy.Request(strHelper.format(link), callback=self.parseOnePage, dont_filter=True) yield re
def parseOnePage(self, response):
    pageLink = response.url
    # The article id is the last path segment, minus its ".html" suffix.
    doc_id = response.url.split("/")[-1].split(".h")[0]

    title = strHelper.format(response.xpath("//h1/text()").extract()[0])
    source = response.xpath("//a[@id='ne_article_source']/text()").extract()[0]
    time = response.xpath("//div[@class='post_time_source']/text()").extract()[0]
    time = time.strip().split(" ")[0]
    comment = response.meta['comment']

    # Walk the article body: centered blocks hold images, the rest is text.
    contents = response.xpath("//div[@class='post_text']/p")
    pictures = []
    fulltext = ""
    for content in contents:
        _class = content.xpath("@class").extract()
        if len(_class) > 0 and "center" in _class[0]:
            pic = content.xpath("./img/@src").extract_first()
            pictures.append(pic)
            fulltext += "INSERT_PIC_HERE\n"
        else:
            for t in content.xpath(".//text()").extract():
                fulltext += t
            fulltext += "\n"

    with open("F:/163/" + doc_id + ".txt", "w") as file:
        file.write("link: " + pageLink + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write(fulltext + "\n")
        file.write("comment:" + comment + "\n\n")
        file.write("pic links:")
        for pic in pictures:
            print(pic)
            if pic:
                file.write(pic + "\n")  # one link per line; skip missing srcs
    print('****************************')
def start_requests(self): # url = "https://www.autohome.com.cn/use/201711/909485.html#pvareaid=102624" # base_url = "http://auto.sina.com.cn/service/?page=" # yield scrapy.Request(url,self.parseOnePage) # base_url = "http://auto.sina.com.cn/j_kandian.d.html?docid=fyremfz2599182" link = 'http://www.pcauto.com.cn/drivers/yangche/point/' # re = scrapy.Request( # strHelper.format(link), # callback=self.parseHome, # dont_filter=True # ) # yield re for i in range(9, 11): re = scrapy.Request(strHelper.format(link) + "index_" + str(i) + ".html", callback=self.parseHome, dont_filter=True) yield re
def parse(self, response):
    print(response.url)
    tt = self.tlts
    row = {}

    # The breadcrumb holds brand, model, and listing name.
    bb = response.xpath("//p[@class='detail-breadcrumb-tagP']/a/text()").extract()
    row[tt[0]] = strHelper.numberTrans(bb[2])
    row[tt[1]] = strHelper.numberTrans(bb[3])
    row[tt[2]] = strHelper.numberTrans(bb[4])

    # Drop the leading currency symbol from the quoted price.
    price = response.xpath(
        "//p[@class='price detail-title-right-tagP']/text()").extract_first()
    row[tt[3]] = price[1:]

    # Basic attributes, plus the licensed-city value kept in an attribute.
    raw_basic = response.xpath("//div[@class='row-fluid-wrapper']//li//strong/text()")
    license_city = response.xpath(
        "//div[@class='row-fluid-wrapper']//li//strong[@id]/@licensed-city"
    ).extract_first()
    basicAttr = [strHelper.numberTrans(t.extract()) for t in raw_basic]
    basicAttr.append(license_city)
    for i in range(0, len(basicAttr)):
        row[tt[i + 4]] = basicAttr[i]

    # Extended attribute tables: each cell is a name/value pair. Select the
    # rows from the current table (not the whole response) and skip headers.
    extendedAttrs = response.xpath("//div[@id='js-parms-table']//table")
    for table in extendedAttrs:
        for tr in table.xpath(".//tr")[1:]:
            for td in tr.xpath(".//td"):
                title = strHelper.format(
                    td.xpath("./div[@class='item-name']/text()").extract_first())
                value = strHelper.format(
                    td.xpath("./div[@class='item-value']/text()").extract_first())
                row[title] = value

    # Fill any missing column with "无" ("none") so every row is complete.
    for title in tt:
        if title not in row:
            row[title] = "无"

    # Append the row to the workbook: xlrd reads the existing sheet,
    # xlutils.copy makes it writable, and the page URL goes in the last column.
    rexcel = xlrd.open_workbook("inputs/ershou.xls")
    row_count = rexcel.sheets()[0].nrows
    excel = copy(rexcel)
    sheet = excel.get_sheet(0)
    # The first run wrote the header row:
    # for i in range(0, len(tt)):
    #     sheet.write(0, i, tt[i])
    for i in range(0, len(tt)):
        sheet.write(row_count, i, row[tt[i]])
    sheet.write(row_count, len(tt), response.url)
    excel.save("inputs/ershou.xls")

    # One-off helper (commented out) that collected every attribute title and
    # wrote inputs/rrc_titles.txt:
    # titles = ["品牌", "型号", "名称", "报价"]
    # for t in response.xpath("//div[@class='row-fluid-wrapper']//p[@class='small-title']/text()").extract():
    #     titles.append(strHelper.format(t))
    # for t in response.xpath("//div[@id='js-parms-table']//table//div[@class='item-name']/text()").extract():
    #     titles.append(strHelper.format(t))
    # with open("inputs/rrc_titles.txt", "w") as f:
    #     for t in titles:
    #         f.write(t + "\n")
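# parse() assumes inputs/ershou.xls already exists with a header row (the
# commented-out sheet.write(0, i, tt[i]) above hints at how it was created).
# A hypothetical one-off initializer, assuming xlwt is available alongside
# xlrd/xlutils:
import xlwt

def init_ershou_workbook(titles, path="inputs/ershou.xls"):
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("sheet1")
    for col, title in enumerate(titles):
        sheet.write(0, col, title)  # header row; parse() appends below it
    book.save(path)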
def parse(self, response):
    # Pages that render the site nav instead of a config table are
    # missing/removed series; skip them.
    error = response.xpath("//div[@id='nav_hd']")
    if error:
        return

    attributes = ['year', 'engine', 'product_id', 'product_name', 'price']
    attributes2 = [
        'year', 'engine', 'product_id', 'product_name', 'price',
        'brand', 'web_id', 'link',
    ]

    brand = response.xpath("//a[@class='menu_name']/text()").extract()
    product_brand = brand[1]

    attrnames = response.xpath("//div[@class='car_config_param_names']/div")
    print(len(attrnames))

    # Each cell in the table head is one trim level; its data-config
    # attribute packs the basic fields as comma-separated quoted strings.
    allCars = []
    pattern = re.compile("'(.*)'")
    cars = response.xpath(
        "//div[@class='car_config_param_head']//div[@class='cell']")
    for car in cars:
        for y in car.xpath("./@data-config").extract():
            # Unescape the attribute value (rough Python 3 equivalent of the
            # old str.decode("string-escape")).
            x = y.encode("utf-8").decode("unicode_escape")
            item = {}
            for i, field in enumerate(x.split(",")):
                item[attributes[i]] = pattern.findall(field)[0]
            fullname = ""
            for name in car.xpath(".//a[@target='_blank']/text()").extract():
                fullname = fullname + " " + name
            item['product_name'] = fullname
            item['brand'] = product_brand
            item['web_id'] = response.url.split("g1/")[1].split(".")[0]
            item['link'] = response.url
            allCars.append(item)

    # Walk the per-attribute rows; each row holds one value per trim level.
    basicAttributes = response.xpath("//div[@class='car_config_param_list']/div")
    count = 0
    titles = []
    for i in range(0, len(basicAttributes)):
        _attribute = basicAttributes[i]
        _title = attrnames[i]
        _class = _title.xpath("./@class").extract_first()
        if "head" in _class:
            continue
        count += 1
        _titleN = _title.xpath(".//span/@title").extract_first()
        titles.append(_titleN)
        spans = _attribute.xpath(".//span/text()").extract()
        for j in range(0, len(allCars)):
            text = spans[j]
            if "●" in text:
                text = "标配"  # standard equipment
            if "○" in text:
                text = "选配"  # optional equipment
            allCars[j][_titleN] = sh.format(text)

    # Serialize every car as "key=value$" pairs, falling back to "--" for
    # titles this series does not list.
    fulltext = ""
    for i in range(0, len(allCars)):
        for att in attributes2:
            fulltext = fulltext + att + "=" + sh.format(allCars[i][att]) + "$"
        for dd in self.extractedTitle:
            if dd in allCars[i]:
                fulltext = fulltext + dd + "=" + allCars[i][dd] + "$"
            else:
                fulltext = fulltext + dd + "=--$"
    print(count)

    # One-off helper (commented out) that generated the titles file:
    # with open("inputs/titles.txt", "w") as f:
    #     for t in titles:
    #         f.write(t + " = scrapy.Field()\n")

    with open("crawled/163.txt", "a") as file:
        file.write(fulltext + "\n")
    print('===============')
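# A standalone demonstration of the data-config parsing step above. The
# sample value is invented (the real markup is not shown in this section);
# a non-greedy findall over the whole string is an alternative to the
# per-field split used in parse().
import re

sample = "'2018','1.5T','12345','Example Car','15.98'"
fields = re.compile(r"'(.*?)'").findall(sample)
attributes = ['year', 'engine', 'product_id', 'product_name', 'price']
print(dict(zip(attributes, fields)))
# -> {'year': '2018', 'engine': '1.5T', 'product_id': '12345', ...}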
def parseOnePage(self, response):
    pictures = []
    # The comment meta tag packs the news id and channel for the comment API.
    meta = response.xpath("//meta[@name='comment']/@content").extract_first()
    doc_id = meta.split("_id:")[1]
    channel = meta.split("channel:")[1][0:2]

    details = response.xpath("//div[@class='article clearfix']")
    # "article_" pages use protocol-relative image URLs and <font> body text.
    isA = "article_" in response.url

    contents = details.xpath(".//p|.//div")
    fulltext = ""
    for con in contents:
        _class = con.xpath("./@class")
        if len(_class) > 0:
            try:
                if isA:
                    link = "http:" + con.xpath("./img/@src").extract_first()
                else:
                    link = con.xpath("./img/@src").extract_first()
                pictures.append(link)
                fulltext += "INSERT_PIC_HERE\n\n"
            except TypeError:
                pass  # block has no <img>; extract_first() returned None
        else:
            if isA or "docid" in response.url:
                text = con.xpath("./font/text()").extract()
            else:
                text = con.xpath("./text()").extract()
            if len(text) > 0:
                fulltext += strHelper.format(text[0]) + "\n"

    marks = ",".join(response.xpath("//div[@class='keywords']/a/text()").extract())

    title = strHelper.format(details.xpath("//h1/text()").extract_first())
    tANDs = response.xpath("//div[@class='date-source']")
    time = strHelper.format(tANDs.xpath("./span[@class='date']/text()").extract_first())
    source = strHelper.format(tANDs.xpath("./a/text()").extract_first())
    print(time)
    print(source)

    pageLink = response.url
    commentlink = (
        "http://comment5.news.sina.com.cn/page/info?version=1&format=json"
        "&channel={channel}&newsid={id}&group=undefined&compress=0"
        "&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1"
        "&callback=jsonp_1528781515017&_=1528781515017"
    ).format(channel=channel, id=doc_id)
    yield scrapy.Request(commentlink, self.parseComment)

    # Download the article images into a per-document folder.
    picCount = 0
    for link in pictures:
        fileHelper.mkdir(doc_id)
        print(link)
        if link is None:
            continue
        if "http" not in link:
            link = "http:" + link
        urllib.request.urlretrieve(
            link, "F:/sina/images/" + doc_id + "/" + str(picCount) + ".jpg")
        picCount += 1
        print("Get pic :" + str(picCount))

    with open("F:/sina/" + doc_id + ".txt", "w") as file:
        file.write("link: " + pageLink + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write("tag:" + marks + "\n\n")
        file.write(fulltext + "\n")
    print('****************************')
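# parseComment is not shown in this section. The sina endpoint above wraps
# its JSON in a jsonp_<timestamp>(...) callback, so a minimal hypothetical
# sketch would strip that wrapper first; the payload field names used here
# (result, cmntlist, content) are assumptions, not confirmed by this code.
import json

def parseComment(self, response):
    body = response.text
    # Keep only the JSON between the outermost parentheses of the callback.
    payload = body[body.index("(") + 1:body.rindex(")")]
    data = json.loads(payload)
    for cmt in data.get("result", {}).get("cmntlist", []):
        print(cmt.get("content"))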
def parseOnePage(self, response):
    originurl = response.url
    doc_id = self.getID(originurl)

    # Multi-page articles are re-requested as the single "-all" page; the
    # URL is derived from the original rather than hardcoding a
    # /use/<month>/ prefix.
    nextpage = response.xpath("//div[@class='athm-page__info']")
    if len(nextpage) > 0 and "all" not in doc_id:
        yield scrapy.Request(
            originurl.replace(".html", "-all.html"), self.parseOnePage)
        return

    details = response.xpath("//div[@class='article-details']")
    marks = details.xpath(".//div[@class='marks']/a/text()").extract_first()
    if marks is None:
        marks = ""
    title = strHelper.format(details.xpath("//h1/text()").extract_first())
    author = strHelper.format(details.xpath("//a[@class='name']/text()").extract_first())
    time = strHelper.format(details.xpath("//span[@class='time']/text()").extract_first())
    source = strHelper.format(
        details.xpath("//span[@class='source']/a/text()").extract_first())
    pageLink = response.url

    yield scrapy.Request(
        "https://reply.autohome.com.cn/showreply/ReplyJson.ashx?id=" + doc_id,
        self.parseComment)

    # Download images from centered paragraphs, skipping paywalled links.
    content = details.xpath("//div[@class='details']//p")
    picCount = 0
    for c in content:
        if c.xpath("./@align").extract_first() == "center":
            link = c.xpath("./a")
            href = link.xpath("./@href").extract()
            if len(href) < 1:
                continue
            if "pay" in href[0]:
                print("pay")
                continue
            link = link.xpath("./img/@src").extract_first()
            if link is not None:
                link = "https:" + link
                fileHelper.mkdir(doc_id)  # one folder per document
                urllib.request.urlretrieve(
                    link, "F:/images/" + doc_id + "/" + str(picCount) + ".jpg")
                picCount += 1
                print("Get pic :" + str(picCount))

    with open("txt/" + doc_id + ".txt", "w") as file:
        file.write("link: " + pageLink + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("author: " + author + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write("tag:" + marks + "\n\n")
        # Rebuild the body text, marking image positions; the last paragraph
        # is blanked out before writing.
        paras = []
        for text in content:
            if text.xpath("./@align").extract_first() == 'center':
                paras.append("\n INSERT_PIC_HERE \n")
                continue
            para = ""
            for c in text.xpath(".//text()").extract():
                para += c.replace(u'\xa0', u'')
            paras.append(para)
        paras[-1] = ""
        for para in paras:
            file.write(para + "\n")
    print('****************************')
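# fileHelper is not defined in this section. A minimal hypothetical sketch
# matching how the sina and autohome spiders call it -- create a per-document
# image folder, tolerating repeat calls since it runs once per downloaded
# picture. The base directory is an assumption (the two spiders save under
# different roots):
import os

class fileHelper:
    @staticmethod
    def mkdir(doc_id, base="F:/images/"):
        os.makedirs(os.path.join(base, doc_id), exist_ok=True)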
def parseOnePage(self, response):
    # Comments come from a separate JSON endpoint keyed by the page URL.
    page_link = response.url.replace("_all", "")
    base_link = "http://cmt.pcauto.com.cn/action/topic/get_data.jsp?url="
    yield scrapy.Request(base_link + page_link, self.parseComment)

    doc_id = page_link.split("/")[-1].split(".html")[0]

    title = strHelper.format(
        response.xpath("//h1[@class='artTit']/span/text()").extract_first())
    source = strHelper.format(
        response.xpath("//span[@class='ownner']/text()").extract_first())
    author = strHelper.format(
        response.xpath("//span[@class='editor']//a/text()").extract_first())
    time = strHelper.format(
        response.xpath("//span[@class='pubTime']/text()").extract_first())
    marks = ",".join(
        response.xpath("//p[@class='moreRead artTag']//a/text()").extract())

    # Article body: class-tagged blocks are section titles, style-tagged
    # blocks are images, everything else is plain text.
    contents = response.xpath("//div[@class='artText clearfix']")
    contents = contents.xpath(".//p|.//div[@class='cmsArtMainTit']")
    pictures = []
    fulltext = ""
    for block in contents:
        if len(block.xpath("./@class")) > 0:
            fulltext += (block.xpath(".//text()").extract_first() or "") + "\n"
        elif len(block.xpath("./@style")) > 0:
            fulltext += "INSERT_PIC_HERE\n\n"
            pictures.append(block.xpath(".//img/@src").extract_first())
        else:
            for t in block.xpath(".//text()").extract():
                fulltext += t
            fulltext += "\n"

    # Image download (disabled); the site serves .webp files:
    # picCount = 0
    # for link in pictures:
    #     fileHelper.mkdir(doc_id)
    #     if link is None:
    #         continue
    #     urllib.request.urlretrieve(link, "F:/sina/images/" + doc_id + "/" + str(picCount) + ".webp")
    #     picCount += 1
    #     print("Get pic :" + str(picCount))

    with open("F:/pcauto/" + doc_id + ".txt", "w") as file:
        file.write("link: " + page_link + "\n\n")
        file.write("title: " + title + "\n\n")
        file.write("time: " + time + "\n\n")
        file.write("source: " + source + "\n\n")
        file.write("tag:" + marks + "\n\n")
        file.write(fulltext + "\n")