def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P33``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded.

    Args:
        response: The downloaded page; must support ``.xpath()``, ``.url``
            and ``.text``.

    Yields:
        The populated ``TNovelItem``.
    """

    def joined_text(query, sep=''):
        # Join every string matched by the XPath with ``sep`` and trim it.
        return sep.join(response.xpath(query).extract()).strip()

    print('1,=========================', response.url)
    page_text = response.text  # value is not used further
    record = TNovelItem()

    page_url = response.url
    record["src_url"] = page_url
    print('src_url:', page_url)

    raw_number = joined_text('//h1/em/text()')
    print('product_number:', raw_number)
    book_number = get_product_number(raw_number)
    print('product_number:', book_number)
    record["product_number"] = book_number

    platform = 'P33'
    print('plat_number:', platform)
    record["plat_number"] = platform

    writer = joined_text('//h1/a[@class="writer default"]/text()').replace('著', '')
    record["author"] = writer
    print('author:', writer)

    categories = joined_text(
        '//p[@class="tag-box"]/span/i[position()>3]/text()', sep=';')
    record["novel_type"] = categories
    print('novel_type:', categories)

    # This page layout provides no tags.
    record["tags"] = None

    badge_text = joined_text('//p[@class="tag-box"]/span/i/text()')
    signed_flag = 1 if '签约' in badge_text else 0
    record["Signed"] = signed_flag
    print('Signed:', signed_flag)

    summary = joined_text(
        '//div[@class="book-information cf"]/div[@class="book-info"]'
        '/p[@class="intro"]/text()')
    record["novel_desc"] = summary
    print('novel_desc:', summary)

    # The image file name is the MD5 of platform code + product number.
    image_key = hashlib.md5(
        (platform + book_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', image_key)
    record["Product_image"] = image_key

    cover_url = 'http:' + joined_text('//*[@id="bookImg"]/img/@src')
    print('P_image:', cover_url)

    image_dir = "../images//"
    target = image_dir + image_key
    try:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        if os.path.exists(target):
            print("文件已存在")
        else:
            reply = requests.get(cover_url)
            reply.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(target, "wb") as out:
                # "wb" writes the image bytes in binary mode.
                out.write(reply.content)
            print("图片本地存储完成")
    except Exception as err:
        print("图片本地存储失败:" + str(err))

    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    record["last_modify_date"] = stamp
    print('last_modify_date:', stamp)
    yield record
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P31``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded. ``tags`` and ``Signed`` are always ``None``
    for this page layout.

    Args:
        response: The downloaded page; must support ``.xpath()`` and ``.url``.

    Yields:
        The populated ``TNovelItem``.
    """
    print('1,========================', response.url)
    item = TNovelItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(response.xpath('//h2/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P31'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    author = ''.join(
        response.xpath(
            '//div[@class="author-zone column-2"]/div[@class="right"]'
            '/a[@class="name"]//text()'
        ).extract()).strip()
    item["author"] = author
    print('author:', author)

    novel_type = ';'.join(
        response.xpath(
            '//p[@class="infos"]/span[@class="cate"]/a/text()'
        ).extract()).strip()
    item["novel_type"] = novel_type
    print('novel_type:', novel_type)

    tags = None
    item["tags"] = tags
    print('tags:', tags)

    signed = None
    print('Signed:', signed)
    item["Signed"] = signed

    novel_desc = ''.join(
        response.xpath(
            '//div[@class="summary min-summary-height"]/pre[@class="note"]/text()'
        ).extract()).strip()
    item["novel_desc"] = novel_desc
    print('novel_desc:', novel_desc)

    # The image file name is the MD5 of platform code + product number.
    product_image = hashlib.md5(
        (plat_number + product_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', product_image)
    item["Product_image"] = product_image

    # The previous query had the same path pasted twice
    # ('.../@src//div[@class="pic"]/a/img/@src'). An attribute node has no
    # descendants, so it never matched and the cover URL was always empty.
    p_image = ''.join(
        response.xpath('//div[@class="pic"]/a/img/@src').extract()).strip()
    print('P_image:', p_image)

    root = "../images//"
    path = root + product_image
    try:
        os.makedirs(root, exist_ok=True)
        if not os.path.exists(path):
            # Without a timeout a stalled image server would block the
            # callback indefinitely.
            r = requests.get(p_image, timeout=10)
            r.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(path, "wb") as f:
                # "wb" writes the image bytes in binary mode.
                f.write(r.content)
            print("图片本地存储完成")
        else:
            print("文件已存在")
    except Exception as e:
        # Best effort: a failed cover download must not drop the item.
        print("图片本地存储失败:" + str(e))

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    yield item
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P17``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded.

    Args:
        response: The downloaded page; must support ``.xpath()``, ``.url``
            and ``.text``.

    Yields:
        The populated ``TNovelItem``.
    """

    def joined_text(query, sep=''):
        # Join every string matched by the XPath with ``sep`` and trim it.
        return sep.join(response.xpath(query).extract()).strip()

    print('1,=====================', response.url)
    page_text = response.text  # value is not used further
    record = TNovelItem()

    page_url = response.url
    record["src_url"] = page_url
    print('src_url:', page_url)

    book_number = get_product_number(
        joined_text('//img[@class="qqredaer_tit"]/@title'))
    print('product_number:', book_number)
    record["product_number"] = book_number

    platform = 'P17'
    print('plat_number:', platform)
    record["plat_number"] = platform

    writer = joined_text('//*[@id="textauthor"]/following-sibling::p/a/text()')
    record["author"] = writer
    print('author:', writer)

    categories = joined_text(
        '//div[@class="title"]/a[position()>1 and position()<last()]/text()',
        sep=';')
    record["novel_type"] = categories
    print('novel_type:', categories)

    # Drop the "作品标签:" label, then convert the "、" separators to ";".
    tag_text = joined_text('//div[@class="tags"]/text()').replace('作品标签:', '')
    if tag_text:
        tag_text = tag_text.replace('、', ';').strip()
    record["tags"] = tag_text
    print('tags:', tag_text)

    badge_text = joined_text(
        '//div[@class="tag"]/div[@class="y"]/a[@title]/text()')
    signed_flag = 1 if '签约作品' in badge_text else 0
    record["Signed"] = signed_flag
    print('Signed:', signed_flag)

    summary = joined_text('//div[@class="info"]//text()')
    record["novel_desc"] = summary
    print('novel_desc:', summary)

    # The image file name is the MD5 of platform code + product number.
    image_key = hashlib.md5(
        (platform + book_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', image_key)
    record["Product_image"] = image_key

    cover_url = 'http:' + joined_text(
        '//div[@class="cover"]/a[@class="bookcover"]/img/@src')
    print('P_image:', cover_url)

    image_dir = "../images//"
    target = image_dir + image_key
    try:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        if os.path.exists(target):
            print("文件已存在")
        else:
            reply = requests.get(cover_url)
            reply.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(target, "wb") as out:
                # "wb" writes the image bytes in binary mode.
                out.write(reply.content)
            print("图片本地存储完成")
    except Exception as err:
        print("图片本地存储失败:" + str(err))

    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    record["last_modify_date"] = stamp
    print('last_modify_date:', stamp)
    yield record
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P21``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded.

    Args:
        response: The downloaded page; must support ``.xpath()`` and ``.url``.

    Yields:
        The populated ``TNovelItem``.
    """
    status_root = '//div[@class="main"]/div[@class="status fl"]'

    print('1,=========================', response.url)
    item = TNovelItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath(status_root + '/h1/a/text()').extract()).strip()
    # The previous test was `'【' and '】' in product_number`, which Python
    # evaluates as just `'】' in product_number`; a value holding only '【'
    # was never normalised. Check for either full-width bracket explicitly.
    if '【' in product_number or '】' in product_number:
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P21'
    item["plat_number"] = plat_number
    print('plat_number:', plat_number)

    author = ''.join(
        response.xpath(
            status_root + '/div[@class="booksub"]/a[@title]/text()'
        ).extract()).strip()
    print('author:', author)
    item["author"] = author

    novel_type = ''.join(
        response.xpath(
            status_root + '/div[@class="booksub"]/a[last()]/text()'
        ).extract()).strip()
    print('novel_type:', novel_type)
    item["novel_type"] = novel_type

    tag_list = response.xpath(
        status_root + '/div[@class="keyword"]/a[@title]/text()').extract()
    # NOTE(review): this sleep blocks the calling thread for a second per
    # page; kept because it may be deliberate pacing — confirm and consider
    # a crawler-level delay setting instead.
    time.sleep(1)
    tags = ';'.join(tag_list)
    print('tags:', tags)
    item["tags"] = tags

    signed_title = ''.join(
        response.xpath(
            status_root + '/h1/em[@class="sign"]/@title'
        ).extract()).strip()
    signed = 1 if '签约作品' in signed_title else 0
    item["Signed"] = signed
    print('Signed:', signed)

    desc_parts = response.xpath(
        status_root + '/div[@class="info_con"]/p/text()').extract()
    # Replace carriage returns inside the description with single spaces.
    novel_desc = ' '.join(''.join(desc_parts).split('\r'))
    item["novel_desc"] = novel_desc
    print('novel_desc:', novel_desc)

    # The image file name is the MD5 of platform code + product number.
    product_image = hashlib.md5(
        (plat_number + product_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', product_image)
    item["Product_image"] = product_image

    p_image = ''.join(
        response.xpath(
            '//div[@class="main"]/div[@class="book_cover fl"]/p/a/img[@title]/@src'
        ).extract()).strip()
    print('P_image:', p_image)

    root = "../images//"
    path = root + product_image
    try:
        os.makedirs(root, exist_ok=True)
        if not os.path.exists(path):
            # Without a timeout a stalled image server would block the
            # callback indefinitely.
            r = requests.get(p_image, timeout=10)
            r.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(path, "wb") as f:
                # "wb" writes the image bytes in binary mode.
                f.write(r.content)
            print("图片本地存储完成")
        else:
            print("文件已存在")
    except Exception as e:
        # Best effort: a failed cover download must not drop the item.
        print("图片本地存储失败:" + str(e))

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    yield item
def parse_page_p(self, response):
    """Fill the item carried in ``response.meta["item"]`` from a JSON payload.

    The response body is parsed as JSON and the fields are read from its
    ``data`` object (``book_name``, ``author_name``, ``class_name``,
    ``is_end_write``, ``description``, ``cover``). Each value is echoed to
    stdout, the cover image is saved under ``../images//`` (file name = MD5
    hex digest of platform code ``P19`` + product number) and the item is
    yielded.

    Args:
        response: The downloaded JSON response; ``response.meta["item"]``
            must hold the item to populate.

    Yields:
        The populated item. The earlier version built the item but never
        emitted it, so the collected fields were discarded.
    """
    print('2,=======================', response.url)
    item = response.meta["item"]
    payload = json.loads(response.text)
    data = payload.get('data')

    product_number = data.get('book_name')
    # The previous test was `'【' and '】' in product_number`, which Python
    # evaluates as just `'】' in product_number`; a value holding only '【'
    # was never normalised. Check for either full-width bracket explicitly.
    if '【' in product_number or '】' in product_number:
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P19'
    item["plat_number"] = plat_number
    print('plat_number:', plat_number)

    author = data.get('author_name')
    print('author:', author)
    item["author"] = author

    novel_type = data.get('class_name')
    print('novel_type:', novel_type)
    item["novel_type"] = novel_type

    tags = None
    print('tags:', tags)
    item["tags"] = tags

    # NOTE(review): the key name 'is_end_write' reads like a completion flag
    # rather than a signing flag — confirm this mapping against the API.
    signed = data.get('is_end_write')
    item["Signed"] = signed
    print('Signed:', signed)

    novel_desc = data.get('description')
    print('novel_desc:', novel_desc)
    item["novel_desc"] = novel_desc

    # The image file name is the MD5 of platform code + product number.
    product_image = hashlib.md5(
        (plat_number + product_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', product_image)
    item["Product_image"] = product_image

    p_image = data.get('cover')
    print('P_image:', p_image)

    root = "../images//"
    path = root + product_image
    try:
        os.makedirs(root, exist_ok=True)
        if not os.path.exists(path):
            # Without a timeout a stalled image server would block the
            # callback indefinitely.
            r = requests.get(p_image, timeout=10)
            r.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(path, "wb") as f:
                # "wb" writes the image bytes in binary mode.
                f.write(r.content)
            print("图片本地存储完成")
        else:
            print("文件已存在")
    except Exception as e:
        # Best effort: a failed cover download must not drop the item.
        print("图片本地存储失败:" + str(e))

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    # Emit the populated item, as the sibling callbacks do.
    yield item
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P22``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded.

    Args:
        response: The downloaded page; must support ``.xpath()`` and ``.url``.

    Yields:
        The populated ``TNovelItem``.
    """

    def joined_text(query, sep=''):
        # Join every string matched by the XPath with ``sep`` and trim it.
        return sep.join(response.xpath(query).extract()).strip()

    # Rows of the info table inside the second tab panel.
    table_rows = '//dl[@class="Tab"]/dd/div[2]/table'

    print('1,================', response.url)
    record = TNovelItem()

    page_url = response.url
    record["src_url"] = page_url
    print('src_url:', page_url)

    raw_number = joined_text(
        '//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()')
    print('product_number:', raw_number)
    book_number = get_product_number(raw_number)
    print('product_number:', book_number)
    record["product_number"] = book_number

    platform = 'P22'
    print('plat_number:', platform)
    record["plat_number"] = platform

    writer = joined_text('//div[@class="author"]/a[@class="name"]/text()')
    record["author"] = writer
    print('author:', writer)

    category = joined_text(table_rows + '/tr[1]/td/a/text()')
    record["novel_type"] = category
    print('novel_type:', category)

    # Convert any "、" separators to ";"; replace() leaves the text untouched
    # when none are present.
    tag_text = joined_text(
        table_rows + '/tr[last()]/td/a/span/text()', sep=';').replace('、', ';')
    record["tags"] = tag_text
    print('tags:', tag_text)

    badge_text = joined_text(table_rows + '/tr[1]/td/span/text()')
    signed_flag = 1 if '签约作品' in badge_text else 0
    record["Signed"] = signed_flag
    print('Signed:', signed_flag)

    summary = joined_text('//dl[@class="Tab"]/dd/div[1]/a//text()')
    record["novel_desc"] = summary
    print('novel_desc:', summary)

    # The image file name is the MD5 of platform code + product number.
    image_key = hashlib.md5(
        (platform + book_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', image_key)
    record["Product_image"] = image_key

    cover_url = joined_text('//div[@id="bookCover"]/a/img/@src')
    print('P_image:', cover_url)

    image_dir = "../images//"
    target = image_dir + image_key
    try:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        if os.path.exists(target):
            print("文件已存在")
        else:
            reply = requests.get(cover_url)
            reply.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(target, "wb") as out:
                # "wb" writes the image bytes in binary mode.
                out.write(reply.content)
            print("图片本地存储完成")
    except Exception as err:
        print("图片本地存储失败:" + str(err))

    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    record["last_modify_date"] = stamp
    print('last_modify_date:', stamp)
    yield record
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P16``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded.

    Args:
        response: The downloaded page; must support ``.xpath()`` and ``.url``.

    Yields:
        The populated ``TNovelItem``.
    """
    print('1,================', response.url)
    item = TNovelItem()

    product_number = ''.join(
        response.xpath('//h1[@itemprop="name"]/span/text()').extract()).strip()
    # The previous test was `'【' and '】' in product_number`, which Python
    # evaluates as just `'】' in product_number`; a value holding only '【'
    # was never normalised. Check for either full-width bracket explicitly.
    if '【' in product_number or '】' in product_number:
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P16'
    item["plat_number"] = plat_number
    print('plat_number:', plat_number)

    author = ''.join(
        response.xpath('//*[@itemprop="author"]/text()').extract()).strip()
    print('author:', author)
    item["author"] = author

    # Genre levels arrive separated by "-"; store them separated by ";".
    novel_type = ''.join(
        response.xpath('//*[@itemprop="genre"]/text()').extract()
    ).strip().replace('-', ';')
    print('novel_type:', novel_type)
    item["novel_type"] = novel_type

    tag_list = response.xpath(
        '//*[@class="smallreadbody"]/span/a/text() | '
        '//div[@class="smallreadbody"]/span[@style="color: red;"]//text()'
    ).extract()
    # NOTE(review): this sleep blocks the calling thread for a second per
    # page; kept because it may be deliberate pacing — confirm and consider
    # a crawler-level delay setting instead.
    time.sleep(1)
    tags = ';'.join(tag_list)
    print('tags:', tags)
    item["tags"] = tags

    signed_text = ''.join(
        response.xpath(
            '//div[@class="righttd"]/ul[@class="rightul"]/li[last()-1]/b//text()'
        ).extract()).strip()
    signed = 1 if '已签约' in signed_text else 0
    item["Signed"] = signed
    print('Signed:', signed)

    novel_desc = ''.join(
        response.xpath('//div[@id="novelintro"]//text()').extract()).strip()
    print('novel_desc:', novel_desc)
    item["novel_desc"] = novel_desc

    # The image file name is the MD5 of platform code + product number.
    product_image = hashlib.md5(
        (plat_number + product_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', product_image)
    item["Product_image"] = product_image

    p_image = ''.join(
        response.xpath('//img[@itemprop="image"]/@src').extract()).strip()
    print('P_image:', p_image)

    root = "../images//"
    path = root + product_image
    try:
        os.makedirs(root, exist_ok=True)
        if not os.path.exists(path):
            # Without a timeout a stalled image server would block the
            # callback indefinitely.
            r = requests.get(p_image, timeout=10)
            r.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(path, "wb") as f:
                # "wb" writes the image bytes in binary mode.
                f.write(r.content)
            print("图片本地存储完成")
        else:
            print("文件已存在")
    except Exception as e:
        # Best effort: a failed cover download must not drop the item.
        print("图片本地存储失败:" + str(e))

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    yield item
def parse(self, response):
    """Turn one book detail page into a ``TNovelItem`` for platform code ``P17``.

    The fields are read from the page with XPath queries, each value is
    echoed to stdout, the cover image is saved under ``../images//`` (file
    name = MD5 hex digest of platform code + product number) and the
    populated item is yielded. ``tags`` and ``Signed`` are always ``None``
    for this page layout.

    Args:
        response: The downloaded page; must support ``.xpath()``, ``.url``
            and ``.text``.

    Yields:
        The populated ``TNovelItem``.
    """

    def joined_text(query, sep=''):
        # Join every string matched by the XPath with ``sep`` and trim it.
        return sep.join(response.xpath(query).extract()).strip()

    print('1,=====================', response.url)
    page_text = response.text  # value is not used further
    record = TNovelItem()

    page_url = response.url
    record["src_url"] = page_url
    print('src_url:', page_url)

    raw_number = joined_text(
        '//*[@id="bookinfo"]/div[@class="book_info"]/h3/a[@name="readurl"]/text()')
    print('product_number:', raw_number)
    book_number = get_product_number(raw_number)
    print('product_number:', book_number)
    record["product_number"] = book_number

    platform = 'P17'
    print('plat_number:', platform)
    record["plat_number"] = platform

    writer = joined_text(
        '//*[@id="bookinfo"]/div[@class="book_info"]/dl/dd[@class="w_au"]/a/text()')
    record["author"] = writer
    print('author:', writer)

    raw_categories = joined_text(
        '//*[@id="nav"]/a[position()>1]/text() | //dd[@class="w_auth"]/a/text()',
        sep=';')
    print('novel_type_s:', raw_categories)
    # Convert any "/" separators to ";"; replace() leaves the text untouched
    # when none are present.
    categories = raw_categories.replace('/', ';')
    record["novel_type"] = categories
    print('novel_type:', categories)

    tag_value = None
    record["tags"] = tag_value
    print('tags:', tag_value)

    record["Signed"] = None

    summary = joined_text(
        '//*[@id="bookIntro"]/p/text() | //*[@id="bookIntro"]/text()')
    record["novel_desc"] = summary
    print('novel_desc:', summary)

    # The image file name is the MD5 of platform code + product number.
    image_key = hashlib.md5(
        (platform + book_number).encode(encoding='UTF-8')).hexdigest()
    print('Product_image:', image_key)
    record["Product_image"] = image_key

    cover_url = 'http:' + joined_text(
        '//*[@id="bookinfo"]/div[@class="bookBox"]/a/img/@src')
    print('P_image:', cover_url)

    image_dir = "../images//"
    target = image_dir + image_key
    try:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        if os.path.exists(target):
            print("文件已存在")
        else:
            reply = requests.get(cover_url)
            reply.raise_for_status()
            # The with statement closes the file stream automatically.
            with open(target, "wb") as out:
                # "wb" writes the image bytes in binary mode.
                out.write(reply.content)
            print("图片本地存储完成")
    except Exception as err:
        print("图片本地存储失败:" + str(err))

    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    record["last_modify_date"] = stamp
    print('last_modify_date:', stamp)
    yield record