def handlejs(self, url):
    """Open *url* in a Chrome driver and scrape article fields into a FhwItem.

    The full field set is only populated when the second breadcrumb link
    (``LMLJ2``) is one of the four tracked regions; otherwise an (empty)
    item is returned unchanged.

    NOTE(review): the original condition was
    ``len(...) > 0 and LMLJ2 == "台湾" or LMLJ2 == "大陆" or ...`` which, by
    operator precedence, applied the guard only to the first comparison.
    Rewritten as an explicit membership test.  The driver is now always
    closed via try/finally (previously leaked on any scraping exception).
    """
    item = FhwItem()
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Second breadcrumb segment, e.g. "台湾" / "大陆" / "国际" / "港澳".
        LMLJ2 = driver.find_element_by_xpath(
            '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]').text
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 参与人数 (participant count)
            CYRS = driver.find_element_by_xpath(
                '//em[@class="js_joinNum"][1]').text
            # 评论数 (comment count)
            PLS = driver.find_element_by_xpath(
                '//em[@class="js_cmtNum"][1]').text
            # 推荐数 (recommendation count)
            TJS = driver.find_element_by_xpath(
                '//div[@id="left_dz"]/span').text
            # 标题 (title)
            BT = driver.find_element_by_xpath(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]').text
            # 新闻来源 (news source)
            XWLY = driver.find_element_by_xpath(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a').text
            # 编者 (editor)
            BZ = driver.find_element_by_xpath(
                '//div[@id="artical_sth2"]/p[1]').text
            # 栏目路径 first segment (breadcrumb root)
            LMLJ1 = driver.find_element_by_xpath(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]').text
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 成稿时间 (publication time)
            CGSJ = driver.find_element_by_xpath('//span[@class="ss01"]').text
            # First article image (thumbnail candidate)
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            # 正文文本 (full body text)
            ZWWB = driver.find_element_by_xpath(
                '//div[@id="main_content"]').text
            TP_URL = ss.get_attribute('src')
            ZWNR = ZWWB + "|" + TP_URL
            item['BTIT'] = BT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CGSJ'] = CGSJ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['LMLJ2'] = LMLJ2
    finally:
        driver.close()
    return item
def parse_datel(self, response):
    """Parse a detail page using plain Scrapy selectors (no browser).

    Extracts the title, body paragraphs and source via XPath and yields a
    partially-filled FhwItem.

    NOTE(review): the original also extracted the ``detailPic`` image srcs
    into a local that was never used — dead code removed.  ``TP_URL`` is
    still set to the page URL (not an image URL) exactly as before; that
    looks like a thumbnail field holding the wrong value — confirm intent.
    """
    print("====", response.url)
    item = FhwItem()
    # 文章标题 (title) — list of matched text nodes
    title = response.xpath(
        '//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()').extract()
    # 文章内容 (body paragraphs)
    cont = response.xpath('//div[@id="main_content"]/p/text()').extract()
    # 新闻来源 (news source)
    source = response.xpath(
        '//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()').extract()
    item['BTIT'] = title
    item['ZWWB'] = cont
    item['XWLY'] = source
    item['TP_URL'] = response.url
    yield item
def parse_datel(self, response):
    """Render a detail page in headless Chrome, parse it with lxml, archive
    the raw HTML to disk, and yield a fully-populated FhwItem.

    The original re-evaluated every XPath twice (once inside ``len(...)``
    and again to take ``[0]``); that pattern is factored into the local
    ``first()`` helper.  The driver is now closed via try/finally instead
    of leaking when any extraction raises.
    """
    print("====", response.url)
    chrome_options = get_chrome_options()
    driver = webdriver.Chrome(executable_path=windows_chrome_driver,
                              chrome_options=chrome_options)
    try:
        driver.get(response.url)
        htmls = etree.HTML(driver.page_source)

        def first(xp, default=""):
            # Return the first XPath match or *default* when nothing matches.
            nodes = htmls.xpath(xp)
            return nodes[0] if nodes else default

        item = FhwItem()
        # 文章标题 (title)
        title = first('//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()')
        # 正文文本 (body text, all paragraphs joined)
        cont = ''.join(htmls.xpath(
            '//div[@id="main_content"]/p/text()|//div[@id="main_content"]/text()'))
        # 新闻来源 (news source)
        source = first('//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()')
        # 栏目路径 (breadcrumb: section;sub-section)
        lm2 = first('//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        lm1 = first('//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        lmlj = lm1 + ";" + lm2
        # 参与人数 (participant count)
        cy_num = first('//div[@class="box03"]/h5/span/a/em/text()')
        # 评论数 (comment count)
        pl_num = first('//div[@class="box03"]/h5/a/em/text()')
        # 编者 (editor) — NOTE(review): kept as the full node list (or "")
        # exactly as the original did, unlike the other fields which take [0].
        bz_nodes = htmls.xpath('//div[@id="artical_sth2"]/p[1]/text()')
        bz = bz_nodes if bz_nodes else ""
        # 成稿时间 (publication time)
        cg_time = first('//div[@id="artical_sth"]/p/span[1]/text()')
        # 推荐数 (recommendation count)
        tj_num = first('//div[@id="left_dz"]/span/text()')
        # 缩略图 (first article image src)
        img_list = first(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')
        # Raw HTML of the article body, for image-src post-processing.
        zwwbhtml = driver.find_element_by_xpath(
            '//div[@id="main_content"]').get_attribute('outerHTML')
        img_url = '//p[@class="detailPic"]/img'
        ZWNR = self.process_image_src(zwwbhtml, img_url)
        print('=ZWNR===', ZWNR, type(ZWNR))
        # Archive the rendered page under ifeng/<section>/html/<basename>.
        urlpath = response.url
        WEBSITE = r'ifeng/' + lm2 + '/html/'
        filename = os.path.join(WEBSITE, urlpath.split('/')[-1].strip('/'))
        file_dir = os.path.dirname(filename)
        os.makedirs(file_dir, exist_ok=True)
        with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(driver.page_source)
        item['BTIT'] = title            # 标题
        item['CYRS'] = cy_num           # 参与人数
        item['PLS'] = pl_num            # 评论数
        item['XWLY'] = source           # 新闻来源
        item['LMLJ'] = lmlj             # 栏目路径
        item['BZ'] = bz                 # 编者
        item['CGSJ'] = cg_time          # 成稿时间
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # 采集时间
        item['ZWWB'] = cont.replace("\n", "").replace("\r", "")      # 正文文本
        item['ZWNR'] = ZWNR             # 正文内容
        item['TJS'] = tj_num            # 推荐数
        item['YS_URL'] = filename       # 原始网页链接
        item['CL_URL'] = filename       # 处理网页链接
        item['TP_URL'] = img_list       # 缩略图链接
        item['URL'] = urlpath
        item['LMLJ2'] = lm2
    finally:
        driver.close()
    yield item
def parse_item(self, response):
    """Parse an article page (only news dated 20180601 or later) and yield
    a FhwItem for the four tracked region sections.

    Fixes over the original:
    - ``item['CL_URL'] = filename`` referenced an undefined name
      (NameError); it now stores the response URL.
    - ``CYRS, PLS, TJS = self.handlejs(...)`` tried to unpack the FhwItem
      that handlejs returns; the counts are now read by key.
    - Unguarded ``extract()[0]`` calls no longer raise IndexError when a
      node is missing.
    """
    print("====", response.url)
    item = FhwItem()
    # Article date encoded in the URL; skip anything before 2018-06-01.
    atime = self.strUrl(response.url)
    if int(atime) >= 20180601:
        lm2_nodes = response.xpath(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        LMLJ2 = lm2_nodes.extract()[0] if len(lm2_nodes) else ""
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 标题 (title)
            bt_list = response.xpath(
                '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()').extract()
            BTIT = bt_list[0] if bt_list else ""
            # 参与人数/评论数/推荐数 come from the JS-rendered page.
            js_item = self.handlejs(response.url)
            CYRS = js_item['CYRS']
            PLS = js_item['PLS']
            TJS = js_item['TJS']
            # 新闻来源 (news source)
            xwly_list = response.xpath(
                '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()').extract()
            XWLY = xwly_list[0] if xwly_list else ""
            # 编者 (editor)
            bz_list = response.xpath(
                '//div[@id="artical_sth2"]/p[1]/text()').extract()
            BZ = bz_list[0] if bz_list else ""
            print("===bz=====>>>>>", BZ)
            # 栏目路径 (breadcrumb: section;sub-section)
            lm1_list = response.xpath(
                '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()').extract()
            LMLJ1 = lm1_list[0] if lm1_list else ""
            # 成稿时间 (publication time)
            cgsj_list = response.xpath('//span[@class="ss01"]/text()').extract()
            CGSJ = cgsj_list[0] if cgsj_list else ""
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 正文文本 (body paragraphs)
            ZWWB = response.xpath('//div[@id="main_content"]/p/text()').extract()
            print("======ZWWB====>>>", ZWWB)
            ZWNR = ZWWB
            # 新闻缩略图 (thumbnail URL)
            tp_list = response.xpath(
                '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src').extract()
            TP_URL = tp_list[0] if tp_list else ""
            item['BTIT'] = BTIT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['CL_URL'] = response.url
            item['CGSJ'] = CGSJ
            print(item)
            yield item
def parse_item(self, response):
    """Parse an article page dated 20180601 or later and yield a FhwItem
    for the four tracked region sections (Selenium-free variant).

    Fixes over the original: all bare ``extract()[0]`` accesses are guarded
    so a missing node yields "" instead of raising IndexError, and the
    long ``or`` chain on LMLJ2 is an explicit membership test.
    """
    print("====", response.url)
    item = FhwItem()
    # 判断新闻在20180601之后 — date is the 5th path segment of the URL.
    atime = response.url
    if int(atime.split("/")[4]) >= 20180601:
        lm2_nodes = response.xpath(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        LMLJ2 = lm2_nodes.extract()[0] if len(lm2_nodes) else ""
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 标题 (title)
            bt_list = response.xpath(
                '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()').extract()
            BTIT = bt_list[0] if bt_list else ""
            # 新闻来源 (news source)
            xwly_list = response.xpath(
                '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()').extract()
            XWLY = xwly_list[0] if xwly_list else ""
            # 编者 (editor)
            bz_list = response.xpath(
                '//div[@id="artical_sth2"]/p[1]/text()').extract()
            BZ = bz_list[0] if bz_list else ""
            # 栏目路径 (breadcrumb: section;sub-section)
            lm1_list = response.xpath(
                '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()').extract()
            LMLJ1 = lm1_list[0] if lm1_list else ""
            # 成稿时间 (publication time)
            cgsj_list = response.xpath('//span[@class="ss01"]/text()').extract()
            CGSJ = cgsj_list[0] if cgsj_list else ""
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 正文文本 — only the first paragraph, as in the original.
            zwwb_list = response.xpath(
                '//div[@id="main_content"]/p/text()').extract()
            ZWWB = zwwb_list[0] if zwwb_list else ""
            # 新闻缩略图 (thumbnail URL)
            tp_list = response.xpath(
                '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src').extract()
            TP_URL = tp_list[0] if tp_list else ""
            item['BTIT'] = BTIT
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWWB
            item['TP_URL'] = TP_URL
            item['CGSJ'] = CGSJ
            item['URL'] = response.url
            print(item)
            yield item
def parse_datel(self, response):
    """Render a detail page in Chrome, parse it with lxml, and yield a
    FhwItem.

    Fixes over the original: the driver is closed via try/finally instead
    of leaking on any exception, and each ``xpath(...)[0]`` is guarded so
    a missing node produces "" instead of raising IndexError.
    """
    print("====", response.url)
    driver = webdriver.Chrome()
    try:
        driver.get(response.url)
        htmls = etree.HTML(driver.page_source)

        def first(xp, default=""):
            # First XPath match or *default* when nothing matches.
            nodes = htmls.xpath(xp)
            return nodes[0] if nodes else default

        item = FhwItem()
        # 文章标题 (title)
        title = first('//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()')
        # 文章内容 (body text nodes, kept as a list as in the original)
        cont = htmls.xpath(
            '//div[@id="main_content"]/p/text()|//div[@id="main_content"]/text()')
        # 新闻来源 (news source)
        source = first('//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()')
        # 栏目路径 (breadcrumb: section;sub-section)
        lm2 = first('//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        lm1 = first('//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        lmlj = lm1 + ";" + lm2
        # 参与人数 (participant count)
        cy_num = first('//div[@class="box03"]/h5/span/a/em/text()')
        # 评论数 (comment count)
        pl_num = first('//div[@class="box03"]/h5/a/em/text()')
        # 编者 (editor) — whole node list, as in the original
        bz = htmls.xpath('//div[@id="artical_sth2"]/p[1]/text()')
        # 成稿时间 (publication time)
        cg_time = first('//div[@id="artical_sth"]/p/span[1]/text()')
        # 推荐数 (recommendation count)
        tj_num = first('//div[@id="left_dz"]/span/text()')
        # 图片 (image src list, as in the original)
        img_list = htmls.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')
        item['BTIT'] = title        # 标题
        item['CYRS'] = cy_num       # 参与人数
        item['PLS'] = pl_num        # 评论数
        item['XWLY'] = source       # 新闻来源
        item['LMLJ'] = lmlj         # 栏目路径
        item['BZ'] = bz             # 编者
        item['CGSJ'] = cg_time      # 成稿时间
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # 采集时间
        item['ZWNR'] = cont         # 正文内容
        item['TJS'] = tj_num        # 推荐数
    finally:
        driver.close()
    yield item
def parse_item(self, response):
    """Parse an article page dated 20180601 or later and yield a FhwItem
    for the four tracked region sections.

    Fixes over the original:
    - The first alternative of the LMLJ2 XPath lacked the leading ``//``
      (``div[@class="speNav js_crumb"]/...``), making it a relative path
      that never matched; it also appeared twice in the union.
    - ``LMLJ2`` was unbound (NameError) whenever the breadcrumb XPath
      matched nothing; it now defaults to "".
    - ``CYRS, PLS, TJS = self.handlejs(...)`` tried to unpack the FhwItem
      returned by handlejs; the counts are now read by key.
    """
    print("====", response.url)
    item = FhwItem()
    # Article date extracted from the URL; skip anything before 2018-06-01.
    atime = self.strUrl(response.url)
    if int(atime) >= 20180601:
        lm2_nodes = response.xpath(
            '//div[@class="speNav js_crumb"]/a[2]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        LMLJ2 = lm2_nodes.extract()[0] if len(lm2_nodes) else ""
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 标题 (title)
            bt_list = response.xpath(
                '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()').extract()
            BTIT = bt_list[0] if bt_list else ""
            # Counts come from the JS-rendered page.
            js_item = self.handlejs(response.url)
            CYRS = js_item['CYRS']
            PLS = js_item['PLS']
            TJS = js_item['TJS']
            # 新闻来源 (news source)
            xwly_list = response.xpath(
                '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()').extract()
            XWLY = xwly_list[0] if xwly_list else ""
            # 编者 (editor)
            bz_list = response.xpath(
                '//div[@id="artical_sth2"]/p[1]/text()|//div[@id="main_content"]/p[12]/text()|//span[@class="ss04"]/span/text()').extract()
            BZ = bz_list[0] if bz_list else ""
            # 栏目路径 (breadcrumb: section;sub-section)
            lm1_list = response.xpath(
                '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()').extract()
            LMLJ1 = lm1_list[0] if lm1_list else ""
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 正文文本 — only the first paragraph, as in the original.
            zwwb_list = response.xpath(
                '//div[@id="main_content"]/p/text()').extract()
            ZWWB = zwwb_list[0] if zwwb_list else ""
            ZWNR = ZWWB
            # 新闻缩略图 (thumbnail URL)
            tp_list = response.xpath(
                '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src').extract()
            TP_URL = tp_list[0] if tp_list else ""
            item['BTIT'] = BTIT
            item['CYRS'] = int(CYRS)
            item['PLS'] = int(PLS)
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now()
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['YS_URL'] = response.url
            item['CL_URL'] = response.url
            print(item)
            yield item
def handlejs(self, url):
    """Open *url* in a Chrome driver and scrape article fields into a FhwItem.

    Only populates the field set when the second breadcrumb link (``LMLJ2``)
    is one of the four tracked regions; otherwise an (empty) item is
    returned.

    NOTE(review): the original guard
    ``len(...) > 0 and LMLJ2 == "台湾" or LMLJ2 == ...`` applied the length
    check only to the first comparison (operator precedence); rewritten as
    a membership test.  The driver is now always closed via try/finally.
    """
    item = FhwItem()
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Second breadcrumb segment, e.g. "台湾" / "大陆" / "国际" / "港澳".
        LMLJ2 = driver.find_element_by_xpath(
            '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]').text
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 参与人数 (participant count)
            CYRS = driver.find_element_by_xpath(
                '//em[@class="js_joinNum"][1]').text
            # 评论数 (comment count)
            PLS = driver.find_element_by_xpath(
                '//em[@class="js_cmtNum"][1]').text
            # 推荐数 (recommendation count)
            TJS = driver.find_element_by_xpath(
                '//div[@id="left_dz"]/span').text
            # 标题 (title)
            BT = driver.find_element_by_xpath(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]').text
            # 新闻来源 (news source)
            XWLY = driver.find_element_by_xpath(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a').text
            # 编者 (editor)
            BZ = driver.find_element_by_xpath(
                '//div[@id="artical_sth2"]/p[1]').text
            # 栏目路径 first segment (breadcrumb root)
            LMLJ1 = driver.find_element_by_xpath(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]').text
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 成稿时间 (publication time)
            CGSJ = driver.find_element_by_xpath('//span[@class="ss01"]').text
            # First article image (thumbnail candidate)
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            # 正文文本 (full body text)
            ZWWB = driver.find_element_by_xpath(
                '//div[@id="main_content"]').text
            TP_URL = ss.get_attribute('src')
            ZWNR = ZWWB + "|" + TP_URL
            item['BTIT'] = BT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CGSJ'] = CGSJ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['LMLJ2'] = LMLJ2
    finally:
        driver.close()
    return item
def handlejs(self, response):
    """Scrape an article page with headless Chrome, archive the rendered
    HTML to disk, and yield a populated FhwItem.

    Only the four tracked region sections are processed; any other section
    yields an (empty) item.

    NOTE(review): the original guard
    ``len(...) > 0 and LMLJ2 == "台湾" or LMLJ2 == ...`` applied the length
    check only to the first comparison (operator precedence); rewritten as
    a membership test.  The driver is now always closed via try/finally.
    """
    item = FhwItem()
    chrome_options = Options()
    # Run Chrome without a visible window.
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.get(response.url)
        # Second breadcrumb segment, e.g. "台湾" / "大陆" / "国际" / "港澳".
        LMLJ2 = driver.find_element_by_xpath(
            '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]').text
        if LMLJ2 in ("台湾", "大陆", "国际", "港澳"):
            # 参与人数 (participant count)
            CYRS = driver.find_element_by_xpath(
                '//em[@class="js_joinNum"][1]').text
            # 评论数 (comment count)
            PLS = driver.find_element_by_xpath(
                '//em[@class="js_cmtNum"][1]').text
            # 推荐数 (recommendation count)
            TJS = driver.find_element_by_xpath(
                '//div[@id="left_dz"]/span').text
            # 标题 (title)
            BT = driver.find_element_by_xpath(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]').text
            # 新闻来源 (news source)
            XWLY = driver.find_element_by_xpath(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a').text
            # 编者 (editor)
            BZ = driver.find_element_by_xpath(
                '//div[@id="artical_sth2"]/p[1]').text
            # 栏目路径 first segment (breadcrumb root)
            LMLJ1 = driver.find_element_by_xpath(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]').text
            LMLJ = LMLJ1 + ";" + LMLJ2
            # 成稿时间 (publication time)
            CGSJ = driver.find_element_by_xpath('//span[@class="ss01"]').text
            # First article image (thumbnail candidate)
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            # 正文文本 (full body text)
            ZWWB = driver.find_element_by_xpath(
                '//div[@id="main_content"]').text
            TP_URL = ss.get_attribute('src')
            # Archive the rendered page under ifeng/<section>/html/<basename>.
            urlpath = response.url
            WEBSITE = r'ifeng/' + LMLJ2 + '/html/'
            filename = os.path.join(WEBSITE, urlpath.split('/')[-1].strip('/'))
            file_dir = os.path.dirname(filename)
            os.makedirs(file_dir, exist_ok=True)
            with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
                f.write(driver.page_source)
            item['BTIT'] = BT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CGSJ'] = CGSJ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWWB
            item['TP_URL'] = TP_URL
            item['YS_URL'] = filename
            item['CL_URL'] = filename
            item['LMLJ2'] = LMLJ2
    finally:
        driver.close()
    yield item