示例#1
0
    def handlejs(self, url):
        """Render ``url`` in Chrome and scrape the article into a ``FhwItem``.

        Only pages whose second breadcrumb link reads 台湾, 大陆, 国际 or 港澳
        are scraped; every other page is skipped.

        Args:
            url: Address of the article page to load in the browser.

        Returns:
            The populated ``FhwItem``, or ``None`` when the page belongs to a
            different channel.

        Raises:
            Whatever ``find_element_by_xpath`` raises when an expected element
            is absent (``NoSuchElementException`` in Selenium).
        """
        item = FhwItem()
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            def text_of(xpath):
                # Visible text of the first element matching ``xpath``.
                return driver.find_element_by_xpath(xpath).text

            LMLJ2 = text_of(
                '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]')
            # Equivalent to the former ``len(text) > 0 and a or b or c or d``
            # chain: every accepted channel name is non-empty, so the length
            # check never changed the outcome.
            if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
                return None

            CYRS = text_of('//em[@class="js_joinNum"][1]')
            PLS = text_of('//em[@class="js_cmtNum"][1]')
            TJS = text_of('//div[@id="left_dz"]/span')
            BT = text_of(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]')
            XWLY = text_of(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a')
            BZ = text_of('//div[@id="artical_sth2"]/p[1]')
            LMLJ1 = text_of(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]')
            LMLJ = LMLJ1 + ";" + LMLJ2
            CGSJ = text_of('//span[@class="ss01"]')
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            ZWWB = text_of('//div[@id="main_content"]')
            TP_URL = ss.get_attribute('src')
            # Body content = body text and thumbnail URL joined by "|".
            ZWNR = ZWWB + "|" + TP_URL

            item['BTIT'] = BT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CGSJ'] = CGSJ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['LMLJ2'] = LMLJ2
            return item
        finally:
            # Always end the browser session: previously the driver was only
            # closed on the success path, so skipped pages and lookup errors
            # left a Chrome instance running.
            driver.quit()
示例#2
0
    def parse_datel(self, response):
        """Extract the fields of an article detail page with XPath.

        Args:
            response: Response of the detail page; queried with
                ``response.xpath`` and read for ``response.url``.

        Yields:
            One ``FhwItem`` whose ``BTIT``, ``ZWWB`` and ``XWLY`` fields hold
            the lists of matched text nodes and whose ``TP_URL`` holds the
            page URL.
        """
        print("====", response.url)
        item = FhwItem()
        # Article title
        title = response.xpath('//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()').extract()
        # Article body text
        cont = response.xpath('//div[@id="main_content"]/p/text()').extract()
        # News source
        source = response.xpath('//span[@class="ss03"]/a/text()|//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()').extract()

        item['BTIT'] = title
        item['ZWWB'] = cont
        item['XWLY'] = source
        # NOTE(review): the former image lookup (//p[@class="detailPic"]/img/@src)
        # was computed but never stored, so it has been dropped as dead code.
        # TP_URL still receives the page URL exactly as before; confirm whether
        # the image URL was intended here.
        item['TP_URL'] = response.url
        yield item
示例#3
0
    def parse_datel(self, response):
        """Render an article detail page in Chrome and yield it as a ``FhwItem``.

        The page is re-fetched with Selenium, the rendered HTML is parsed with
        lxml, each field is read by XPath (a missing field becomes ``""``), the
        rendered page is written to ``ifeng/<channel>/html/<file>`` and the
        populated item is yielded.

        Args:
            response: Response whose ``url`` is loaded in the browser.

        Yields:
            One populated ``FhwItem``.

        Raises:
            Whatever ``find_element_by_xpath`` raises when the page has no
            ``div#main_content`` (``NoSuchElementException`` in Selenium).
        """
        print("====", response.url)
        chrome_options = get_chrome_options()
        driver = webdriver.Chrome(executable_path=windows_chrome_driver,
                                  chrome_options=chrome_options)
        try:
            driver.get(response.url)
            # Take a single snapshot so the parsed tree and the saved file
            # describe the same DOM state.
            page_source = driver.page_source
            zwwbhtml = driver.find_element_by_xpath(
                '//div[@id="main_content"]').get_attribute('outerHTML')
        finally:
            # Release the browser even when loading or the lookup fails;
            # previously an exception left the Chrome instance running.
            driver.quit()

        htmls = etree.HTML(page_source)
        item = FhwItem()

        def first(expr):
            # First XPath match, or "" when nothing matches.
            found = htmls.xpath(expr)
            return found[0] if found else ""

        # Article title
        title = first(
            '//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()')
        # Body text: all matched text nodes concatenated ("" when none match)
        cont = ''.join(htmls.xpath(
            '//div[@id="main_content"]/p/text()|//div[@id="main_content"]/text()'))
        # News source
        source = first(
            '//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()')
        # Section path: "<first crumb>;<second crumb>"
        lm2 = first(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        lm1 = first(
            '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        lmlj = lm1 + ";" + lm2
        # Participant count
        cy_num = first('//div[@class="box03"]/h5/span/a/em/text()')
        # Comment count
        pl_num = first('//div[@class="box03"]/h5/a/em/text()')
        # Editor. NOTE(review): kept as the full list of matches when present
        # (and "" otherwise), exactly as before; confirm whether consumers
        # expect a list here.
        bz = htmls.xpath('//div[@id="artical_sth2"]/p[1]/text()') or ""
        # Time of writing
        cg_time = first('//div[@id="artical_sth"]/p/span[1]/text()')
        # Recommendation count
        tj_num = first('//div[@id="left_dz"]/span/text()')
        # Thumbnail: first image URL found
        img_list = first(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')

        img_url = '//p[@class="detailPic"]/img'
        ZWNR = self.process_image_src(zwwbhtml, img_url)
        print('=ZWNR===', ZWNR, type(ZWNR))

        # e.g. http://news.ifeng.com/a/20190108/60227925_0.shtml
        urlpath = response.url
        # NOTE(review): lm2 is text scraped from the page and is used verbatim
        # as a directory name.
        WEBSITE = r'ifeng/' + lm2 + '/html/'
        filename = os.path.join(WEBSITE, urlpath.split('/')[-1].strip('/'))
        file_dir = os.path.dirname(filename)
        os.makedirs(file_dir, exist_ok=True)
        with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(page_source)

        # Title
        item['BTIT'] = title
        # Participant count
        item['CYRS'] = cy_num
        # Comment count
        item['PLS'] = pl_num
        # News source
        item['XWLY'] = source
        # Section path
        item['LMLJ'] = lmlj
        # Editor
        item['BZ'] = bz
        # Time of writing
        item['CGSJ'] = cg_time
        # Collection time
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Body text with line breaks removed
        item['ZWWB'] = cont.replace("\n", "").replace("\r", "")
        # Body content
        item['ZWNR'] = ZWNR
        # Recommendation count
        item['TJS'] = tj_num
        # Original page link
        item['YS_URL'] = filename
        # Processed page link
        item['CL_URL'] = filename
        # Thumbnail link
        item['TP_URL'] = img_list
        # Source URL
        item['URL'] = urlpath
        item['LMLJ2'] = lm2
        yield item
示例#4
0
    def parse_item(self, response):
        """Build a ``FhwItem`` from an article page dated 2018-06-01 or later.

        Pages are skipped when ``self.strUrl(response.url)`` is below
        ``20180601`` or when the second breadcrumb link is not one of 台湾,
        大陆, 国际 or 港澳. Text fields that are missing from the page are
        stored as ``""`` instead of aborting the whole item.

        Args:
            response: Response of the article page.

        Yields:
            At most one populated ``FhwItem``.
        """
        print("====", response.url)
        item = FhwItem()
        atime = self.strUrl(response.url)
        if int(atime) < 20180601:
            return

        def first(expr):
            # First extracted match of ``expr``, or "" when nothing matches.
            found = response.xpath(expr).extract()
            return found[0] if found else ""

        LMLJ2 = first(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
            return

        # Title
        BTIT = first(
            '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()')
        # Participant count, comment count, recommendation count.
        # handlejs is defined elsewhere; it must return three values here.
        CYRS, PLS, TJS = self.handlejs(response.url)
        # News source
        XWLY = first(
            '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()')
        # Editor
        BZ = first('//div[@id="artical_sth2"]/p[1]/text()')
        if BZ:
            print("===bz=====>>>>>", BZ)
        # Section path: "<first crumb>;<second crumb>"
        LMLJ1 = first(
            '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        # Time of writing
        CGSJ = first('//span[@class="ss01"]/text()')
        LMLJ = LMLJ1 + ";" + LMLJ2
        # Body text: list of paragraph text nodes
        ZWWB = response.xpath('//div[@id="main_content"]/p/text()').extract()
        print("======ZWWB====>>>", ZWWB)
        # Body content (same value as the body text)
        ZWNR = ZWWB
        # Thumbnail URL; "" for articles without an image, which previously
        # raised IndexError and dropped the item
        TP_URL = first(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')

        item['BTIT'] = BTIT
        item['CYRS'] = CYRS
        item['PLS'] = PLS
        item['XWLY'] = XWLY
        item['LMLJ'] = LMLJ
        item['ZWWB'] = ZWWB
        item['BZ'] = BZ
        item['TJS'] = TJS
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['ZWNR'] = ZWNR
        item['TP_URL'] = TP_URL
        item['CGSJ'] = CGSJ
        print(item)
        yield item
示例#5
0
    def parse_item(self, response):
        """Build a ``FhwItem`` from an article page dated 2018-06-01 or later.

        The date is the fifth ``/``-separated component of ``response.url``
        (index 4), compared as an integer against ``20180601``. Pages whose
        second breadcrumb link is not 台湾, 大陆, 国际 or 港澳 are skipped.
        Fields that are missing from the page are stored as ``""`` instead of
        aborting the whole item.

        Args:
            response: Response of the article page.

        Yields:
            At most one populated ``FhwItem``.

        Raises:
            IndexError, ValueError: If the URL has no integer component at
                index 4.
        """
        print("====", response.url)
        item = FhwItem()
        # Only keep news published on or after 2018-06-01
        atime = response.url
        if int(atime.split("/")[4]) < 20180601:
            return

        def first(expr):
            # First extracted match of ``expr``, or "" when nothing matches.
            found = response.xpath(expr).extract()
            return found[0] if found else ""

        LMLJ2 = first(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
            return

        # Title
        BTIT = first(
            '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()')
        # News source
        XWLY = first(
            '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()')
        # Editor
        BZ = first('//div[@id="artical_sth2"]/p[1]/text()')
        # Section path: "<first crumb>;<second crumb>"
        LMLJ1 = first(
            '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        # Time of writing
        CGSJ = first('//span[@class="ss01"]/text()')
        LMLJ = LMLJ1 + ";" + LMLJ2
        # Body text: first paragraph text node only, as before
        ZWWB = first('//div[@id="main_content"]/p/text()')
        # Thumbnail URL; "" for articles without an image, which previously
        # raised IndexError and dropped the item
        TP_URL = first(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')

        item['BTIT'] = BTIT
        item['XWLY'] = XWLY
        item['LMLJ'] = LMLJ
        item['ZWWB'] = ZWWB
        item['BZ'] = BZ
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Body content (same value as the body text)
        item['ZWNR'] = ZWWB
        item['TP_URL'] = TP_URL
        item['CGSJ'] = CGSJ
        item['URL'] = response.url
        print(item)
        yield item
示例#6
0
    def parse_datel(self, response):
        """Render an article detail page in Chrome and yield it as a ``FhwItem``.

        The page is re-fetched with Selenium, the rendered HTML is parsed with
        lxml and each field is read by XPath. Single-valued fields that are
        missing from the page are stored as ``""``.

        Args:
            response: Response whose ``url`` is loaded in the browser.

        Yields:
            One populated ``FhwItem``.
        """
        print("====", response.url)
        driver = webdriver.Chrome()
        try:
            driver.get(response.url)
            page_source = driver.page_source
        finally:
            # Only the rendered HTML is needed, so the browser is released
            # right away. Previously any failing lookup below left the Chrome
            # instance running.
            driver.quit()

        htmls = etree.HTML(page_source)
        item = FhwItem()

        def first(expr):
            # First XPath match, or "" when nothing matches (previously an
            # unguarded [0] raised IndexError and lost the whole item).
            found = htmls.xpath(expr)
            return found[0] if found else ""

        # Article title
        title = first(
            '//div[@class="yc_tit"]/h1/text()|//div[@id="artical"]/h1/text()')
        # Article body: list of matched text nodes
        cont = htmls.xpath(
            '//div[@id="main_content"]/p/text()|//div[@id="main_content"]/text()'
        )
        # News source
        source = first(
            '//span[@class="ss03"]/a/text()|//div[@class="yc_tit"]/p/a/text()')
        # Section path: "<first crumb>;<second crumb>"
        lm2 = first(
            '//div[@class="theLogo"]/div/a[2]/text()|//div[@class="h_nav"]/a[2]/text()')
        lm1 = first(
            '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        lmlj = lm1 + ";" + lm2
        # Participant count
        cy_num = first('//div[@class="box03"]/h5/span/a/em/text()')
        # Comment count
        pl_num = first('//div[@class="box03"]/h5/a/em/text()')
        # Editor: list of matched text nodes, as before
        bz = htmls.xpath('//div[@id="artical_sth2"]/p[1]/text()')
        # Time of writing
        cg_time = first('//div[@id="artical_sth"]/p/span[1]/text()')
        # Recommendation count
        tj_num = first('//div[@id="left_dz"]/span/text()')

        # Title
        item['BTIT'] = title
        # Participant count
        item['CYRS'] = cy_num
        # Comment count
        item['PLS'] = pl_num
        # News source
        item['XWLY'] = source
        # Section path
        item['LMLJ'] = lmlj
        # Editor
        item['BZ'] = bz
        # Time of writing
        item['CGSJ'] = cg_time
        # Collection time
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Body content
        item['ZWNR'] = cont
        # Recommendation count
        item['TJS'] = tj_num

        yield item
示例#7
0
    def parse_item(self, response):
        """Build a ``FhwItem`` from an article page dated 2018-06-01 or later.

        Pages are skipped when ``self.strUrl(response.url)`` is below
        ``20180601`` or when the extracted channel name is not one of 台湾,
        大陆, 国际 or 港澳. Text fields that are missing from the page are
        stored as ``""``.

        Args:
            response: Response of the article page.

        Yields:
            At most one populated ``FhwItem``.

        Raises:
            ValueError: If the participant or comment count returned by
                ``self.handlejs`` cannot be converted with ``int``.
        """
        print("====", response.url)
        item = FhwItem()
        atime = self.strUrl(response.url)
        if int(atime) < 20180601:
            return

        def first(expr):
            # First extracted match of ``expr``, or "" when nothing matches.
            found = response.xpath(expr).extract()
            return found[0] if found else ""

        # The original expression also carried a leading alternative without
        # "//" that merely duplicated the final one; it is dropped here.
        # NOTE(review): the h_nav / theLogo alternatives select a[1], the same
        # nodes used for LMLJ1 below - confirm that a[2] was not intended.
        LMLJ2 = first(
            '//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()|//div[@class="speNav js_crumb"]/a[2]/text()')
        if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
            return

        # Title
        BTIT = first(
            '//div[@id="titL"]/h1/text()|//div[@class="yc_tit"]/h1/text()|//h1[@id="artical_topic"]/text()')
        # Participant count, comment count, recommendation count.
        # handlejs is defined elsewhere; it must return three values here.
        CYRS, PLS, TJS = self.handlejs(response.url)
        # News source
        XWLY = first(
            '//span[@class="ss03"]/text()|//div[@id="yc_con_txt"]/p/text()|//span[@class="ss03"]/a/text()|//span[@class="ss03 weMediaicon"]/a/text()')
        # Editor
        BZ = first(
            '//div[@id="artical_sth2"]/p[1]/text()|//div[@id="main_content"]/p[12]/text()|//span[@class="ss04"]/span/text()')
        # Section path: "<first crumb>;<channel>"
        LMLJ1 = first(
            '//div[@class="speNav js_crumb"]/a[1]/text()|//div[@class="h_nav"]/a[1]/text()|//div[@class="theLogo"]/div/a[1]/text()')
        LMLJ = LMLJ1 + ";" + LMLJ2
        # Body text: first paragraph text node; "" when the body is empty
        # (previously an unguarded [0] raised IndexError)
        ZWWB = first('//div[@id="main_content"]/p/text()')
        # Body content (same value as the body text)
        ZWNR = ZWWB
        # Thumbnail URL
        TP_URL = first(
            '//p[@class="detailPic"]/img/@src|//div[@class="yc_con_txt"]/p/img/@src|//div[@id="main_content"]/p/img/@src|//div[@class="box02"][1]/img/@src')

        item['BTIT'] = BTIT
        item['CYRS'] = int(CYRS)
        item['PLS'] = int(PLS)
        item['XWLY'] = XWLY
        item['LMLJ'] = LMLJ
        item['ZWWB'] = ZWWB
        item['BZ'] = BZ
        item['TJS'] = TJS
        item['CJSJ'] = datetime.now()
        item['ZWNR'] = ZWNR
        item['TP_URL'] = TP_URL
        item['YS_URL'] = response.url
        item['CL_URL'] = response.url
        print(item)
        yield item
示例#8
0
    def handlejs(self, url):
        """Render ``url`` in Chrome and scrape the article into a ``FhwItem``.

        Only pages whose second breadcrumb link reads 台湾, 大陆, 国际 or 港澳
        are scraped; every other page is skipped.

        Args:
            url: Address of the article page to load in the browser.

        Returns:
            The populated ``FhwItem``, or ``None`` when the page belongs to a
            different channel.

        Raises:
            Whatever ``find_element_by_xpath`` raises when an expected element
            is absent (``NoSuchElementException`` in Selenium).
        """
        item = FhwItem()
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            def text_of(xpath):
                # Visible text of the first element matching ``xpath``.
                return driver.find_element_by_xpath(xpath).text

            LMLJ2 = text_of(
                '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]')
            # Equivalent to the former ``len(text) > 0 and a or b or c or d``
            # chain: every accepted channel name is non-empty, so the length
            # check never changed the outcome.
            if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
                return None

            CYRS = text_of('//em[@class="js_joinNum"][1]')
            PLS = text_of('//em[@class="js_cmtNum"][1]')
            TJS = text_of('//div[@id="left_dz"]/span')
            BT = text_of(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]')
            XWLY = text_of(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a')
            BZ = text_of('//div[@id="artical_sth2"]/p[1]')
            LMLJ1 = text_of(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]')
            LMLJ = LMLJ1 + ";" + LMLJ2
            CGSJ = text_of('//span[@class="ss01"]')
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            ZWWB = text_of('//div[@id="main_content"]')
            TP_URL = ss.get_attribute('src')
            # Body content = body text and thumbnail URL joined by "|".
            ZWNR = ZWWB + "|" + TP_URL

            item['BTIT'] = BT
            item['CYRS'] = CYRS
            item['PLS'] = PLS
            item['XWLY'] = XWLY
            item['LMLJ'] = LMLJ
            item['ZWWB'] = ZWWB
            item['BZ'] = BZ
            item['CGSJ'] = CGSJ
            item['TJS'] = TJS
            item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            item['ZWNR'] = ZWNR
            item['TP_URL'] = TP_URL
            item['LMLJ2'] = LMLJ2
            return item
        finally:
            # Always end the browser session: previously the driver was only
            # closed on the success path, so skipped pages and lookup errors
            # left a Chrome instance running.
            driver.quit()
示例#9
0
    def handlejs(self, response):
        """Render the page in headless Chrome, save it and yield a ``FhwItem``.

        Only pages whose second breadcrumb link reads 台湾, 大陆, 国际 or 港澳
        are processed. For those, the rendered page source is written to
        ``ifeng/<channel>/html/<last URL segment>`` and the scraped fields are
        yielded as one item.

        Args:
            response: Response whose ``url`` is loaded in the browser.

        Yields:
            At most one populated ``FhwItem``.

        Raises:
            Whatever ``find_element_by_xpath`` raises when an expected element
            is absent (``NoSuchElementException`` in Selenium).
        """
        item = FhwItem()
        chrome_options = Options()
        # Run Chrome without a visible window
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            driver.get(response.url)

            def text_of(xpath):
                # Visible text of the first element matching ``xpath``.
                return driver.find_element_by_xpath(xpath).text

            LMLJ2 = text_of(
                '//div[@class="theLogo"]/div/a[2]|//div[@class="h_nav"]/a[2]')
            # Equivalent to the former ``len(text) > 0 and a or b or c or d``
            # chain: every accepted channel name is non-empty.
            if LMLJ2 not in ("台湾", "大陆", "国际", "港澳"):
                return

            CYRS = text_of('//em[@class="js_joinNum"][1]')
            PLS = text_of('//em[@class="js_cmtNum"][1]')
            TJS = text_of('//div[@id="left_dz"]/span')
            BT = text_of(
                '//div[@id="titL"]/h1|//div[@class="yc_tit"]/h1|//h1[@id="artical_topic"]')
            XWLY = text_of(
                '//div[@id="yc_con_txt"]/p|//span[@class="ss03"]/a|//span[@class="ss03 weMediaicon"]/a')
            BZ = text_of('//div[@id="artical_sth2"]/p[1]')
            LMLJ1 = text_of(
                '//div[@class="speNav js_crumb"]/a[1]|//div[@class="h_nav"]/a[1]|//div[@class="theLogo"]/div/a[1]')
            LMLJ = LMLJ1 + ";" + LMLJ2
            CGSJ = text_of('//span[@class="ss01"]')
            ss = driver.find_element_by_xpath(
                '//p[@class="detailPic"]/img|//div[@class="yc_con_txt"]/p/img|//div[@id="main_content"]/p/img|//div[@class="box02"][1]/img')
            ZWWB = text_of('//div[@id="main_content"]')
            TP_URL = ss.get_attribute('src')
            page_source = driver.page_source
        finally:
            # End the browser session on every path. Previously skipped
            # channels and lookup errors left a headless Chrome running, and
            # the session stayed open while the generator was suspended.
            driver.quit()

        # e.g. http://news.ifeng.com/a/20190108/60227925_0.shtml
        urlpath = response.url
        WEBSITE = r'ifeng/' + LMLJ2 + '/html/'
        filename = os.path.join(WEBSITE, urlpath.split('/')[-1].strip('/'))
        file_dir = os.path.dirname(filename)
        os.makedirs(file_dir, exist_ok=True)
        with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(page_source)

        item['BTIT'] = BT
        item['CYRS'] = CYRS
        item['PLS'] = PLS
        item['XWLY'] = XWLY
        item['LMLJ'] = LMLJ
        item['ZWWB'] = ZWWB
        item['BZ'] = BZ
        item['CGSJ'] = CGSJ
        item['TJS'] = TJS
        item['CJSJ'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Body content (same value as the body text)
        item['ZWNR'] = ZWWB
        item['TP_URL'] = TP_URL
        item['YS_URL'] = filename
        item['CL_URL'] = filename
        item['LMLJ2'] = LMLJ2

        yield item