Example #1
import datetime
import re
import time
from urllib.parse import urljoin

import scrapy

# Project-local names used below but defined elsewhere in the real module:
# EduInformationItem, BloomFilter, contentfilter, the *_slice helpers, and the
# module-level defaults (BL, click_num, user_id, writer_defined,
# news_source_defined, titlepic, filename, ...). Their import paths are not
# shown in these examples.


class XueqianSpider(scrapy.Spider):
    name = "news_eastday_com_gd2008_society_63"
    # allowed_domains = ["news.eastday.com"]
    start_urls = [
        "http://news.eastday.com/gd2008/society/index.html",
        "http://news.eastday.com/eastday/13news/auto/news/society/index_K33.html"
    ]
    custom_settings = {"DOWNLOAD_DELAY": 0.2}
    class_id = 63
    num = 1
    items = EduInformationItem()  # note: one item instance shared by every response (see the note after the last example)
    flags = True
    bf = BloomFilter()
    next_index = ""
    header = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        # "Connection": "keep-alive",
        "Host": "news.eastday.com",
        "Pragma": "no-cache",
        "Referer": "http://news.eastday.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    }

    def parse(self, response):
        node_obj = response.xpath(
            '''//div[@id="left"]/ul/li|//div[@class="leftsection"]/ul/li''')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('a/@href').extract_first()
            time_node = detail.xpath(
                'span[@class="hui12"]/text()|span[@class="black12 fr text4"]/text()'
            ).extract_first(default="").strip()
            if not url:  # guard before urljoin: urljoin(base, None) raises TypeError
                continue
            url = urljoin(response.url, url)
            if BL:
                if self.bf.isContains(url):  # skip URLs we have already crawled
                    print('url exists!')
                else:
                    self.bf.insert(url)
                    print("Requesting detail page:", url)
                    yield scrapy.Request(url,
                                         callback=self.parse_detail,
                                         headers=self.header,
                                         meta={"time_node": time_node})
            else:
                yield scrapy.Request(url,
                                     callback=self.parse_detail,
                                     headers=self.header,
                                     meta={"time_node": time_node})

        # pagination (disabled)
        # next_node = response.xpath('''//div[@class="plist"]/div/a[contains(text(),"下一页")]/@href''').extract_first()
        # if next_node is not None:
        #     next_page = urljoin(response.url, next_node)
        #     print("Requesting next page:", next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):

        # title
        title = response.xpath('//div[@id="biaoti"]/text()').extract_first(
            default="")
        title = title.strip()
        title = title_slice(title)
        # keywords (keyman)
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content''').extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)
        else:
            keyman = ""

        if title:
            # summary
            try:
                summary = response.xpath('//meta[@name="description"]/@content'
                                         ).extract_first(default="").strip()
                summary = summary.replace("东方网-东方新闻-", "")
            except Exception as e:
                summary = ""
            summary = summay_slice(summary)
            index_node = response.xpath(
                'string(//div[@class="time grey12a fc lh22"]/p[last()])'
            ).extract_first()

            try:
                time_node = response.meta.get("time_node", "")
                time_node = time_node.replace("/", "-")
                news_time = datetime.datetime.strptime(
                    str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
                news_time = int(time.mktime(news_time.timetuple()))
            except Exception as e:
                print(e, "time")
                news_time = None

            # sample byline: '来源:新华社 作者:胡浩 林晖 朱基钗 史竞男 选稿:刘晓晶 '
            # (source / author / editor) -- the regexes below split on those labels
            # writer
            try:
                writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node,
                                   re.S).group(1)
                writer = writer.strip()
            except Exception as e:
                print(e, "writer")
                writer = writer_defined
            writer = writer_slice(writer)
            # news source: try progressively looser patterns on the byline
            try:
                source = re.search(r".*?来源:(.*?)作者:.*?", index_node,
                                   re.S).group(1)
                source = source.strip()
            except Exception as e:
                try:
                    source = re.search(r".*?来源:(.*?)选稿:.*?", index_node,
                                       re.S).group(1)
                    source = source.strip()
                except Exception as e:
                    try:
                        source = re.search(r".*?来源:(.*)", index_node,
                                           re.S).group(1)
                        source = source.strip()
                    except Exception as e:
                        print(e, "source")
                        source = news_source_defined
            news_source = news_source_slice(source)

            # article body (content)
            content = response.xpath('//div[@id="zw"]').extract_first()
            # strip spaces and the remains of &nbsp; entities
            content = content.replace(" ", "")
            content = content.replace("&nbsp", "")
            content = content.replace("&", "")
            content = content.replace("nbsp", "")
            content = contentfilter(content)
            self.items["news_keyman"] = keyman
            self.items["title"] = title
            self.items["content"] = content
            self.items['content_summary'] = summary
            self.items['click_num'] = click_num
            self.items['news_time'] = news_time
            self.items['news_source'] = news_source
            self.items['writer'] = writer
            #
            #
            self.items["class_id"] = self.class_id
            self.items["user_id"] = user_id
            self.items["istop"] = istop
            self.items["ismember"] = ismember
            self.items["userfen"] = userfen
            self.items["isgood"] = isgood
            self.items["user_name"] = "admin"
            self.items["group_id"] = group_id
            self.items["plnum"] = plnum
            self.items["first_title"] = first_title
            self.items["is_qf"] = is_qf
            self.items["totaldown"] = totaldown
            self.items["have_html"] = have_html
            self.items["last_dotime"] = int(time.time())
            self.items["diggtop"] = diggtop
            self.items["stb"] = stb
            self.items["ttid"] = ttid
            self.items["ispic"] = ispic
            self.items["isurl"] = isurl
            self.items["fstb"] = fstb
            self.items["restb"] = restb
            self.items["news_tem_pid"] = news_tem_pid
            self.items["dokey"] = dokey
            self.items["closepl"] = closepl
            self.items["haveaddfen"] = haveaddfen
            self.items["infotags"] = keyman
            self.items["checked"] = checked
            self.items["keyid"] = keyid
            self.items["news_path"] = news_path
            self.items["titlepic"] = titlepic
            self.items["ftitle"] = ftitle
            #
            #
            self.items['filename'] = filename
            self.items['titlefont'] = titlefont
            self.items['title_url_z'] = title_url_z
            self.items['originalurl'] = response.url
            #
            yield self.items
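
Note: the field helpers used throughout (title_slice, keyman_slice, summay_slice, writer_slice, news_source_slice) and contentfilter are project-local and not shown. A minimal sketch, assuming each one simply truncates its field to a database column width (the limits are illustrative guesses, not the project's real values):

# Hypothetical stand-ins for the project's field-slicing helpers.
# Assumption: each trims a scraped string so it fits a DB column.
def _slice(text, limit):
    return (text or "")[:limit]

def title_slice(t):
    return _slice(t, 100)   # assumed column width

def keyman_slice(k):
    return _slice(k, 255)

def summay_slice(s):        # (sic: the project's own spelling)
    return _slice(s, 255)

def writer_slice(w):
    return _slice(w, 30)

def news_source_slice(s):
    return _slice(s, 50)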
Example #2

class XueqianSpider(scrapy.Spider):
    name = "mobile_zol_com_cn_more_3_506_85"
    # allowed_domains = ["eol.cn"]
    start_urls = ["http://mobile.zol.com.cn/more/3_506.shtml"]
    # custom_settings = {"DOWNLOAD_DELAY": 0.3}
    class_id = 85
    num = 0
    items = EduInformationItem()
    flags = True
    page_count = 1
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):

        node_obj = response.xpath(
            '''//ul[contains(@class,"content-list")]/li''')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('div/a/@href').extract_first()
            # new_time = detail.xpath('//ul[@class="ysh-test-list"]/li/p[contains(@class,"tickling")]/text()').extract_first()
            titlepic_image = detail.xpath('''div/a/img/@src''').extract_first()
            if not titlepic_image:
                # lazy-loaded image: recover the real URL from the onerror handler
                titlepic_images = detail.xpath('''div/a/img''').extract_first()
                titlepic_image = re.search(
                    r'''<img.*?onerror="javascript:this\.src=.*?\.src=(.*?\.([Jj][pP][gG]|[Pp][Nn][gG])).*?>''',
                    titlepic_images).group(1)
            if not url:  # guard before urljoin: urljoin(base, None) raises TypeError
                continue
            url = urljoin(response.url, url)
            if BL:
                if self.bf.isContains(url):  # skip URLs we have already crawled
                    print('url exists!', url)
                else:
                    self.bf.insert(url)
                    print("Requesting detail page:", url)
                    yield scrapy.Request(
                        url,
                        callback=self.parse_detail,
                        meta={"titlepic_image": titlepic_image})
            else:
                yield scrapy.Request(
                    url,
                    callback=self.parse_detail,
                    meta={"titlepic_image": titlepic_image})

        # pagination: follow the "下一页" (next page) link, up to 10 extra pages
        next_node = response.xpath(
            '''//div[@class="page"]/a[contains(text(),"下一页")]/@href'''
        ).extract_first()
        if next_node and self.num <= 10:
            next_page = urljoin(response.url, next_node)
            print("Requesting next page:", next_page)
            self.num += 1
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):

        # title
        title = response.xpath('//h1/text()').extract_first(default="")

        # keywords (keyman)
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content''').extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)
        else:
            keyman = ""

        if title:

            title = title_slice(title)
            # summary
            try:
                summary = response.xpath('//meta[@name="description"]/@content'
                                         ).extract_first(default="")
            except Exception as e:
                summary = ""
            summary = summay_slice(summary)

            titlepic_image = response.meta.get("titlepic_image", "")

            index_node = response.xpath(
                '''string(//div[@class="article-aboute"])''').extract_first()
            try:
                time_node = response.xpath(
                    '''//div[@class="article-aboute"]/span[@id="pubtime_baidu"]/text()'''
                ).extract_first()
                news_time = datetime.datetime.strptime(
                    str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
                news_time = int(time.mktime(news_time.timetuple()))
            except Exception as e:
                print("time", e)
                news_time = None

            # writer
            writer = writer_defined
            try:
                writer = response.xpath(
                    '''string(//div[@class="article-aboute"]/span[@id="author_baidu"])'''
                ).extract_first()
                writer = writer.replace("作者:", "")
                writer = writer.strip()
            except Exception as e:
                print(e, "writer")
                writer = writer_defined
            writer = writer_slice(writer)
            # news source
            news_source = news_source_defined
            try:
                source = response.xpath(
                    '''//div[@class="article-aboute"]/span[@id="source_baidu"]/text()'''
                ).extract_first()
                source = source.replace("[", "").replace("]", "")
                source = source.strip()
            except Exception as e:
                print(e, "source")
                source = news_source_defined
            news_source = news_source_slice(source)

            # article body (content)
            content = response.xpath(
                '//div[@id="article-content"]').extract_first()
            # strip the remains of &nbsp; / &amp; entities
            content = content.replace("&nbsp", "")
            content = content.replace("&amp;", "")
            content = content.replace("nbsp", "")
            content = contentfilter(content)
            self.items["news_keyman"] = keyman
            self.items["title"] = title
            self.items["content"] = content
            self.items['content_summary'] = summary
            self.items['click_num'] = click_num
            self.items['news_time'] = news_time
            self.items['news_source'] = news_source
            self.items['writer'] = writer
            #
            #
            self.items["class_id"] = self.class_id
            self.items["user_id"] = user_id
            self.items["istop"] = istop
            self.items["ismember"] = ismember
            self.items["userfen"] = userfen
            self.items["isgood"] = isgood
            self.items["user_name"] = "admin"
            self.items["group_id"] = group_id
            self.items["plnum"] = plnum
            self.items["first_title"] = first_title
            self.items["is_qf"] = is_qf
            self.items["totaldown"] = totaldown
            self.items["have_html"] = have_html
            self.items["last_dotime"] = int(time.time())
            self.items["diggtop"] = diggtop
            self.items["stb"] = stb
            self.items["ttid"] = ttid
            self.items["ispic"] = ispic
            self.items["isurl"] = isurl
            self.items["fstb"] = fstb
            self.items["restb"] = restb
            self.items["news_tem_pid"] = news_tem_pid
            self.items["dokey"] = dokey
            self.items["closepl"] = closepl
            self.items["haveaddfen"] = haveaddfen
            self.items["infotags"] = keyman
            self.items["checked"] = checked
            self.items["keyid"] = keyid
            self.items["news_path"] = news_path
            self.items["titlepic"] = titlepic_image
            self.items["ftitle"] = ftitle
            #
            #
            self.items['filename'] = filename
            self.items['titlefont'] = titlefont
            self.items['title_url_z'] = title_url_z
            self.items['originalurl'] = response.url
            #
            yield self.items
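
Note: BloomFilter is project-local too; the code above only assumes an isContains/insert interface. A minimal in-memory sketch with that interface (the real class is presumably persistent, e.g. Redis-backed, since a fresh in-memory filter forgets everything between runs):

# Minimal stand-in for the project's BloomFilter (interface assumed
# from the calls above: isContains(key) and insert(key)).
import hashlib

class BloomFilter:
    def __init__(self, size=1 << 20, hash_count=4):
        self.size = size
        self.hash_count = hash_count
        self.bits = bytearray(size // 8)

    def _positions(self, key):
        # derive hash_count bit positions from salted md5 digests
        for i in range(self.hash_count):
            digest = hashlib.md5(("%d:%s" % (i, key)).encode("utf-8")).hexdigest()
            yield int(digest, 16) % self.size

    def isContains(self, key):
        return all(self.bits[p // 8] & (1 << (p % 8)) for p in self._positions(key))

    def insert(self, key):
        for p in self._positions(key):
            self.bits[p // 8] |= 1 << (p % 8)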
Example #3
class XueqianSpider(scrapy.Spider):
    name = "sports_huanqiu_com_others_zh_67"
    # allowed_domains = ["eol.cn"]
    start_urls = [
        "http://sports.huanqiu.com/others/zh/",
        "http://sports.huanqiu.com/basketball/nba/",
        "http://sports.huanqiu.com/basketball/cba/",
        "http://sports.huanqiu.com/soccer/gn/",
        "http://sports.huanqiu.com/soccer/xj/",
        "http://sports.huanqiu.com/soccer/yc/"
    ]
    # custom_settings = {"DOWNLOAD_DELAY": 0.3}
    class_id = 67
    num = 0
    items = EduInformationItem()
    flags = True
    page_count = 1
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):

        node_obj = response.xpath('''//div[@class="fallsFlow"]/ul/li''')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('h3/a/@href').extract_first()

            # new_time = detail.xpath('p[contains(@class,"time")]/text()').extract_first()
            titlepic_image = detail.xpath('''a/img/@src''').extract_first(
                default="")
            # if str(titlepic_image).startswith("//"):
            #     if str(titlepic_image).endswith("gif"):
            #         titlepic_image =detail.xpath('''a/img/@data-original''').extract_first()
            #         titlepic_image = 'https:' + titlepic_image
            #     else:
            #
            #         titlepic_image = 'https:'+titlepic_image
            # if not titlepic_image:
            #     titlepic_images = detail.xpath('''div/a/img''').extract_first()
            #     titlepic_image = re.search('''<img.*?onerror="javascript:this\.src=.*?\.src=(.*?\.([Jj][pP][gG]|[Pp][Nn][gG])).*?>''',titlepic_images).group(1)
            if not url:  # guard before urljoin: urljoin(base, None) raises TypeError
                continue
            url = urljoin(response.url, url)
            if BL:
                if self.bf.isContains(url):  # skip URLs we have already crawled
                    print('url exists!', url)
                else:
                    self.bf.insert(url)
                    print("Requesting detail page:", url)
                    yield scrapy.Request(
                        url,
                        callback=self.parse_detail,
                        meta={"titlepic_image": titlepic_image})
            else:
                yield scrapy.Request(
                    url,
                    callback=self.parse_detail,
                    meta={"titlepic_image": titlepic_image})

        # pagination (disabled); sample next-page URL: http://china.huanqiu.com/article/2.html
        # next_node = response.xpath('''//div[@id="pages"]/a[contains(text(),"下一页")]/@href''').extract_first()
        # if next_node:
        #     next_page = urljoin(response.url, next_node)
        #     try:
        #         page = re.search(r".*/(\d+)\.html", next_page).group(1)
        #     except Exception as e:
        #         page = None
        #     if page and int(page) <= 2 and self.num <= 200:
        #         print("Requesting next page:", next_page)
        #         self.num += 1
        #         yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):

        # title
        title = response.xpath('//h1/text()').extract_first(default="")

        # keywords (keyman)
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content|//meta[@name="Keywords"]/@content'''
        ).extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)
        else:
            keyman = ""

        if title:

            title = title_slice(title)
            # summary
            try:
                summary = response.xpath(
                    '//meta[@name="description"]/@content|//meta[@name="Description"]/@content'
                ).extract_first(default="")
            except Exception as e:
                summary = ""
            summary = summay_slice(summary)

            titlepic_image = response.meta.get("titlepic_image", "")
            index_node = response.xpath(
                '''//span[@class="la_t_a"]/text()''').extract_first()
            # sample byline: '2016年04月13日 09:42 来源:深圳中原地产网 作者: 中原地产'
            try:
                time_node = re.search(
                    r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}).*?",
                    index_node, re.S).group(1)
                time_node = time_node.strip()
                # normalise Chinese date separators, then pad to full precision
                time_node = time_node.replace("年", "-").replace("月", "-").replace("日", "")
                time_node = time_node + ":00"
                news_time = datetime.datetime.strptime(
                    str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
                news_time = int(time.mktime(news_time.timetuple()))
            except Exception as e:
                print(e, "time")
                news_time = None

            # writer: the byline rarely names an author, so use the default
            writer = writer_defined
            source = response.xpath(
                '''string(//span[@class="la_t_b"])''').extract_first(
                    default=news_source_defined)
            source = source.strip()
            news_source = news_source_slice(source)

            # article body (content)
            content = response.xpath('//div[@class="la_con"]').extract_first()
            # drop agency tags and the remains of &nbsp; / &amp; entities
            content = content.replace("【环球时报综合报道】", "")
            content = content.replace("【环球网体育频道】", "")
            content = content.replace("&nbsp;", "")
            content = content.replace("&nbsp&nbsp&nbsp&nbsp", "")
            content = content.replace("&amp;", "")
            content = content.replace("nbsp", "")
            content = contentfilter(content)
            self.items["news_keyman"] = keyman
            self.items["title"] = title
            self.items["content"] = content
            self.items['content_summary'] = summary
            self.items['click_num'] = click_num
            self.items['news_time'] = news_time
            self.items['news_source'] = news_source
            self.items['writer'] = writer
            #
            #
            self.items["class_id"] = self.class_id
            self.items["user_id"] = user_id
            self.items["istop"] = istop
            self.items["ismember"] = ismember
            self.items["userfen"] = userfen
            self.items["isgood"] = isgood
            self.items["user_name"] = "admin"
            self.items["group_id"] = group_id
            self.items["plnum"] = plnum
            self.items["first_title"] = first_title
            self.items["is_qf"] = is_qf
            self.items["totaldown"] = totaldown
            self.items["have_html"] = have_html
            self.items["last_dotime"] = int(time.time())
            self.items["diggtop"] = diggtop
            self.items["stb"] = stb
            self.items["ttid"] = ttid
            self.items["ispic"] = ispic
            self.items["isurl"] = isurl
            self.items["fstb"] = fstb
            self.items["restb"] = restb
            self.items["news_tem_pid"] = news_tem_pid
            self.items["dokey"] = dokey
            self.items["closepl"] = closepl
            self.items["haveaddfen"] = haveaddfen
            self.items["infotags"] = keyman
            self.items["checked"] = checked
            self.items["keyid"] = keyid
            self.items["news_path"] = news_path
            self.items["titlepic"] = titlepic_image
            self.items["ftitle"] = ftitle
            #
            #
            self.items['filename'] = filename
            self.items['titlefont'] = titlefont
            self.items['title_url_z'] = title_url_z
            self.items['originalurl'] = response.url
            #
            yield self.items
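
Note: every parse_detail repeats the same strptime-to-epoch conversion. A small helper would remove the duplication; to_epoch is a hypothetical name, and it assumes the same naive local-time semantics and "%Y-%m-%d %H:%M:%S" format as the code above:

import datetime
import time

def to_epoch(time_text, fmt="%Y-%m-%d %H:%M:%S"):
    """Convert a scraped timestamp string to a Unix epoch int,
    or None when the string is missing or malformed."""
    try:
        parsed = datetime.datetime.strptime(time_text.strip(), fmt)
        return int(time.mktime(parsed.timetuple()))
    except (AttributeError, ValueError):
        return None

# usage inside parse_detail:
#     news_time = to_epoch(time_node.replace("/", "-"))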
Example #4

class XueqianSpider(scrapy.Spider):
    name = "news_eastday_com_gd2008_finance_66"
    # allowed_domains = ["news.eastday.com"]
    start_urls = ["http://news.eastday.com/eastday/13news/auto/news/finance/index_K47.html"]
    custom_settings = {"DOWNLOAD_DELAY": 0.2}
    class_id = 66
    num = 1
    items = EduInformationItem()
    flags = True
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):
        node_obj = response.xpath('''//div[@id="left"]/ul/li|//div[@class="leftsection"]/ul/li''')
        if not node_obj:
            print("error_spider",self.name)
        for detail in node_obj:
            url = detail.xpath('a/@href').extract_first()
            url = urljoin(response.url, url)
            if url == None or url =="":
                pass
            else:
                if BL:
                    if self.bf.isContains(url):  # 判断字符串是否存在
                        print('url exists!')
                    else:
                        self.bf.insert(url)
                        print("请求详情页:",url)
                        yield scrapy.Request(url,callback=self.parse_detail)
                else:
                    yield scrapy.Request(url, callback=self.parse_detail)
        # pagination (disabled)
        # next_node = response.xpath('''//div[@class="plist"]/div/a[contains(text(),"下一页")]/@href''').extract_first()
        # if next_node is not None:
        #     next_page = urljoin(response.url, next_node)
        #     print("Requesting next page:", next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):

        # title
        title = response.xpath('//h1/text()').extract_first(default="")
        title = title.strip()
        title = title_slice(title)
        # keywords (keyman)
        keyman = response.xpath('''//meta[@name="keywords"]/@content''').extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)
        else:
            keyman = ""

        if title:
            # summary
            try:
                summary = response.xpath('//meta[@name="description"]/@content').extract_first(default="").strip()
                summary = summary.replace("东方网-东方财经-", "")
            except Exception as e:
                summary = ""
            summary = summay_slice(summary)
            index_node = response.xpath('string(//div[@class="time grey12a fc lh22"])').extract_first()

            try:
                time_node = re.search(r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}).*?", index_node, re.S).group(1)
                time_node = time_node.strip()
                time_node = time_node.replace("/", "-")
                news_time = datetime.datetime.strptime(str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
                news_time = int(time.mktime(news_time.timetuple()))
            except Exception as e:
                print(e, "time")
                news_time = None

            # sample byline: '来源:新华社 作者:胡浩 林晖 朱基钗 史竞男 选稿:刘晓晶 '
            # writer
            try:
                writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node, re.S).group(1)
                writer = writer.strip()
            except Exception as e:
                print(e, "writer")
                writer = writer_defined
            writer = writer_slice(writer)
            # news source: try progressively looser patterns on the byline
            try:
                source = re.search(r".*?来源:(.*?)作者:.*?", index_node, re.S).group(1)
                source = source.strip()
            except Exception as e:
                try:
                    source = re.search(r".*?来源:(.*?)选稿:.*?", index_node, re.S).group(1)
                    source = source.strip()
                except Exception as e:
                    try:
                        source = re.search(r".*?来源:(.*)", index_node, re.S).group(1)
                        source = source.strip()
                    except Exception as e:
                        print(e, "source")
                        source = news_source_defined
            news_source = news_source_slice(source)
            # article body (content)
            content = response.xpath('//div[@id="zw"]').extract_first()
            # strip the remains of &nbsp; / &amp; entities
            content = content.replace("&nbsp", "")
            content = content.replace("&amp;", "")
            content = content.replace("nbsp", "")
            content = contentfilter(content)
            self.items["news_keyman"] = keyman
            self.items["title"] = title
            self.items["content"] = content
            self.items['content_summary'] = summary
            self.items['click_num'] = click_num
            self.items['news_time'] = news_time
            self.items['news_source'] = news_source
            self.items['writer'] = writer
            #
            #
            self.items["class_id"] = self.class_id
            self.items["user_id"] = user_id
            self.items["istop"] = istop
            self.items["ismember"] = ismember
            self.items["userfen"] = userfen
            self.items["isgood"] = isgood
            self.items["user_name"] = "admin"
            self.items["group_id"] = group_id
            self.items["plnum"] = plnum
            self.items["first_title"] = first_title
            self.items["is_qf"] = is_qf
            self.items["totaldown"] = totaldown
            self.items["have_html"] = have_html
            self.items["last_dotime"] = int(time.time())
            self.items["diggtop"] = diggtop
            self.items["stb"] = stb
            self.items["ttid"] = ttid
            self.items["ispic"] = ispic
            self.items["isurl"] = isurl
            self.items["fstb"] = fstb
            self.items["restb"] = restb
            self.items["news_tem_pid"] = news_tem_pid
            self.items["dokey"] = dokey
            self.items["closepl"] = closepl
            self.items["haveaddfen"] = haveaddfen
            self.items["infotags"] = keyman
            self.items["checked"] = checked
            self.items["keyid"] = keyid
            self.items["news_path"] = news_path
            self.items["titlepic"] = titlepic
            self.items["ftitle"] = ftitle
            #
            #
            self.items['filename'] = filename
            self.items['titlefont'] = titlefont
            self.items['title_url_z'] = title_url_z
            self.items['originalurl'] = response.url
            #
            yield self.items
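
Note: all four spiders mutate one class-level EduInformationItem (items = EduInformationItem() as a class attribute). Scrapy handles responses concurrently, so fields from one page can leak into another page's item before it is exported. A safer per-response sketch, assuming the same item class and fields:

    def parse_detail(self, response):
        item = EduInformationItem()  # fresh item per response; nothing shared
        item["title"] = response.xpath('//h1/text()').extract_first(default="").strip()
        item["originalurl"] = response.url
        # ... fill the remaining fields exactly as in the examples above ...
        yield item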