示例#1
0
    def parse_item(self, response):
        """Build an Item from a www.morningwhistle.com response.

        Returns the populated item. Raises DropItem when the update time
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"晨哨网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.morningwhistle.com"
        item["link"] = response.url
        # Content is every text node of the article list, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='con-w p-t-xxl']/div[@class='list-cont con-w m-t-md']/div[@class='list-l']/div[@class='cont-t']/div[@class='t-list']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        try:
            update_time = sel.xpath(
                "/html/body/div[@class='con-w p-t-xxl']/div[@class='list-cont con-w m-t-md']/div[@class='list-l']/div[@class='cont-t']/p[@class='msg']/span[3]//text()"
            )[0].extract().encode("utf8")
            item["update_time"] = datetime.strptime(update_time, "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            # Any extraction/parsing failure discards the item.
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#2
0
    def parse_item(self, response):
        """Build an Item from a mofcom.gov.cn response.

        Returns the populated item. Raises DropItem when the ``var tm``
        timestamp cannot be found in the response body or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"商务部日常新闻"
        item["coll"] = getItemColl(self.name)
        item["host"] = "mofcom.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@id='wrap']/div[@class='MainList']/div[@class='article_new mt10']/div[@id='zoom']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "/html/body/div[@id='wrap']/div[@class='MainList']/div[@class='article_new mt10']/h4[@id='artitle']//text()")
        set_help(item, "source", "/html/body/div[@id='wrap']/div[@class='MainList']/div[@class='article_new mt10']/div[@id='arsource']/a//text()")

        try:
            # The timestamp is read from a `var tm = "..."` assignment in the
            # raw body rather than from the parsed DOM.
            stamp = re.search('var tm = "(.*?)"', response.body).groups()[0]
            item["update_time"] = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#3
0
    def parse_item(self, response):
        """Build an Item from a chinanews.com response and return it.

        No update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()
        # init item
        item["name"] = u"中新网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "chinanews.com"
        item["link"] = response.url

        # Content is every text node under #cont_1_1_2, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf-8")
            for n in sel.xpath("//div[@id='cont_1_1_2']//text()")
        ])
        item["summary"] = ""

        item["source"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#4
0
    def parse_item(self, response):
        """Build an Item from a finance.sina.com.cn response and return it.

        update_time is best effort: when it cannot be extracted or parsed the
        item is still returned, just without that key.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"新浪财经-国内财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.sina.com.cn"
        item["link"] = response.url
        # Content keeps the serialized <p> elements (markup included).
        item["content"] = "<br />".join([
            n.extract().encode("utf8")
            for n in sel.xpath("//*[@id='artibody']/p")
        ])
        try:
            update_time = sel.xpath(
                "//*[@id='wrapOuter']//*[@class='time-source']/text()"
            )[0].extract().encode("utf8").strip()
            # The byte escapes are the UTF-8 encodings of the CJK
            # year/month/day markers and a no-break space.
            item["update_time"] = datetime.strptime(
                update_time,
                "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5\xc2\xa0%H:%M")
        except Exception:
            # Deliberately ignored: update_time is optional here.
            pass

        item["summary"] = ""
        item['author'] = ""
        set_help(item, "title", "//*[@id='artibodyTitle']/text()")
        set_help(item, "source", "//*[@data-sudaclick='media_name']/text()")

        return item
示例#5
0
    def parse_item(self, response):
        """Build an Item from a mof.gov.cn response and return it.

        The title is taken from td#Zoom, falling back to td.font_biao1 when
        the first candidate is shorter than two characters after stripping.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"财政部"
        item["coll"] = getItemColl(self.name)
        item["host"] = "mof.gov.cn"
        item["link"] = response.url
        # Content is every text node found under any <td>.
        item["content"] = "<br />".join(
            [n.extract().encode("utf8") for n in sel.xpath("//td//text()")])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//td[@id='Zoom']//text()")
        if len(item["title"].strip()) < 2:
            # First candidate was empty or near-empty; try the alternative cell.
            set_help(item, "title", "//td[@class='font_biao1']//text()")

        item["source"] = ""

        return item
示例#6
0
    def parse_item(self, response):
        """Build an Item from a ggjd.cnstock.com response.

        Returns the populated item. Raises DropItem (carrying the original
        error text) when the update time cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中国证券网-上市公司公告快讯"
        item["coll"] = getItemColl(self.name)
        item["host"] = "ggjd.cnstock.com"
        item["link"] = response.url
        # Content keeps every serialized <p> element of the page.
        item["content"] = "<br />".join(
            [n.extract().encode("utf8") for n in sel.xpath("//p")])
        item["summary"] = ""
        set_help(item, "author",
                 "//*[@id='pager-content']/div[1]/span[3]/text()")
        set_help(item, "title", "//*[@id='pager-content']/h1/text()")
        set_help(item, "source",
                 "//*[@id='pager-content']/div[1]/span[2]/a/text()")
        try:
            # Raw string so the regex escapes are not treated as string escapes.
            pattern = re.compile(
                r"(\d{4})-(\d{2})-(\d{2}).*(\d{2}):(\d{2}):\d{2}")
            update_time = sel.xpath(
                '//*[@id="pager-content"]/div[1]/span[1]/text()')[0].extract()
            match = pattern.search(update_time)
            # The whole matched span is handed to strptime.
            item["update_time"] = datetime.strptime(match.group(),
                                                    "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem(str(e))

        return item
示例#7
0
    def parse_item(self, response):
        """Build an Item from a miit.com response.

        Returns the populated item. Raises DropItem when the update date
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"工信部"
        item["coll"] = getItemColl(self.name)
        item["host"] = "miit.com"
        item["link"] = response.url
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='w980 center cmain']/div[@id='con_con']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "/html/body/div[@class='w980 center cmain']/div[@class='ctitle']/h1[@id='con_title']//text()")
        set_help(item, "source", "//div[@id='artInfo']/a/text()")

        try:
            # The first 15 bytes of the encoded text are skipped before parsing.
            # NOTE(review): presumably a five-character CJK label prefix
            # (5 x 3 UTF-8 bytes) - confirm against a live page.
            update_time = sel.xpath(
                "/html/body/div[@class='w980 center cmain']/div[@class='cinfo center']/span[@id='con_time']//text()"
            )[0].extract().encode("utf8")[15:]
            item["update_time"] = datetime.strptime(update_time, "%Y-%m-%d")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#8
0
    def parse_item(self, response):
        """Build an Item from a www.gov.cn response and return it.

        No update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中国政府网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.gov.cn"
        item["link"] = response.url
        # Content is every text node of table#printContent, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body[@class='body_bg']/div[@class='wrap']/div[@class='frame-pane']/div[@class='article-colum']/div[@id='UCAP-CONTENT']/table[@id='printContent']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(
            item, "title",
            "/html/body[@class='body_bg']/div[@class='wrap']/div[@class='frame-pane']/div[@class='article-colum']/div[@class='pages-title']//text()"
        )
        set_help(
            item, "source",
            "/html/body[@class='body_bg']/div[@class='wrap']/div[@class='frame-pane']/div[@class='article-colum']/div[@class='pages-date']/span[@class='font'][2]//text()"
        )

        return item
示例#9
0
    def parse_item(self, response):
        """Build an Item from a kuaixun.stcn.com response.

        Returns the populated item. Raises DropItem when the update time
        cannot be extracted or parsed. A missing source falls back to "".
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # Both source and update_time are read from this info line.
        info_xpath = "//div[@class='intal_tit']/div[@class='info']/text()"

        # init item
        item["name"] = u"证券时报"
        item["coll"] = getItemColl(self.name)
        item["host"] = "kuaixun.stcn.com"
        item["link"] = response.url

        item["content"] = "<br />".join([n.extract().encode("utf-8") for n in sel.xpath("//p")])
        item["summary"] = ""

        try:
            # The source is whatever follows the first full-width colon
            # ("\xef\xbc\x9a" is U+FF1A encoded as UTF-8).
            info_text = sel.xpath(info_xpath)[0].extract().encode("utf-8")
            item["source"] = info_text.split("\xef\xbc\x9a")[1]
        except IndexError:
            # Previously an unguarded IndexError escaped the callback when
            # the info line or the colon was missing.
            item["source"] = ""
        item["author"] = ""
        set_help(item, "title", "//div[@class='intal_tit']/h2/text()")
        try:
            pattern = re.compile(r".*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})")
            update_time = sel.xpath(info_xpath)[0].extract()
            match = pattern.search(update_time)
            item["update_time"] = datetime.strptime(
                match.group(1),
                "%Y-%m-%d %H:%M"
            )
        except Exception as e:
            # Include the underlying error so dropped items can be diagnosed.
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#10
0
    def parse_item(self, response):
        """Build an Item from a www.ndrc.gov.cn response and return it.

        No source or update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"发改委"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.ndrc.gov.cn"
        item["link"] = response.url
        # Content is the text of every <p> inside the TRS_Editor container.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body[@class='body_bg1']/div[@id='wrapper']/div[@id='out-content']/div[@class='index_wrapper1 screen_width clearfix']/div[@class='Middle4']/div[@class='Middle4_body']/div[@id='zoom']/div[@class='TRS_Editor']/p//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        # The title is read from the <font> inside the first paragraph.
        set_help(
            item, "title",
            "/html/body[@class='body_bg1']/div[@id='wrapper']/div[@id='out-content']/div[@class='index_wrapper1 screen_width clearfix']/div[@class='Middle4']/div[@class='Middle4_body']/div[@id='zoom']/div[@class='TRS_Editor']/p[1]/font//text()"
        )

        return item
示例#11
0
    def parse_item(self, response):
        """Build an Item from a table-layout www.gov.cn response.

        Returns the populated item. Raises DropItem when the update date
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中国政府网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body[@class='body_bg']/div[@class='wrap']/table[2]/tbody/tr/td[1]/table/tbody/tr/td/table[1]/tbody/tr[1]/td[@class='b12c']/p[2]//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "/html/body[@class='body_bg']/div[@class='wrap']/table[1]/tbody/tr/td/table[@class='bd1'][1]/tbody/tr[3]/td[2]//text()")
        set_help(item, "source", "/html/body[@class='body_bg']/div[@class='wrap']/table[1]/tbody/tr/td/table[@class='bd1'][1]/tbody/tr[2]/td[2]//text()")

        try:
            update_time = sel.xpath(
                "/html/body[@class='body_bg']/div[@class='wrap']/table[1]/tbody/tr/td/table[@class='bd1'][1]/tbody/tr[4]/td[4]//text()"
            )[0].extract().encode("utf8")
            # The byte escapes are the UTF-8 encodings of the CJK
            # year/month/day markers.
            item["update_time"] = datetime.strptime(update_time, "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#12
0
    def parse_item(self, response):
        """Build an Item from a www.infoq.com response and return it.

        update_time is currently a fixed placeholder string, not a value
        parsed from the page. Raises DropItem if assigning it fails.
        """
        # NOTE(review): looks like leftover debug output - confirm before removing.
        print("parse_item***********************")

        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = "infoq"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.infoq.com"
        item["link"] = response.url
        # Content is the text of everything preceding the first div.clear
        # inside div.text_info. (A dead placeholder assignment that was
        # immediately overwritten has been removed.)
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "//div[@class='text_info']/div[@class='clear'][1]/preceding-sibling::*//text()"
            )
        ])
        item["summary"] = ""
        set_help(item, "author", "//a[@class='editorlink f_taxonomyEditor']/text()")
        set_help(item, "title", "//head/title/text()")
        item["source"] = ""
        try:
            # TODO: parse the real timestamp from the page; the disabled
            # attempt is kept below for reference.
            # pattern=re.compile("(\d{4})-(\d{2})-(\d{2}).*(\d{2}):(\d{2}):\d{2}")
            # update_time=sel.xpath('//div[@class="Byline"]/div[1]/span[1]/text()')[0].extract()
            # match=pattern.search(update_time)
            #item["update_time"] = datetime.strptime(match.group(), "%Y-%m-%d %H:%M:%S")
            item["update_time"] = "2016-01-01 00:00:00"
        except Exception as e:
            raise DropItem(str(e))
        print(item)
        return item
示例#13
0
    def parse_item(self, response):
        """Build an Item from a news.smm.cn response and return it.

        No source or update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"上海有色网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "news.smm.cn"
        item["link"] = response.url
        # Content is every text node under #content/.bd, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='layout clear']/div[@class='chief']/div[@id='content']/div[@class='bd']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#14
0
    def parse_item(self, response):
        """Build an Item from a finance.chinanews.com response.

        Returns the populated item. Raises DropItem when the update time
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中新网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.chinanews.com"
        item["link"] = response.url
        # Content keeps the serialized #cont_1_1_2 element (markup included).
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//div[@id='cont_1_1_2']")])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//*[@id='cont_1_1_2']/h1/text()")
        # Anchored with // : the previous "/div[...]" form could only match a
        # <div> document root, so source was always "".
        set_help(item, "source", "//div[@id='cont_1_1_2']/div[@class='left-time']/div[@class='left-t']/a[1]//text()")

        try:
            pattern = re.compile(r".*(\d{4}).*(\d{2}).*(\d{2}).*(\d{2}):(\d{2}).*")
            update_time = sel.xpath('//*[@id="cont_1_1_2"]/div[4]/div[2]/text()')[0].extract().encode("utf8")
            match = pattern.search(update_time)
            # Bytes 1..23 of the match are parsed; the format's byte escapes
            # are the UTF-8 encodings of the CJK year/month/day markers.
            # NOTE(review): the fixed slice assumes one leading byte before
            # the date - confirm against a live page.
            item["update_time"] = datetime.strptime(match.group()[1:24], "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#15
0
    def parse_item(self, response):
        """Build an Item from a ggjd.cnstock.com response.

        Returns the populated item. Raises DropItem (carrying the original
        error text) when the update time cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中国证券网-上市公司公告快讯"
        item["coll"] = getItemColl(self.name)
        item["host"] = "ggjd.cnstock.com"
        item["link"] = response.url
        # Content keeps every serialized <p> element of the page.
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//p")])
        item["summary"] = ""
        set_help(item, "author", "//*[@id='pager-content']/div[1]/span[3]/text()")
        set_help(item, "title", "//*[@id='pager-content']/h1/text()")
        set_help(item, "source", "//*[@id='pager-content']/div[1]/span[2]/a/text()")
        try:
            # Raw string so the regex escapes are not treated as string escapes.
            pattern = re.compile(r"(\d{4})-(\d{2})-(\d{2}).*(\d{2}):(\d{2}):\d{2}")
            update_time = sel.xpath('//*[@id="pager-content"]/div[1]/span[1]/text()')[0].extract()
            match = pattern.search(update_time)
            # The whole matched span is handed to strptime.
            item["update_time"] = datetime.strptime(match.group(), "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem(str(e))

        return item
示例#16
0
    def parse_item(self, response):
        """Build an Item from a pbc.gov.cn response and return it.

        No source or update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"央行发布"
        item["coll"] = getItemColl(self.name)
        item["host"] = "pbc.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h2//text()")

        return item
示例#17
0
    def parse_item(self, response):
        """Build an Item from a money.163.com response and return it.

        No source or update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"网易财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "money.163.com"
        item["link"] = response.url
        # Content is every text node under #endText, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@id='js-epContent']/div[@class='ep-content-bg clearfix']/div[@id='epContentLeft']/div[@class='ep-main-bg']/div[@id='endText']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""

        set_help(
            item, "title",
            "/html/body/div[@id='js-epContent']/div[@class='ep-content-bg clearfix']/div[@id='epContentLeft']/div[@class='ep-main-bg']/h1[@id='h1title']//text()"
        )

        return item
    def parse_item(self, response):
        """Build an Item from a news.xinhuanet.com/gc response and return it.

        update_time is best effort: when it cannot be extracted or parsed the
        item is still returned, just without that key.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"新华网-高层"
        item["coll"] = getItemColl(self.name)
        item["host"] = "news.xinhuanet.com/gc"
        item["link"] = response.url
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//*[@id='content']//p")])
        if not item["content"]:
            # Fallback when #content yields nothing: text under div.article.
            item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//div[@class='article']//text()")])

        try:
            update_time = sel.xpath("//*[@id='article']//*[@class='time']//text()")[0].extract().encode("utf8").strip()
            # The byte escapes are the UTF-8 encodings of the CJK
            # year/month/day markers.
            item["update_time"] = datetime.strptime(update_time, "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M:%S")
        except Exception:
            # Deliberately ignored: update_time is optional here.
            pass

        # These lines were tab-indented, which breaks under Python 3 and
        # `python -tt`; they now use spaces like the rest of the method.
        item["summary"] = ""
        item['author'] = ""
        set_help(item, "title", "//*[@id='title']/text()")
        set_help(item, "source", "//*[@id='source']/text()")

        return item
示例#19
0
    def parse_item(self, response):
        """Build an Item from a finance.sina.com.cn response and return it.

        update_time is best effort: when it cannot be extracted or parsed the
        item is still returned, just without that key.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"新浪财经-宏观经济"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.sina.com.cn"
        item["link"] = response.url
        # Content keeps the serialized <p> elements (markup included).
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//*[@id='artibody']/p")])
        try:
            update_time = sel.xpath("//*[@id='wrapOuter']//*[@class='time-source']/text()")[0].extract().encode("utf8").strip()
            # The byte escapes are the UTF-8 encodings of the CJK
            # year/month/day markers and a no-break space.
            item["update_time"] = datetime.strptime(update_time, "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5\xc2\xa0%H:%M")
        except Exception:
            # Deliberately ignored: update_time is optional here.
            pass

        # This line was tab-indented, which breaks under Python 3 and
        # `python -tt`; it now uses spaces like the rest of the method.
        item["summary"] = ""
        item['author'] = ""
        set_help(item, "title", "//*[@id='artibodyTitle']/text()")
        set_help(item, "source", "//*[@data-sudaclick='media_name']/text()")

        return item
示例#20
0
    def parse_item(self, response):
        """Build an Item from a mof.gov.cn response and return it.

        The title is taken from td#Zoom, falling back to td.font_biao1 when
        the first candidate is shorter than two characters after stripping.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"财政部"
        item["coll"] = getItemColl(self.name)
        item["host"] = "mof.gov.cn"
        item["link"] = response.url
        # Content is every text node found under any <td>.
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//td//text()")])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//td[@id='Zoom']//text()")
        if len(item["title"].strip()) < 2:
            # First candidate was empty or near-empty; try the alternative cell.
            set_help(item, "title", "//td[@class='font_biao1']//text()")

        item["source"] = ""

        return item
示例#21
0
    def parse_item(self, response):
        """Build an Item from a www.pbc.gov.cn response.

        Returns the populated item. Raises DropItem when the update time
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"央行发布"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.pbc.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td/table[1]/tbody/tr/td/table/tbody/tr/td/h2//text()")
        set_help(item, "source", "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[@class='hui12'][3]//text()".replace("[@class='hui12'][3]//text()", "[@class='hui12'][2]/span[@id='laiyuan']//text()"))

        try:
            # The timestamp is the second text node ([1]) of the third
            # td.hui12 cell; surrounding whitespace is stripped before parsing.
            update_time = sel.xpath(
                "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td/table[2]/tbody/tr/td[@class='hui12'][3]//text()"
            )[1].extract().encode("utf8")
            item["update_time"] = datetime.strptime(update_time.strip(), "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
    def parse_item(self, response):
        """Build an Item from a finance.sina.com.cn response.

        Returns the populated item, or None (after printing the error) when
        the update time cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"新浪财经-产经滚动"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.sina.com.cn"
        item["link"] = response.url
        # Content keeps every serialized <p> element of the page.
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//p")])
        item["summary"] = ""
        item["author"] = ""
        # Both expressions are now anchored with // . The previous forms
        # ("h1[...]" relative to the root and "/div[...]" at the document
        # root) could never match, so title and source were always "".
        set_help(item, "title", "//h1[@id='artibodyTitle']//text()")
        set_help(item, "source", "//div[@class='page-info']/span[@class='time-source']/span/a//text()")
        try:
            update_time = sel.xpath("//span[@class='time-source']//text()")[0].extract().encode("utf8")
            # Only the first 25 bytes are parsed, and the format expects a
            # leading newline; the byte escapes are the UTF-8 encodings of the
            # CJK year/month/day markers and a no-break space.
            item["update_time"] = datetime.strptime(update_time[:25], "\n%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5\xc2\xa0%H:%M")
        except Exception as e:
            print(str(e))
            return None

        return item
示例#23
0
    def parse_item(self, response):
        """Build an Item from a techweb.com.cn response and return it.

        No source or update_time is set by this callback.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"techweb"
        item["coll"] = getItemColl(self.name)
        item["host"] = "techweb.com.cn"
        item["link"] = response.url
        # Content is every text node under div.article, joined with <br />.
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "/html/body/div[@class='main']/div[@class='left2']/div[@class='article']//text()"
            )
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#24
0
    def parse_item(self, response):
        """Build an Item from a news.xinhuanet.com response and return it.

        update_time is best effort: when it cannot be extracted or parsed the
        item is still returned, just without that key.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"新华网 -时政"
        item["coll"] = getItemColl(self.name)
        item["host"] = "http://news.xinhuanet.com/"
        item["link"] = response.url

        item["content"] = "<br />".join([
            n.extract().encode("utf8")
            for n in sel.xpath("//*[@id='content']//p")
        ])
        if not item["content"]:
            # Fallback when #content yields nothing: the serialized
            # div.article element(s).
            item["content"] = "<br />".join([
                n.extract().encode("utf8")
                for n in sel.xpath("//div[@class='article']")
            ])

        try:
            update_time = sel.xpath(
                "//*[@id='article']//*[@class='time']//text()")[0].extract(
                ).encode("utf8").strip()
            # The byte escapes are the UTF-8 encodings of the CJK
            # year/month/day markers.
            item["update_time"] = datetime.strptime(
                update_time,
                "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M:%S")
        except Exception:
            # Deliberately ignored: update_time is optional here.
            pass

        item["summary"] = ""
        item['author'] = ""
        set_help(item, "title", "//*[@id='title']/text()")
        set_help(item, "source", "//*[@id='source']/text()")

        return item
示例#25
0
    def parse_item(self, response):
        """Build an Item from a finance.chinanews.com response.

        Returns the populated item. Raises DropItem when the update time
        cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"中新网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.chinanews.com"
        item["link"] = response.url
        # Content keeps the serialized #cont_1_1_2 element (markup included).
        item["content"] = "<br />".join([
            n.extract().encode("utf8")
            for n in sel.xpath("//div[@id='cont_1_1_2']")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//*[@id='cont_1_1_2']/h1/text()")
        # Anchored with // : the previous "/div[...]" form could only match a
        # <div> document root, so source was always "".
        set_help(
            item, "source",
            "//div[@id='cont_1_1_2']/div[@class='left-time']/div[@class='left-t']/a[1]//text()"
        )

        try:
            pattern = re.compile(
                r".*(\d{4}).*(\d{2}).*(\d{2}).*(\d{2}):(\d{2}).*")
            update_time = sel.xpath(
                '//*[@id="cont_1_1_2"]/div[4]/div[2]/text()')[0].extract(
                ).encode("utf8")
            match = pattern.search(update_time)
            # Bytes 1..23 of the match are parsed; the format's byte escapes
            # are the UTF-8 encodings of the CJK year/month/day markers.
            # NOTE(review): the fixed slice assumes one leading byte before
            # the date - confirm against a live page.
            item["update_time"] = datetime.strptime(
                match.group()[1:24],
                "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#26
0
    def parse_item(self, response):
        """Build an Item from a www.cnr.cn response.

        Returns the populated item, or None (after printing the error) when
        the update time cannot be extracted or parsed.
        """
        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"央广网-经济之声"
        item["coll"] = getItemColl(self.name)  # mongodb collection name
        item["host"] = "www.cnr.cn"
        item["link"] = response.url
        # Content keeps the serialized div.TRS_Editor element(s).
        item["content"] = "<br />".join([
            n.extract().encode("utf8")
            for n in sel.xpath("//div[@class='TRS_Editor']")
        ])
        # summary is optional
        item["summary"] = ""
        item["author"] = ""
        set_help(
            item, "title",
            "/html/body/div[@id='Area']/div[@id='cntl']/div[@class='wh645 left']/p[@class='f22 lh30 yahei']//text()"
        )
        # source is optional
        set_help(
            item, "source",
            "/html/body/div[@id='Area']/div[@id='cntl']/div[@class='wh645 left']/p[@class='lh30 left f14 yahei']/a//text()"
        )

        try:
            # Only the first 19 bytes ("YYYY-MM-DD HH:MM:SS") are parsed.
            item["update_time"] = datetime.strptime(
                sel.xpath(
                    "/html/body/div[@id='Area']/div[@id='cntl']/div[@class='wh645 left']/p[@class='lh30 left f14 yahei']//text()"
                )[0].extract().encode("utf8")[:19], "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            print("get update_time failed, error:%s" % e)
            return None

        return item
示例#27
0
    def parse_item(self, response):
        """Build an Item from a www.infoq.com response and return it.

        update_time is currently a fixed placeholder string, not a value
        parsed from the page. Raises DropItem if assigning it fails.
        """
        # NOTE(review): looks like leftover debug output - confirm before removing.
        print("parse_item***********************")

        def set_help(item, key, xpath):
            # Store the first XPath match (UTF-8 encoded), or "" on failure.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = "infoq"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.infoq.com"
        item["link"] = response.url
        # Content is the text of everything preceding the first div.clear
        # inside div.text_info. (A dead placeholder assignment that was
        # immediately overwritten has been removed.)
        item["content"] = "<br />".join([
            n.extract().encode("utf8") for n in sel.xpath(
                "//div[@class='text_info']/div[@class='clear'][1]/preceding-sibling::*//text()"
            )
        ])
        item["summary"] = ""
        set_help(item, "author",
                 "//a[@class='editorlink f_taxonomyEditor']/text()")
        set_help(item, "title", "//head/title/text()")
        item["source"] = ""
        try:
            # TODO: parse the real timestamp from the page; the disabled
            # attempt is kept below for reference.
            # pattern=re.compile("(\d{4})-(\d{2})-(\d{2}).*(\d{2}):(\d{2}):\d{2}")
            # update_time=sel.xpath('//div[@class="Byline"]/div[1]/span[1]/text()')[0].extract()
            # match=pattern.search(update_time)
            #item["update_time"] = datetime.strptime(match.group(), "%Y-%m-%d %H:%M:%S")
            item["update_time"] = "2016-01-01 00:00:00"
        except Exception as e:
            raise DropItem(str(e))
        print(item)
        return item
示例#28
0
    def parse_item(self, response):
        """Build the item for a www.pbc.gov.cn article page.

        Content is the ``<br />``-joined text of the article table cell.
        Title and source are best-effort (``""`` when nothing matches).

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication time cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # Every field lives under the same table cell; build the paths from it.
        cell = ("/html/body/div[@class='mainw950']/div[@id='pre']"
                "/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td")

        # init item
        item["name"] = u"央行发布"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.pbc.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(cell + "//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(
            item, "title",
            cell + "/table[1]/tbody/tr/td/table/tbody/tr/td/h2//text()")
        set_help(
            item, "source",
            cell + "/table[2]/tbody/tr/td[@class='hui12'][2]/span[@id='laiyuan']//text()")

        try:
            # The timestamp is the second text node of the third "hui12" cell.
            stamp = sel.xpath(
                cell + "/table[2]/tbody/tr/td[@class='hui12'][3]//text()"
            )[1].extract().encode("utf8")
            item["update_time"] = datetime.strptime(stamp.strip(),
                                                    "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#29
0
    def parse_item(self, response):
        """Build the item for an aastocks.com article page.

        Content is every text node under a ``<p>`` element, joined with
        ``<br />``.  The title falls back to ``""`` when its XPath matches
        nothing.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"阿思達克财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "aastocks.com"
        item["link"] = response.url
        paragraphs = sel.xpath("//p//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in paragraphs])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//span[@id='lblSTitle']//text()")

        return item
示例#30
0
    def parse_item(self, response):
        """Build the item for a pbc.gov.cn article page.

        Content is the ``<br />``-joined text of the article table cell; the
        title is the first ``<h2>`` text node, or ``""`` when there is none.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"央行发布"
        item["coll"] = getItemColl(self.name)
        item["host"] = "pbc.gov.cn"
        item["link"] = response.url
        body_nodes = sel.xpath(
            "/html/body/div[@class='mainw950']/div[@id='pre']/div[@id='10929']/div[2]/table[2]/tbody/tr/td/table/tbody/tr/td//text()"
        )
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in body_nodes])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h2//text()")

        return item
示例#31
0
    def parse_item(self, response):
        """Build the item for a bn.sina.cn response.

        The raw response body is stored as the content.  ``link`` is not the
        URL but ``"bnsina"`` followed by the MD5 hex digest of the body, and
        the title is the first 20 characters of the UTF-8-decoded body.

        Returns:
            The populated ``Item``.
        """
        # hashlib supersedes the deprecated ``md5`` module; same digest.
        import hashlib

        body = response.body
        item = Item()

        # init item
        item["name"] = u"新浪财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "bn.sina.cn"
        item["link"] = "bnsina%s" % hashlib.md5(body).hexdigest()
        item["content"] = body
        item["summary"] = ""
        item["author"] = ""
        # Slice after decoding so a multi-byte character is never cut in half.
        item["title"] = body.decode("utf8")[:20].encode("utf8")

        return item
示例#32
0
    def parse_item(self, response):
        """Build the item for a bn.sina.cn response.

        The raw response body becomes the content.  ``link`` is a synthetic
        key, ``"bnsina"`` plus the MD5 hex digest of the body, rather than
        the URL.  The title is the first 20 characters of the body after
        decoding it as UTF-8.

        Returns:
            The populated ``Item``.
        """
        # The ``md5`` module is deprecated; hashlib.md5 gives the same digest.
        import hashlib

        raw = response.body
        digest = hashlib.md5(raw).hexdigest()
        item = Item()

        # init item
        item["name"] = u"新浪财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "bn.sina.cn"
        item["link"] = "bnsina%s" % digest
        item["content"] = raw
        item["summary"] = ""
        item["author"] = ""
        # Truncate on characters, not bytes, then re-encode.
        item["title"] = raw.decode("utf8")[:20].encode("utf8")

        return item
示例#33
0
    def parse_item(self, response):
        """Build the item for a news.smm.cn article page.

        Content is the ``<br />``-joined text under ``div#content/div.bd``;
        the title is the first ``<h1>`` text node, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"上海有色网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "news.smm.cn"
        item["link"] = response.url
        body_nodes = sel.xpath(
            "/html/body/div[@class='layout clear']/div[@class='chief']/div[@id='content']/div[@class='bd']//text()"
        )
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in body_nodes])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#34
0
    def parse_item(self, response):
        """Build the item for a www.ndrc.gov.cn article page.

        Content is the ``<br />``-joined text of every ``<p>`` inside the
        ``TRS_Editor`` container; the title is the first text node of the
        ``<font>`` in its first paragraph, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # Both the body and the title hang off the same editor container.
        editor = ("/html/body[@class='body_bg1']/div[@id='wrapper']"
                  "/div[@id='out-content']"
                  "/div[@class='index_wrapper1 screen_width clearfix']"
                  "/div[@class='Middle4']/div[@class='Middle4_body']"
                  "/div[@id='zoom']/div[@class='TRS_Editor']")

        # init item
        item["name"] = u"发改委"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.ndrc.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(editor + "/p//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", editor + "/p[1]/font//text()")

        return item
示例#35
0
    def parse_item(self, response):
        """Build the item for a baijia.baidu.com article page.

        Content is the ``<br />``-joined text under ``div#page``; the title
        is the first ``<h1>`` text node, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"百度百家"
        item["coll"] = getItemColl(self.name)
        item["host"] = "baijia.baidu.com/"
        item["link"] = response.url
        page_nodes = sel.xpath(
            "/html[@class='expanded']/body/div[@id='body']/div[@class='l-main-col']/div[@id='page']//text()"
        )
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in page_nodes])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#36
0
    def parse_item(self, response):
        """Build the item for a www.21jingji.com article page.

        Content is the ``<br />``-joined text under ``div.content/section``;
        the title is the first ``<h1>`` text node, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"21经济"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.21jingji.com"
        item["link"] = response.url
        section_nodes = sel.xpath("//div[@class='content']/section//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in section_nodes])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#37
0
    def parse_item(self, response):
        """Build the item for a www.gov.cn page that uses the table layout.

        Content comes from the second paragraph of the ``b12c`` cell; title,
        source and publication date are read from rows of the first ``bd1``
        table.  Title and source are best-effort (``""`` when missing).

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication date cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # Title, source and date are rows of the same metadata table.
        meta = ("/html/body[@class='body_bg']/div[@class='wrap']"
                "/table[1]/tbody/tr/td/table[@class='bd1'][1]/tbody")

        # init item
        item["name"] = u"中国政府网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.gov.cn"
        item["link"] = response.url
        body_nodes = sel.xpath(
            "/html/body[@class='body_bg']/div[@class='wrap']/table[2]/tbody/tr/td[1]/table/tbody/tr/td/table[1]/tbody/tr[1]/td[@class='b12c']/p[2]//text()"
        )
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in body_nodes])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", meta + "/tr[3]/td[2]//text()")
        set_help(item, "source", meta + "/tr[2]/td[2]//text()")

        try:
            stamp = sel.xpath(
                meta + "/tr[4]/td[4]//text()")[0].extract().encode("utf8")
            # The escapes are the UTF-8 bytes of the CJK year/month/day
            # markers; the value being parsed is a UTF-8 byte string as well.
            item["update_time"] = datetime.strptime(
                stamp, "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#38
0
    def parse_item(self, response):
        """Build the item for a www.chinatimes.cc article page.

        Content is the ``<br />``-joined text under ``div.infoMain``.  Title
        and source are best-effort (``""`` when missing).  The publication
        time is pulled with a regex from the eighth text node of ``div.info``.

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication time cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # Body, source and timestamp all sit under the same content container.
        container = ("/html/body/div[@class='main']/div[@class='mainLeft']"
                     "/div[@class='newList']/div[@class='content']")

        # init item
        item["name"] = u"华夏时报网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.chinatimes.cc"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(container + "/div[@class='infoMain']//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")
        set_help(
            item, "source",
            container + "/div[@class='info']/p[@id='source_baidu']//text()")

        try:
            # Captures a "date ... h:m:s" run anywhere in the text node.
            stamp_re = re.compile(r".*?(\d+?-\d+?-\d+?.*?\d+?:\d+?:\d+).*")
            info_text = sel.xpath(
                container + "/div[@class='info']//text()"
            )[7].extract().encode("utf8")
            found = stamp_re.search(info_text)
            item["update_time"] = datetime.strptime(found.groups()[0],
                                                    "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#39
0
    def parse_item(self, response):
        """Build the item for a www.sasac.gov.cn article page.

        Content is the ``<br />``-joined text under ``div#con_con``.  Title
        (``h1#con_title``) and source (``span#con_ly``) are best-effort and
        default to ``""``.

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication date cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        main = "/html/body/div[@class='main center']"

        # init item
        item["name"] = u"国资委"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.sasac.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(main + "/div[@id='con_con']//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(
            item, "title",
            main + "/div[@class='ttitle']/h1[@id='con_title']//text()")
        set_help(
            item, "source",
            main + "/div[@class='tinfo relative']/span[@id='con_ly']//text()")

        try:
            stamp = sel.xpath(
                main + "/div[@class='tinfo relative']/span[@id='con_time']//text()"
            )[0].extract().encode("utf8")
            # Only a date is parsed here, so the stored time is midnight.
            item["update_time"] = datetime.strptime(stamp.strip(),
                                                    "%Y-%m-%d")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#40
0
    def parse_item(self, response):
        def set_help(item, key, xpath):
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except:
                item[key] = ""

        sel = Selector(response)
        item = Item()

        # init item
        item["name"] = u"证监会要闻"
        item["coll"] = getItemColl(self.name)
        item["host"] = "csrc.gov.cn"
        item["link"] = response.url
        
        item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//p//text()")])

        set_help(item, "title", "//div[@class='title']/text()")
        print item["title"], item["link"]

        return item
示例#41
0
    def parse_item(self, response):
        """Build the item for a www.yicai.com article page.

        Content is every text node under a ``<p>`` element, joined with
        ``<br />``.  Title and source are best-effort (``""`` when their
        XPath matches nothing).

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"一财网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.yicai.com"
        item["link"] = response.url
        paragraphs = sel.xpath("//p//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in paragraphs])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")
        set_help(
            item, "source",
            "/html/body/div[@class='main']/div[@class='mainLeft']/div[@class='newList']/div[@class='content']/div[@class='info']/p[@id='source_baidu']//text()"
        )

        return item
示例#42
0
    def parse_item(self, response):
        """Build the item for a finance.qq.com article page.

        Content is the ``<br />``-joined text under
        ``div#Cnt-Main-Article-QQ``; the title is the first ``<h1>`` text
        node, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"腾讯财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "finance.qq.com"
        item["link"] = response.url
        article_nodes = sel.xpath(
            "/html/body[@id='Wrap']/div[@class='body-Article-QQ']/div[@id='Main-Article-QQ']/div[@id='MainL']/div[@class='main']/div[@id='C-Main-Article-QQ']/div[@class='bd']/div[@id='Cnt-Main-Article-QQ']//text()"
        )
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in article_nodes])
        item["summary"] = ""
        item["author"] = ""

        set_help(item, "title", "//h1//text()")

        return item
示例#43
0
    def parse_item(self, response):
        """Build the item for an aastocks.com article page.

        Content joins every ``<p>`` text node with ``<br />``; the title is
        the first text node of ``span#lblSTitle``, or ``""`` if absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"阿思達克财经"
        item["coll"] = getItemColl(self.name)
        item["host"] = "aastocks.com"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath("//p//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//span[@id='lblSTitle']//text()")

        return item
示例#44
0
    def parse_item(self, response):
        """Build the item for a mofcom.gov.cn news page.

        Content is the ``<br />``-joined text under ``div#zoom``.  Title and
        source are best-effort (``""`` when missing).  The publication time
        is read from the ``var tm = "..."`` assignment in the raw page body.

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the ``tm`` value is missing or cannot be parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        article = ("/html/body/div[@id='wrap']/div[@class='MainList']"
                   "/div[@class='article_new mt10']")

        # init item
        item["name"] = u"商务部日常新闻"
        item["coll"] = getItemColl(self.name)
        item["host"] = "mofcom.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(article + "/div[@id='zoom']//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", article + "/h4[@id='artitle']//text()")
        set_help(item, "source", article + "/div[@id='arsource']/a//text()")

        try:
            # The regex runs over the raw body, not the parsed document.
            found = re.search(r'var tm = "(.*?)"', response.body)
            item["update_time"] = datetime.strptime(found.groups()[0],
                                                    "%Y-%m-%d %H:%M:%S")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#45
0
    def parse_item(self, response):
        """Build the item for a news.xinhuanet.com article page.

        Content is every text node under a ``<p>`` element, joined with
        ``<br />``; the title is the first ``<h1>`` text node, or ``""`` if
        absent.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"新华网财经联播"
        item["coll"] = getItemColl(self.name)
        item["host"] = "news.xinhuanet.com"
        item["link"] = response.url
        paragraphs = sel.xpath("//p//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in paragraphs])

        item["summary"] = ""
        item['author'] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#46
0
    def parse_item(self, response):
        """Build the item for a chinanews.com article page.

        Content is the ``<br />``-joined text under ``div#cont_1_1_2``; the
        title is the first ``<h1>`` text node, or ``""`` if absent.  Source
        and author are always stored empty.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"中新网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "chinanews.com"
        item["link"] = response.url

        body_nodes = sel.xpath("//div[@id='cont_1_1_2']//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf-8") for node in body_nodes])
        item["summary"] = ""

        item["source"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#47
0
    def parse_item(self, response):
        """Build the item for a www.gov.cn page that uses the article-column layout.

        Content is the ``<br />``-joined text under ``table#printContent``.
        Title (``div.pages-title``) and source (second ``span.font`` of
        ``div.pages-date``) are best-effort and default to ``""``.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        column = ("/html/body[@class='body_bg']/div[@class='wrap']"
                  "/div[@class='frame-pane']/div[@class='article-colum']")

        # init item
        item["name"] = u"中国政府网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.gov.cn"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(
                column + "/div[@id='UCAP-CONTENT']/table[@id='printContent']//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", column + "/div[@class='pages-title']//text()")
        set_help(
            item, "source",
            column + "/div[@class='pages-date']/span[@class='font'][2]//text()")

        return item
示例#48
0
    def parse_item(self, response):
        """Build the item for a www.ship.sh page.

        Content is every text node in the document joined with ``<br />``;
        the title is the first ``<h1>`` text node, or ``""`` if absent.
        Author is always stored empty.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"航运界"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.ship.sh"
        item["link"] = response.url
        all_text = sel.xpath("//text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in all_text])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item
示例#49
0
    def parse_item(self, response):
        """Build the item for a kuaixun.stcn.com news flash page.

        Content is the extracted ``<p>`` elements joined with ``<br />``.
        Source and publication time both come from the first text node of
        ``div.intal_tit/div.info``: the source is whatever follows the first
        full-width colon, the time is matched by regex.  Source and title are
        best-effort and default to ``""``.

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication time cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"证券时报"
        item["coll"] = getItemColl(self.name)
        item["host"] = "kuaixun.stcn.com"
        item["link"] = response.url

        item["content"] = "<br />".join(
            [node.extract().encode("utf-8") for node in sel.xpath("//p")])
        item["summary"] = ""

        # Evaluated once; both the source and the timestamp are read from it.
        info_nodes = sel.xpath(
            "//div[@class='intal_tit']/div[@class='info']/text()")

        try:
            # "\xef\xbc\x9a" is the UTF-8 encoding of the full-width colon.
            item["source"] = info_nodes[0].extract().encode("utf-8").split(
                "\xef\xbc\x9a")[1]
        except IndexError:
            # No info line, or no colon in it: leave the source blank
            # instead of aborting the whole callback.
            item["source"] = ""
        item["author"] = ""
        set_help(item, "title", "//div[@class='intal_tit']/h2/text()")
        try:
            stamp_re = re.compile(r".*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})")
            found = stamp_re.search(info_nodes[0].extract())
            item["update_time"] = datetime.strptime(found.group(1),
                                                    "%Y-%m-%d %H:%M")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)

        return item
示例#50
0
    def parse_item(self, response):
        """Build the item for a caixin.com article page.

        Content is the ``<br />``-joined direct text of every ``<p>`` under
        ``div#Main_Content_Val``.  Title and source are best-effort (``""``
        when missing).  The publication time is parsed from the first text
        node of ``div#artInfo``.

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication time cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"财新网"
        item["coll"] = getItemColl(self.name)
        item["host"] = "caixin.com"
        item["link"] = response.url
        paragraphs = sel.xpath("//div[@id='Main_Content_Val']//p/text()")
        item["content"] = "<br />".join(
            [node.extract().encode("utf8") for node in paragraphs])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1/text()")
        set_help(item, "source", "//div[@id='artInfo']/a/text()")

        try:
            stamp_re = re.compile(
                r".*(\d{4}).*(\d{2}).*(\d{2}).*(\d{2}):(\d{2}).*")
            info_text = sel.xpath(
                "//div[@id='artInfo']//text()")[0].extract().encode("utf8")
            found = stamp_re.search(info_text)
            # The whole match is used, not the groups.  A stamp in the format
            # below is 23 bytes once UTF-8 encoded (each CJK marker is three
            # bytes), so the slice keeps the stamp and drops what follows.
            item["update_time"] = datetime.strptime(
                found.group().strip()[0:23],
                "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#51
0
    def parse_item(self, response):
        """Build the item for a MIIT article page (host stored as "miit.com").

        Content is the ``<br />``-joined text under ``div#con_con``.  Title
        and source are best-effort (``""`` when missing).

        Returns:
            The populated ``Item``.

        Raises:
            DropItem: if the publication date cannot be located or parsed.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        main = "/html/body/div[@class='w980 center cmain']"

        # init item
        item["name"] = u"工信部"
        item["coll"] = getItemColl(self.name)
        item["host"] = "miit.com"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8")
            for node in sel.xpath(main + "/div[@id='con_con']//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(
            item, "title",
            main + "/div[@class='ctitle']/h1[@id='con_title']//text()")
        set_help(item, "source", "//div[@id='artInfo']/a/text()")

        try:
            # The first 15 bytes of the encoded text are skipped before parsing.
            # NOTE(review): presumably a fixed label in front of the date —
            # confirm against a live page.
            stamp = sel.xpath(
                main + "/div[@class='cinfo center']/span[@id='con_time']//text()"
            )[0].extract().encode("utf8")[15:]
            item["update_time"] = datetime.strptime(stamp, "%Y-%m-%d")
        except Exception as e:
            raise DropItem("get update_time failed, %s" % e)
        return item
示例#52
0
    def parse_item(self, response):
        """Build the item for a www.ship.sh page.

        Content joins every text node of the document with ``<br />``; the
        title is the first ``<h1>`` text node, or ``""`` if absent.  Author
        is always stored empty.

        Returns:
            The populated ``Item``.
        """
        sel = Selector(response)
        item = Item()

        def set_help(item, key, xpath):
            # Best-effort field: first XPath match, or "" when nothing matches.
            try:
                item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
            except Exception:
                item[key] = ""

        # init item
        item["name"] = u"航运界"
        item["coll"] = getItemColl(self.name)
        item["host"] = "www.ship.sh"
        item["link"] = response.url
        item["content"] = "<br />".join([
            node.extract().encode("utf8") for node in sel.xpath("//text()")
        ])
        item["summary"] = ""
        item["author"] = ""
        set_help(item, "title", "//h1//text()")

        return item