def parse_item(self, response):
    """Build an Item from a finance.sina.com.cn article response.

    Content is the ``<p>`` nodes under ``#artibody`` joined with ``<br />``.
    Title and source come from XPath lookups and default to "" when the
    lookup fails.  ``update_time`` is only set when the page timestamp
    can be located and parsed.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"新浪财经-国内财经"
    item["coll"] = getItemColl(self.name)
    item["host"] = "finance.sina.com.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//*[@id='artibody']/p")
    ])
    try:
        update_time = sel.xpath(
            "//*[@id='wrapOuter']//*[@class='time-source']/text()"
        )[0].extract().encode("utf8").strip()
        # The escaped bytes are the UTF-8 year/month/day characters,
        # followed by a no-break space (\xc2\xa0) before the time.
        item["update_time"] = datetime.strptime(
            update_time,
            "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5\xc2\xa0%H:%M")
    except Exception:
        # Best effort: the item is still returned without update_time.
        pass
    item["summary"] = ""
    item['author'] = ""
    set_help(item, "title", "//*[@id='artibodyTitle']/text()")
    set_help(item, "source", "//*[@data-sudaclick='media_name']/text()")
    return item
def parse_item(self, response):
    """Build an Item from a www.ndrc.gov.cn article response.

    Content is every text node under the ``TRS_Editor`` paragraphs joined
    with ``<br />``; the title is the first text node of the first
    paragraph's ``<font>`` element, or "" when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the editor container shared by content and title.
    editor = (
        "/html/body[@class='body_bg1']/div[@id='wrapper']"
        "/div[@id='out-content']"
        "/div[@class='index_wrapper1 screen_width clearfix']"
        "/div[@class='Middle4']/div[@class='Middle4_body']"
        "/div[@id='zoom']/div[@class='TRS_Editor']"
    )

    item = Item()
    # init item
    item["name"] = u"发改委"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.ndrc.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(editor + "/p//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", editor + "/p[1]/font//text()")
    return item
def parse_item(self, response):
    """Build an Item from a mof.gov.cn article response.

    Content is every text node under any ``<td>`` joined with ``<br />``.
    The title is read from ``td#Zoom``; when that yields fewer than two
    non-blank characters it is re-read from ``td.font_biao1``.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"财政部"
    item["coll"] = getItemColl(self.name)
    item["host"] = "mof.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//td//text()")])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//td[@id='Zoom']//text()")
    if len(item["title"].strip()) < 2:
        # Primary lookup gave (almost) nothing; try the alternate cell.
        set_help(item, "title", "//td[@class='font_biao1']//text()")
    item["source"] = ""
    return item
def parse_item(self, response):
    """Build an Item from a www.gov.cn article response.

    Content is every text node under ``table#printContent`` joined with
    ``<br />``; title and source are the first text nodes of the
    ``pages-title`` block and the second ``span.font`` of ``pages-date``,
    each defaulting to "" when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the article column shared by all three lookups.
    column = (
        "/html/body[@class='body_bg']/div[@class='wrap']"
        "/div[@class='frame-pane']/div[@class='article-colum']"
    )

    item = Item()
    # init item
    item["name"] = u"中国政府网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            column
            + "/div[@id='UCAP-CONTENT']/table[@id='printContent']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", column + "/div[@class='pages-title']//text()")
    set_help(
        item, "source",
        column + "/div[@class='pages-date']/span[@class='font'][2]//text()")
    return item
def parse_item(self, response):
    """Build an Item from a ggjd.cnstock.com announcement response.

    Content is every ``<p>`` node joined with ``<br />``.  Author, title
    and source come from XPath lookups under ``#pager-content`` and
    default to "" when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or parsed; the
        message is the text of the underlying error.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"中国证券网-上市公司公告快讯"
    item["coll"] = getItemColl(self.name)
    item["host"] = "ggjd.cnstock.com"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//p")])
    item["summary"] = ""
    set_help(item, "author", "//*[@id='pager-content']/div[1]/span[3]/text()")
    set_help(item, "title", "//*[@id='pager-content']/h1/text()")
    set_help(item, "source",
             "//*[@id='pager-content']/div[1]/span[2]/a/text()")
    try:
        pattern = re.compile(
            r"(\d{4})-(\d{2})-(\d{2}).*(\d{2}):(\d{2}):\d{2}")
        update_time = sel.xpath(
            '//*[@id="pager-content"]/div[1]/span[1]/text()')[0].extract()
        match = pattern.search(update_time)
        # The whole matched span is parsed, so the date and the time must
        # be separated by exactly one space for strptime to accept it.
        item["update_time"] = datetime.strptime(match.group(),
                                                "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        raise DropItem(str(e))
    return item
def parse_item(self, response):
    """Build an Item from a chinanews.com article response.

    Content is every text node under ``div#cont_1_1_2`` joined with
    ``<br />``; the title is the first text node under any ``<h1>``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"中新网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "chinanews.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf-8")
        for n in sel.xpath("//div[@id='cont_1_1_2']//text()")
    ])
    item["summary"] = ""
    item["source"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item from a news.smm.cn article response.

    Content is every text node under the ``div.bd`` of ``div#content``
    joined with ``<br />``; the title is the first text node under any
    ``<h1>``, or "" when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"上海有色网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "news.smm.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            "/html/body/div[@class='layout clear']/div[@class='chief']"
            "/div[@id='content']/div[@class='bd']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item from a money.163.com article response.

    Content is every text node under ``div#endText`` joined with
    ``<br />``; the title is the first text node of ``h1#h1title``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the main article container shared by both lookups.
    main = (
        "/html/body/div[@id='js-epContent']"
        "/div[@class='ep-content-bg clearfix']"
        "/div[@id='epContentLeft']/div[@class='ep-main-bg']"
    )

    item = Item()
    # init item
    item["name"] = u"网易财经"
    item["coll"] = getItemColl(self.name)
    item["host"] = "money.163.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(main + "/div[@id='endText']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", main + "/h1[@id='h1title']//text()")
    return item
def parse_item(self, response):
    """Build an Item from a techweb.com.cn article response.

    Content is every text node under ``div.article`` joined with
    ``<br />``; the title is the first text node under any ``<h1>``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"techweb"
    item["coll"] = getItemColl(self.name)
    item["host"] = "techweb.com.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            "/html/body/div[@class='main']/div[@class='left2']"
            "/div[@class='article']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item from a pbc.gov.cn article response.

    Content is every text node under the nested table cell addressed by
    the absolute XPath below, joined with ``<br />``; the title is the
    first text node under any ``<h2>``, or "" when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"央行发布"
    item["coll"] = getItemColl(self.name)
    item["host"] = "pbc.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            "/html/body/div[@class='mainw950']/div[@id='pre']"
            "/div[@id='10929']/div[2]/table[2]/tbody/tr/td"
            "/table/tbody/tr/td//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h2//text()")
    return item
def parse_item(self, response):
    """Build an Item from a www.morningwhistle.com article response.

    Content is every text node under ``div.t-list`` joined with
    ``<br />``; the title is the first text node under any ``<h1>``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or does not match
        ``%Y-%m-%d %H:%M:%S``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the article header/body container.
    cont = (
        "/html/body/div[@class='con-w p-t-xxl']"
        "/div[@class='list-cont con-w m-t-md']"
        "/div[@class='list-l']/div[@class='cont-t']"
    )

    item = Item()
    # init item
    item["name"] = u"晨哨网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.morningwhistle.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(cont + "/div[@class='t-list']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    try:
        update_time = sel.xpath(
            cont + "/p[@class='msg']/span[3]//text()"
        )[0].extract().encode("utf8")
        item["update_time"] = datetime.strptime(update_time,
                                                "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a news.xinhuanet.com politics article response.

    Content is the ``<p>`` nodes under ``#content`` joined with
    ``<br />``; when that is empty, the ``div.article`` nodes are used
    instead.  Title and source default to "" when their lookup fails.
    ``update_time`` is only set when the page timestamp can be located
    and parsed.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"新华网 -时政"
    item["coll"] = getItemColl(self.name)
    # NOTE(review): this host value carries a scheme and a trailing slash
    # rather than a bare hostname - confirm consumers expect that.
    item["host"] = "http://news.xinhuanet.com/"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//*[@id='content']//p")
    ])
    if not item["content"]:
        # Fallback when the primary lookup yields nothing.
        item["content"] = "<br />".join([
            n.extract().encode("utf8")
            for n in sel.xpath("//div[@class='article']")
        ])
    try:
        update_time = sel.xpath(
            "//*[@id='article']//*[@class='time']//text()"
        )[0].extract().encode("utf8").strip()
        # The escaped bytes are the UTF-8 year/month/day characters.
        item["update_time"] = datetime.strptime(
            update_time,
            "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M:%S")
    except Exception:
        # Best effort: the item is still returned without update_time.
        pass
    item["summary"] = ""
    item['author'] = ""
    set_help(item, "title", "//*[@id='title']/text()")
    set_help(item, "source", "//*[@id='source']/text()")
    return item
def parse_item(self, response):
    """Build an Item from a www.cnr.cn article response.

    Content is the ``div.TRS_Editor`` nodes joined with ``<br />``.
    Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``, or ``None`` (after printing the
        error) when the timestamp cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the left column holding title and byline.
    left = "/html/body/div[@id='Area']/div[@id='cntl']/div[@class='wh645 left']"
    byline = left + "/p[@class='lh30 left f14 yahei']"

    item = Item()
    # init item
    item["name"] = u"央广网-经济之声"
    item["coll"] = getItemColl(self.name)  # mongodb collection name
    item["host"] = "www.cnr.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//div[@class='TRS_Editor']")
    ])
    # summary is optional
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", left + "/p[@class='f22 lh30 yahei']//text()")
    # source is optional
    set_help(item, "source", byline + "/a//text()")
    try:
        # Only the first 19 bytes are parsed, which is the length of a
        # "YYYY-MM-DD HH:MM:SS" timestamp.
        item["update_time"] = datetime.strptime(
            sel.xpath(byline + "//text()")[0].extract().encode("utf8")[:19],
            "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        print("get update_time failed, error:%s" % e)
        return None
    return item
def parse_item(self, response):
    """Build an Item from a finance.chinanews.com article response.

    Content is the ``div#cont_1_1_2`` node(s) joined with ``<br />``.
    Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"中新网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "finance.chinanews.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//div[@id='cont_1_1_2']")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//*[@id='cont_1_1_2']/h1/text()")
    # Search from anywhere in the document: a path starting with
    # "/div[...]" is rooted at the document node, whose only element child
    # is <html>, so it could never match and source was always "".
    set_help(
        item, "source",
        "//div[@id='cont_1_1_2']/div[@class='left-time']"
        "/div[@class='left-t']/a[1]//text()")
    try:
        pattern = re.compile(
            r".*(\d{4}).*(\d{2}).*(\d{2}).*(\d{2}):(\d{2}).*")
        update_time = sel.xpath(
            '//*[@id="cont_1_1_2"]/div[4]/div[2]/text()'
        )[0].extract().encode("utf8")
        match = pattern.search(update_time)
        # Bytes 1..23 of the matched text are parsed; presumably this
        # skips one leading character before the date - confirm against
        # the page markup.
        item["update_time"] = datetime.strptime(
            match.group()[1:24],
            "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a www.pbc.gov.cn article response.

    Content is every text node under the nested table cell addressed by
    ``cell`` below, joined with ``<br />``.  Title and source default to
    "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or does not match
        ``%Y-%m-%d %H:%M:%S``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the table cell that wraps the whole article.
    cell = (
        "/html/body/div[@class='mainw950']/div[@id='pre']"
        "/div[@id='10929']/div[2]/table[2]/tbody/tr/td"
        "/table/tbody/tr/td"
    )

    item = Item()
    # init item
    item["name"] = u"央行发布"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.pbc.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8") for n in sel.xpath(cell + "//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(
        item, "title",
        cell + "/table[1]/tbody/tr/td/table/tbody/tr/td/h2//text()")
    set_help(
        item, "source",
        cell + "/table[2]/tbody/tr/td[@class='hui12'][2]"
               "/span[@id='laiyuan']//text()")
    try:
        # The second text node of the cell is the one that gets parsed.
        update_time = sel.xpath(
            cell + "/table[2]/tbody/tr/td[@class='hui12'][3]//text()"
        )[1].extract().encode("utf8")
        item["update_time"] = datetime.strptime(update_time.strip(),
                                                "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a www.infoq.com article response.

    Content is every text node under the elements that precede the first
    ``div.clear`` inside ``div.text_info``, joined with ``<br />``.
    Author and title default to "" when their lookup fails.  A marker
    line and the finished item are printed to stdout.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: if storing ``update_time`` fails.
    """
    print("parse_item***********************")
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = "infoq"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.infoq.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            "//div[@class='text_info']/div[@class='clear'][1]"
            "/preceding-sibling::*//text()")
    ])
    item["summary"] = ""
    set_help(item, "author",
             "//a[@class='editorlink f_taxonomyEditor']/text()")
    set_help(item, "title", "//head/title/text()")
    item["source"] = ""
    try:
        # NOTE(review): the timestamp is a hardcoded placeholder string;
        # nothing is extracted from the page - confirm this is intended.
        item["update_time"] = "2016-01-01 00:00:00"
    except Exception as e:
        raise DropItem(str(e))
    print(item)
    return item
def parse_item(self, response):
    """Build an Item from a www.21jingji.com article response.

    Content is every text node under ``div.content/section`` joined with
    ``<br />``; the title is the first text node under any ``<h1>``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"21经济"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.21jingji.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//div[@class='content']/section//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item directly from the raw body of a bn.sina.cn response.

    No XPath extraction is done: the whole response body is stored as the
    content, and the title is the first 20 characters of the body after
    decoding it as UTF-8.  The link is not the URL but ``"bnsina"``
    followed by the MD5 hex digest of the body.

    :param response: response whose ``body`` supplies link, content and
        title.
    :return: the populated ``Item``.
    """
    item = Item()
    # init item
    item["name"] = u"新浪财经"
    item["coll"] = getItemColl(self.name)
    item["host"] = "bn.sina.cn"
    item["link"] = "bnsina%s" % md5.md5(response.body).hexdigest()
    item["content"] = response.body
    item["summary"] = ""
    item["author"] = ""
    # Decode before slicing so the cut is on characters, not on bytes.
    item["title"] = response.body.decode("utf8")[:20].encode("utf8")
    return item
def parse_item(self, response):
    """Build an Item from a www.gov.cn (table-layout) page response.

    Content, title, source and timestamp are all read from fixed cells of
    the page's nested tables.  Title and source default to "" when their
    lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the date cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    wrap = "/html/body[@class='body_bg']/div[@class='wrap']"
    # Metadata table that holds the title, source and date rows.
    meta = wrap + "/table[1]/tbody/tr/td/table[@class='bd1'][1]/tbody"

    item = Item()
    # init item
    item["name"] = u"中国政府网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(
            wrap
            + "/table[2]/tbody/tr/td[1]/table/tbody/tr/td/table[1]"
              "/tbody/tr[1]/td[@class='b12c']/p[2]//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", meta + "/tr[3]/td[2]//text()")
    set_help(item, "source", meta + "/tr[2]/td[2]//text()")
    try:
        update_time = sel.xpath(
            meta + "/tr[4]/td[4]//text()")[0].extract().encode("utf8")
        # The escaped bytes are the UTF-8 year/month/day characters.
        item["update_time"] = datetime.strptime(
            update_time, "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a www.sasac.gov.cn article response.

    Content is every text node under ``div#con_con`` joined with
    ``<br />``.  Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the date cannot be found or does not match
        ``%Y-%m-%d``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    main = "/html/body/div[@class='main center']"
    info = main + "/div[@class='tinfo relative']"

    item = Item()
    # init item
    item["name"] = u"国资委"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.sasac.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(main + "/div[@id='con_con']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title",
             main + "/div[@class='ttitle']/h1[@id='con_title']//text()")
    set_help(item, "source", info + "/span[@id='con_ly']//text()")
    try:
        update_time = sel.xpath(
            info + "/span[@id='con_time']//text()"
        )[0].extract().encode("utf8")
        item["update_time"] = datetime.strptime(update_time.strip(),
                                                "%Y-%m-%d")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a www.chinatimes.cc article response.

    Content is every text node under ``div.infoMain`` joined with
    ``<br />``.  Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the article container.
    box = (
        "/html/body/div[@class='main']/div[@class='mainLeft']"
        "/div[@class='newList']/div[@class='content']"
    )

    item = Item()
    # init item
    item["name"] = u"华夏时报网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.chinatimes.cc"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(box + "/div[@class='infoMain']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    set_help(item, "source",
             box + "/div[@class='info']/p[@id='source_baidu']//text()")
    try:
        pattern = re.compile(r".*?(\d+?-\d+?-\d+?.*?\d+?:\d+?:\d+).*")
        # The eighth text node of the info block is the one searched for
        # a timestamp.
        update_time = sel.xpath(
            box + "/div[@class='info']//text()")[7].extract().encode("utf8")
        match = pattern.search(update_time)
        item["update_time"] = datetime.strptime(match.groups()[0],
                                                "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a www.yicai.com article response.

    Content is every text node under any ``<p>`` joined with ``<br />``.
    Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"一财网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.yicai.com"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//p//text()")])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    # NOTE(review): this is a layout-specific absolute path; confirm it
    # matches this site's markup, otherwise source is always "".
    set_help(
        item, "source",
        "/html/body/div[@class='main']/div[@class='mainLeft']"
        "/div[@class='newList']/div[@class='content']"
        "/div[@class='info']/p[@id='source_baidu']//text()")
    return item
def parse_item(self, response): def set_help(item, key, xpath): try: item[key] = sel.xpath(xpath)[0].extract().encode("utf8") except: item[key] = "" sel = Selector(response) item = Item() # init item item["name"] = u"证监会要闻" item["coll"] = getItemColl(self.name) item["host"] = "csrc.gov.cn" item["link"] = response.url item["content"] = "<br />".join([n.extract().encode("utf8") for n in sel.xpath("//p//text()")]) set_help(item, "title", "//div[@class='title']/text()") print item["title"], item["link"] return item
def parse_item(self, response):
    """Build an Item from an aastocks.com article response.

    Content is every text node under any ``<p>`` joined with ``<br />``;
    the title is the first text node of ``span#lblSTitle``, or "" when
    the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"阿思達克财经"
    item["coll"] = getItemColl(self.name)
    item["host"] = "aastocks.com"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//p//text()")])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//span[@id='lblSTitle']//text()")
    return item
def parse_item(self, response):
    """Build an Item from a mofcom.gov.cn news response.

    Content is every text node under ``div#zoom`` joined with ``<br />``.
    Title and source default to "" when their lookup fails.  The
    timestamp is taken from a ``var tm = "..."`` assignment found in the
    raw response body rather than from the rendered markup.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link and its ``body`` is searched for the timestamp.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or does not match
        ``%Y-%m-%d %H:%M:%S``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    # Absolute path to the article container.
    article = (
        "/html/body/div[@id='wrap']/div[@class='MainList']"
        "/div[@class='article_new mt10']"
    )

    item = Item()
    # init item
    item["name"] = u"商务部日常新闻"
    item["coll"] = getItemColl(self.name)
    item["host"] = "mofcom.gov.cn"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(article + "/div[@id='zoom']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", article + "/h4[@id='artitle']//text()")
    set_help(item, "source", article + "/div[@id='arsource']/a//text()")
    try:
        item["update_time"] = datetime.strptime(
            re.search('var tm = \"(.*?)\"', response.body).groups()[0],
            "%Y-%m-%d %H:%M:%S")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a news.xinhuanet.com finance article response.

    Content is every text node under any ``<p>`` joined with ``<br />``;
    the title is the first text node under any ``<h1>``, or "" when the
    lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"新华网财经联播"
    item["coll"] = getItemColl(self.name)
    item["host"] = "news.xinhuanet.com"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//p//text()")])
    item["summary"] = ""
    item['author'] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item from a kuaixun.stcn.com news-flash response.

    Content is every ``<p>`` node joined with ``<br />``.  The source is
    whatever follows the first full-width colon in the first text node of
    the ``info`` block, or "" when that node or the colon is missing.
    The title defaults to "" when its lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf-8")
        except Exception:
            item[key] = ""

    # The same info line carries both the source and the timestamp.
    info_xpath = "//div[@class='intal_tit']/div[@class='info']/text()"

    item = Item()
    # init item
    item["name"] = u"证券时报"
    item["coll"] = getItemColl(self.name)
    item["host"] = "kuaixun.stcn.com"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf-8") for n in sel.xpath("//p")])
    item["summary"] = ""
    try:
        # "\xef\xbc\x9a" is the UTF-8 full-width colon; the source is the
        # text that follows it.
        item["source"] = sel.xpath(info_xpath)[0].extract().encode(
            "utf-8").split("\xef\xbc\x9a")[1]
    except IndexError:
        # No info node or no colon: fall back to an empty source instead
        # of letting the IndexError escape the callback.
        item["source"] = ""
    item["author"] = ""
    set_help(item, "title", "//div[@class='intal_tit']/h2/text()")
    try:
        pattern = re.compile(r".*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})")
        update_time = sel.xpath(info_xpath)[0].extract()
        match = pattern.search(update_time)
        item["update_time"] = datetime.strptime(match.group(1),
                                                "%Y-%m-%d %H:%M")
    except Exception:
        raise DropItem("get update_time failed")
    return item
def parse_item(self, response):
    """Build an Item from a www.ship.sh article response.

    Content is every text node of the whole document joined with
    ``<br />``; the title is the first text node under any ``<h1>``, or ""
    when the lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"航运界"
    item["coll"] = getItemColl(self.name)
    item["host"] = "www.ship.sh"
    item["link"] = response.url
    item["content"] = "<br />".join(
        [n.extract().encode("utf8") for n in sel.xpath("//text()")])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1//text()")
    return item
def parse_item(self, response):
    """Build an Item from an article response for the "miit.com" host.

    Content is every text node under ``div#con_con`` joined with
    ``<br />``.  Title and source default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the date cannot be found or does not match
        ``%Y-%m-%d``.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    page = "/html/body/div[@class='w980 center cmain']"

    item = Item()
    # init item
    item["name"] = u"工信部"
    item["coll"] = getItemColl(self.name)
    item["host"] = "miit.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath(page + "/div[@id='con_con']//text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title",
             page + "/div[@class='ctitle']/h1[@id='con_title']//text()")
    # NOTE(review): unlike the other lookups this one is not anchored
    # under the page container - confirm it matches this site's markup.
    set_help(item, "source", "//div[@id='artInfo']/a/text()")
    try:
        # The first 15 bytes of the UTF-8 text are skipped; presumably
        # that is a label in front of the date - confirm against the page.
        update_time = sel.xpath(
            page + "/div[@class='cinfo center']/span[@id='con_time']//text()"
        )[0].extract().encode("utf8")[15:]
        item["update_time"] = datetime.strptime(update_time, "%Y-%m-%d")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item
def parse_item(self, response):
    """Build an Item from a caixin.com article response.

    Content is the direct text of every ``<p>`` under
    ``div#Main_Content_Val`` joined with ``<br />``.  Title and source
    default to "" when their lookup fails.

    :param response: response handed to ``Selector``; its ``url`` becomes
        the item's link.
    :return: the populated ``Item``.
    :raises DropItem: when the timestamp cannot be found or parsed.
    """
    sel = Selector(response)

    def set_help(item, key, xpath):
        # Store the first XPath match (UTF-8 encoded) or "" on failure.
        # ``except Exception`` lets KeyboardInterrupt/SystemExit propagate.
        try:
            item[key] = sel.xpath(xpath)[0].extract().encode("utf8")
        except Exception:
            item[key] = ""

    item = Item()
    # init item
    item["name"] = u"财新网"
    item["coll"] = getItemColl(self.name)
    item["host"] = "caixin.com"
    item["link"] = response.url
    item["content"] = "<br />".join([
        n.extract().encode("utf8")
        for n in sel.xpath("//div[@id='Main_Content_Val']//p/text()")
    ])
    item["summary"] = ""
    item["author"] = ""
    set_help(item, "title", "//h1/text()")
    set_help(item, "source", "//div[@id='artInfo']/a/text()")
    try:
        pattern = re.compile(
            r".*(\d{4}).*(\d{2}).*(\d{2}).*(\d{2}):(\d{2}).*")
        update_time = sel.xpath(
            "//div[@id='artInfo']//text()")[0].extract().encode("utf8")
        match = pattern.search(update_time)
        # After stripping, only the first 23 bytes are parsed; the format
        # below (with 3-byte UTF-8 year/month/day characters) is 23 bytes
        # long for a zero-padded timestamp.
        item["update_time"] = datetime.strptime(
            match.group().strip()[0:23],
            "%Y\xe5\xb9\xb4%m\xe6\x9c\x88%d\xe6\x97\xa5 %H:%M")
    except Exception as e:
        raise DropItem("get update_time failed, %s" % e)
    return item