def parse(self, response):
    """Yield one item for every 财联社 listing entry that is not stored yet.

    The page is parsed with lxml from ``response.text``.  An entry's
    timestamp is also its identity: the item URL is derived from it and
    looked up in ``NewsItemInfo`` (``web_id=73``) before the item is yielded.

    :param response: downloaded listing page.
    :return: generator of ``FagaiweiItem`` objects.
    """
    html = etree.HTML(response.text)
    # Only the child divs after the first one and before the last three
    # are treated as entries.
    divs = html.xpath("//div[@data-jsx='99852006']/div")[1:-3]
    dates = ''.join(html.xpath(".//div[@class='time']/text()")).strip()
    for div in divs:
        # Build a fresh item per entry.  The original created one item
        # before the loop and re-yielded it, so every yielded entry was the
        # same mutable object and later iterations overwrote earlier ones.
        item = FagaiweiItem()
        item['webname'] = '财联社'
        item['web'] = 'http://cailianpress.com'
        try:
            da = ''.join(
                div.xpath(".//div/div[@class='cTime']/text()")).strip()
            # Prefix the page-level date unless the entry already has it.
            times = da if dates in da else dates + ' ' + da
        except Exception:
            # Best-effort fallback kept from the original: use "now".
            times = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        content = ''.join(
            div.xpath(".//div[@class='newsRight']/p/text()")).strip()
        # Title is the leading "【...】" headline when it is long enough,
        # otherwise the first 30 characters of the content.
        title = content[0:30]
        if "【" in content:
            headline = content.split('】')[0] + "】"
            if len(headline) >= 10:
                title = headline
        item['title'] = title
        item['pub_time'] = times
        item['content'] = content
        item['url'] = 'http://cailianpress.com?' + times.replace(
            " ", "").replace(":", "")
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 73
        already_stored = session.query(NewsItemInfo).filter_by(
            url=item['url'], web_id=73).count()
        if not already_stored:
            yield item
def parse_page(self, response):
    """Build the item for one 中证网 article page.

    The content is the header span text (all but the last node) joined by
    spaces, a newline, then the article paragraphs joined by newlines with
    ideographic and non-breaking spaces removed.

    :param response: downloaded article page.
    :return: the populated ``FagaiweiItem``.
    """
    item = FagaiweiItem()
    item['url'] = response.url
    item['pub_time'] = response.xpath("//span[@class='Ff']/text()").get()
    item['title'] = response.xpath("//h1/text()").get()

    header_nodes = response.xpath(
        "//div[@class='artical_t']//span//text()")[0:-1]
    header_text = ' '.join(header_nodes.getall())

    paragraphs = response.xpath("//div[@class='artical_c']/p/text()").getall()
    body_text = '\n'.join(paragraphs)
    body_text = body_text.replace('\u3000', '').replace('\xa0', '')

    item['content'] = header_text + '\n' + body_text
    item['web'] = 'http://www.cs.com.cn/sylm/jsxw/'
    item['webname'] = '中证网'
    item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    item["keyword"] = keyword.get_keyword(item["content"])
    item['web_id'] = 62
    return item
def process_detail(self, response):
    """Yield the item for one kan.china.com article page (``web_id=34``).

    Missing title, source and publication time fall back to ``''``,
    ``'热点新闻'`` and the current ``datetime`` respectively.

    :param response: downloaded article page.
    """
    content_xpath = '//*[@id="main-content"]/p/text() | \
        //*[@id="main-content"]/p/strong/text() | \
        //*[@id="main-content"]/p/strong/span/text()'

    item = FagaiweiItem()
    item['web_id'] = 34
    item['url'] = response.url

    title_nodes = response.xpath('//div[@class="article-header"]/h1/text()')
    item['title'] = title_nodes.extract_first(default='')
    item['web'] = 'http://kan.china.com/'

    source_nodes = response.xpath('//*[@id="article-source"]/text()')
    item['webname'] = source_nodes.extract_first(default='热点新闻')

    date_nodes = response.xpath('//*[@id="article-data"]/text()')
    item['pub_time'] = date_nodes.extract_first(default=datetime.now())

    fragments = response.xpath(content_xpath).extract()
    item['content'] = '\n'.join(fragments)
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def parse(self, response):
    """Yield an item for every listed PDF not yet stored (``web_id=79``).

    URLs, titles and dates are extracted as three parallel lists and paired
    positionally.  Only links ending in ``.pdf`` are considered; the PDF
    text is obtained through ``pdf.main``.

    :param response: downloaded listing page.
    """
    titles = response.xpath(
        "//div[contains(@class,'title')]/a/text()").getall()
    urls = response.xpath(
        "//div[contains(@class,'items-col')]/a/@href").getall()
    dates = response.xpath(
        "//div[@class='items']/div[contains(@class,'date')]/text()"
    ).getall()

    for link, raw_title, raw_date in zip(urls, titles, dates):
        if link[-4:] != '.pdf':
            continue
        stored = session.query(NewsItemInfo).filter_by(
            url=link, web_id=79).count()
        if stored:
            continue

        item = FagaiweiItem()
        item['webname'] = '中华交易服务'
        item['web'] = response.url

        # The cleaned title is also passed to pdf.main as fileName.
        clean_title = raw_title
        for unwanted in ('*', '/', '<', '>', '|', ':', '"', '?', '\t'):
            clean_title = clean_title.replace(unwanted, '')

        item['pub_time'] = datetime.strptime(
            raw_date.replace('/', '-'), '%d-%m-%Y')
        item['url'] = link
        item['title'] = clean_title

        pdf_text = pdf.main(url=link, fileName=clean_title)
        if pdf_text == '':
            item['content'] = '请点击原文链接查看' + response.url
        else:
            item['content'] = ''.join(pdf_text)
        item["keyword"] = keyword.get_keyword(item["content"])
        item['web_id'] = 79
        yield item
def parse(self, response):
    """Yield an item for every announcement PDF not yet stored (``web_id=67``).

    Each listing link carries ``=<8 digits><name>``; those two parts are
    used to build the PDF download URL, which is also the stored item URL.
    The PDF text is obtained through ``pdf.main``.

    :param response: downloaded listing page.
    :return: generator of ``FagaiweiItem`` objects.
    """
    urls = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='tit']/a/@href").getall()
    titles1 = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='tit']/a/text()").getall()
    titles2 = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='code']/a/text()").getall()
    times = response.xpath(
        "//ul[@class='gg-list']/li/span[@class='time']/text()").getall()

    for url, title1, title2, pub_time in zip(urls, titles1, titles2, times):
        found = re.search(r'=(\d{8})(\w+)', url)
        if found is None:
            # The original indexed re.findall(...)[0] and raised IndexError
            # here, which aborted the rest of the listing.
            continue
        shijian, filename = found.groups()
        # NOTE(review): the year directory is hard-coded to 2018 while
        # shijian carries its own year -- confirm against the site whether
        # it should be shijian[:4].
        url2 = ('http://php.cnstock.com/texts/2018/' + shijian + '/'
                + filename + '.pdf')  # PDF download URL

        if session.query(NewsItemInfo).filter_by(url=url2, web_id=67).count():
            continue

        title = title2 + ' ' + title1
        for unwanted in ('*', '/', '<', '>', '|', ':', '"', '?'):
            title = title.replace(unwanted, '')

        content = pdf.main(url=url2, fileName=title)

        # Build a fresh item per entry; the original reused one item created
        # before the loop, so all yielded items were the same object.
        item = FagaiweiItem()
        item['content'] = ''.join(content) if content else '请点击原文链接查看'
        item['web_id'] = 67
        item['title'] = title
        item['pub_time'] = pub_time.replace('(', '').replace(')', '')
        item['webname'] = '中国证券网信息披露平台'
        item['web'] = response.url
        item['url'] = url2
        item["keyword"] = keyword.get_keyword(item["content"])
        yield item
def get_detail(self, response):
    """Build the item for one article detail page (``web_id=1``).

    The publication time is parsed from the ``pages-date`` block; when no
    ``YYYY-MM-DD HH:MM`` stamp is found there, ``response.meta["date"]`` is
    used instead.

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    import re  # local import: the original block did not use re

    item = FagaiweiItem()
    item["url"] = response.url
    item["title"] = response.xpath("//h1/text()").get()
    contents = "".join(
        response.xpath('//*[@id="UCAP-CONTENT"]/p/text()|\
            //*[@id="UCAP-CONTENT"]/p/span/span/text()|\
            //div[@class="pages_content"]/p/text()|\
            //div[@class="pages_content"]/p/a/text()|\
            //div[@class="pages_content"]/div/p/text()|\
            //*[@id="UCAP-CONTENT"]/p/span/text()'
                       ).extract())
    item["content"] = contents or "可能是图片或表格 打开原网站查看"

    date_text = "".join(
        response.xpath('//div[@class="pages-date"]/text()').extract())
    # The original removed every space and then parsed with
    # '%Y-%m-%d %H:%M:%S'; strptime requires whitespace where the format
    # has it, so that parse raised ValueError.  Pick the two parts out with
    # a regex and rebuild the string instead.
    stamp = re.search(r'(\d{4}-\d{1,2}-\d{1,2})\s*(\d{1,2}:\d{2})', date_text)
    if stamp:
        date = datetime.datetime.strptime(
            '{} {}:00'.format(stamp.group(1), stamp.group(2)),
            '%Y-%m-%d %H:%M:%S')
    else:
        date = response.meta["date"]
    item["pub_time"] = date

    from_s = "".join(
        response.xpath('//div[@class="pages-date"]/span/text()').extract())
    webname = from_s or "国务院新闻"
    item["webname"] = webname.replace("来源:", "")
    item["web"] = response.meta["laiyuan"]
    item["keyword"] = keyword.get_keyword(item["content"])
    item["web_id"] = 1
    return item
def process_detail(self, response):
    """Yield the item for one article detail page (``web_id=22``).

    Source name and publication time are picked out of the two ``<em>``
    nodes in the article header; either is left as ``''`` when absent.

    :param response: downloaded detail page; ``meta`` may carry ``title``
        and ``web``.
    """
    item = FagaiweiItem()
    item['web_id'] = 22
    item['url'] = response.url
    item['title'] = response.meta.get('title')
    item['web'] = response.meta.get('web')

    # default='' keeps a missing node from raising TypeError on `+ ' '`
    # (the original called extract_first() with no default).
    # The trailing space lets the `\s` terminator in the regex match.
    news_about = response.xpath(
        '//div[@class="Article_61"]/h3[@class="daty"]/div/em[1]/text()'
    ).extract_first(default='') + ' '
    item['webname'] = ''.join(re.findall(r'来源:(.*?)\s', news_about))

    # Same guard here: re.findall rejects None.
    time_text = response.xpath(
        '//div[@class="Article_61"]/h3[@class="daty"]/div/em[2]/text()'
    ).extract_first(default='')
    item['pub_time'] = ''.join(
        re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', time_text))

    content = '\n'.join(response.xpath('//div[@class="Article_61"]/div[@class="content"]/div/div/p/text() | \
        //div[@class="Article_61"]/div[@class="content"]/div/div/p/font/text() | \
        //div[@class="Article_61"]/div[@class="content"]/div/div/p/strong/text() ').extract())
    if not content:
        content = '这可能是图片或者文件,打开查看!'
    item['content'] = content
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def process_detail(self, response):
    """Yield the item for one article detail page (``web_id=37``).

    Pages without a ``news_txt`` container yield nothing.

    :param response: downloaded detail page; ``meta`` may carry ``web``.
    """
    # Guard added to exclude a few special URLs.
    if not response.xpath('//div[@class="news_txt"]'):
        return

    item = FagaiweiItem()
    item["web_id"] = 37
    item["url"] = response.url
    item["title"] = response.xpath(
        '//h1[@class="news_title"]/text()').extract_first()
    # default='' keeps a missing node from raising AttributeError on
    # .strip() (the original called extract_first() with no default).
    item["pub_time"] = response.xpath(
        '//div[@class="news_about"]/p[2]/text()').extract_first(
            default='').strip()
    item["content"] = '\n'.join(
        response.xpath('//div[@class="news_txt"]/div/text() | \
            //div[@class="news_txt"]/text() | \
            //div[@class="news_txt"]/strong/text() '
                       ).extract())
    item["webname"] = response.xpath(
        '//div[@class="news_about"]/p[1]/text()').extract_first(
            default='').strip()
    item["web"] = response.meta.get('web')
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=16``).

    Date, title and site come from ``response.meta``.  The source name is
    the first space-separated token of the ``right_md_laiy`` heading with
    "一" removed, defaulting to "中国工程院".

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"``, ``"title"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    meta = response.meta
    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = meta["date"]
    item["title"] = meta["title"]

    heading = "".join(
        response.xpath('//div[@class="right_md_laiy"]/h4/text()').extract())
    source = heading.split(" ")[0].replace("一", "")
    item["webname"] = "中国工程院" if source == "" else source
    item["web"] = meta["laiyuan"]
    item["web_id"] = 16

    body_xpath = '\
        //*[@id="zoom"]/div/p/text()|\
        //*[@id="zoom"]/div/p/span/text()|\
        //*[@id="zoom"]/strong/span/p/strong/text()|\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/a/text()|\
        //*[@id="zoom"]/p/b/span/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/p/span/span/text()|\
        //*[@id="zoom"]/span/p/text()|\
        //*[@id="zoom"]/span/p/a/text()|\
        //*[@id="zoom"]/span/p/a/font/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
        //*[@id="zoom"]/span/strong/span/span/p/strong/text()'
    body = "".join(response.xpath(body_xpath).extract())
    if body == "":
        item["content"] = "可能是图片 请打开详情页查看"
    else:
        for blank in ("\u3000", "\xa0", "\u2002"):
            body = body.replace(blank, "")
        item["content"] = body
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=23``).

    Title, date and site come from ``response.meta``.  The source name is
    taken from the first ``<li>`` of a list, then from a ``laiyuan = "..."``
    assignment in the page text, and finally defaults to "海南省政府网站".

    :param response: downloaded detail page; ``meta`` must carry
        ``"title"``, ``"date"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    item = FagaiweiItem()
    item["url"] = response.url
    item["title"] = response.meta["title"]
    item["pub_time"] = response.meta["date"]
    # The original was missing the `|` between the last two branches, which
    # fused them into a single path descending from text nodes; that path
    # matches nothing, so both branches were silently lost.
    # NOTE(review): several branches end in `/text/text()`, i.e. they look
    # for a <text> element -- confirm this is intended and not `/text()`.
    contents = "\n".join(response.xpath('\
        //*[@id="Zoom"]/span/p/text()|\
        //*[@id="Zoom"]/span/p/font/text/text()|\
        //*[@id="Zoom"]/span/p/font/strong/text/text()|\
        //div[@class="TRS_Editor"]/div/b/text()|\
        //div[@class="TRS_Editor"]/p/text()|\
        //div[@class="TRS_Editor"]/p/font/text()|\
        //div[@class="TRS_Editor"]/p/strong/text()|\
        //div[@class="TRS_Editor"]/p/a/text()|\
        //div[@class="TRS_Editor"]/p/a/font/text()|\
        //div[@class="TRS_Editor"]/div/p/text()|\
        //div[@class="TRS_Editor"]/div/text/text()|\
        //*[@id="neirongText"]/div/p/text()|\
        //*[@id="neirongText"]/div/p/text/text()|\
        //*[@id="neirongText"]/div/p/strong/text/text()').extract())
    if contents == "":
        item["content"] = "可能是图片或表格 打开原网站查看"
    else:
        item["content"] = contents.replace("\u3000", " ").replace("\xa0", " ")

    form_s = "".join(response.xpath('//ul/li[1]/text()').extract())
    if form_s == "":
        # Can only be matched with a regex.
        com = re.compile(r'laiyuan = "(.*?)";')
        form_s = "".join(re.findall(com, response.text))
        if form_s == "":
            form_s = "海南省政府网站"
    item["webname"] = form_s.replace("来源:", "")
    item["web"] = response.meta["laiyuan"]
    item["keyword"] = keyword.get_keyword(item["content"])
    item["web_id"] = 23
    return item
def parse_page(self, response):
    """Build the item for one 投资时报 article page (``web_id=28``).

    The publication time is the first ``YYYY-MM-DD HH:MM`` stamp found in
    the ``s14`` paragraphs, with ``:00`` appended; when no stamp is found
    the crawl time is used instead.

    :param response: downloaded article page; ``meta`` must carry ``"url"``.
    :return: the populated ``FagaiweiItem``.
    """
    item = FagaiweiItem()
    item['webname'] = '投资时报'
    item['web'] = response.meta['url']
    # default='' keeps a page without <h2> from raising AttributeError.
    # NOTE(review): '/n' is inserted literally; it looks like a typo for a
    # newline or a space -- left unchanged pending confirmation.
    item['title'] = response.xpath("//h2/text()").get(
        default='').replace('\xa0', '/n')
    item['url'] = response.url
    item['content'] = ''.join(response.xpath(
        "//div[@class='para_ycont']/p/text()|"
        "//div[@class='para_ycont']/p/span/text()|"
        "//div[@class='para_ycont']/div/text()|"
        "//div[@class='para_ycont']/text()").getall()) \
        .replace('\r\n', '').replace('\xa0', '')
    times = ''.join(response.xpath(
        "//p[contains(@class,'s14')]/text()|"
        "//p[contains(@class,'s14')][1]/text()").getall())

    crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    stamp = re.search(
        r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}', times)
    # The original called .group() directly and crashed when nothing matched.
    item['pub_time'] = stamp.group() + ':00' if stamp else crawl_time
    item['add_time'] = crawl_time
    item["keyword"] = keyword.get_keyword(item["content"])
    item['web_id'] = 28
    return item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=9``).

    Date, title and site come from ``response.meta``; the source name is
    read from the second ``wz_info`` span and defaults to "国家体育局".

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"``, ``"title"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    meta = response.meta
    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = meta["date"]
    item["title"] = meta["title"]
    item["web"] = meta["laiyuan"]
    item["web_id"] = 9

    body_xpath = '//*[@id="ziti"]/p/text()|\
        //table[@class="MsoNormalTable"]/tbody/tr/td/p/span/span/span/text()|\
        //table[@class="MsoNormalTable"]/tbody/tr/td/p/span/span/span/span/text()|\
        //*[@id="ziti"]/p/font/text()|\
        //*[@id="ziti"]/p/span/text()|\
        //*[@id="ziti"]/p/span/font/text()|\
        //*[@id="ziti"]/p/span/span/text()|\
        //*[@id="ziti"]/p/span/span/span/text()|\
        //*[@id="ziti"]/p/span/span/span/span/text()|\
        //*[@id="ziti"]/p/span/span/span/span/span/text()|\
        //*[@id="ziti"]/p/span/span/span/span/span/span/text()|\
        //*[@id="ziti"]/p/b/span/span/span/text()|\
        //*[@id="ziti"]/p/span/span/span/font/text()|\
        //*[@id="ziti"]/p/text()'
    body = "".join(response.xpath(body_xpath).extract())
    if body == "":
        item["content"] = "国家体育局 可能是图片 打开原文查看"
    else:
        item["content"] = body.replace("\u3000", "").replace("\xa0", "")

    source = "".join(
        response.xpath('//div[@class="wz_info"]/span[2]/text()').extract())
    source = source.replace("来源:", "")
    item["webname"] = "国家体育局" if source == "" else source
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def parse(self, response):
    """Yield an item for every listed PDF not yet stored (``web_id=80``).

    Each table row supplies the link, the two title parts and the
    timestamp; the PDF text is obtained through ``pdf.main``.

    :param response: downloaded listing page.
    :return: generator of ``FagaiweiItem`` objects.
    """
    info_list = response.xpath(
        '//body/table[2]/tr[3]/td/table/tr[contains(@class,"row")]')
    for info in info_list:
        url = 'http://www.hkexnews.hk' + info.xpath(
            './td[4]/a/@href').extract_first(default='')
        if not url.endswith('.pdf'):
            continue
        if session.query(NewsItemInfo).filter_by(url=url, web_id=80).count():
            continue

        item = FagaiweiItem()
        item['url'] = url
        # default='' on the first part too: the original concatenated the
        # result of extract_first() directly and raised TypeError on None.
        title = info.xpath('./td[3]/nobr/text()').extract_first(
            default='') + ':' + info.xpath(
                './td[4]/div/text()').extract_first(default='').strip()
        # The cleaned title is also passed to pdf.main as fileName.
        for unwanted in ('*', '/', '<', '>', '|', ':', '"', '?'):
            title = title.replace(unwanted, '')
        item['title'] = title
        item['web'] = response.url
        item['webname'] = '披漏网'
        stamp = ' '.join(
            info.xpath('./td[1]/text()').extract()).replace('/', '-')
        item['pub_time'] = datetime.strptime(stamp, '%d-%m-%Y %H:%M')
        content = pdf.main(url=url, fileName=title)
        if len(content) == 0:
            item['content'] = '这可能是图片或者文件,打开查看!'
        else:
            item['content'] = ''.join(content)
        item['web_id'] = 80
        item["keyword"] = keyword.get_keyword(item["content"])
        yield item
def parse_item(self, response):
    """Yield the item for one article page unless it is already stored.

    Nothing is yielded when the status is not 200, when the page has no
    ``#p-detail`` element, or when ``NewsItemInfo`` already holds the URL
    for ``web_id=40``.

    :param response: downloaded article page.
    """
    conten_detail = response.xpath('//*[@id="p-detail"]').extract_first(
        default='')
    if response.status != 200 or not conten_detail:
        return

    item = FagaiweiItem()
    item['webname'] = response.css('#source::text').extract_first('新华国际')
    # Everything before the first run of digits in the URL.
    item['web'] = re.split(r'[0-9]+', response.url)[0]
    # default='' instead of None: the original crashed on .strip() when
    # the title node was missing.
    item['title'] = response.xpath(
        '//div[@class="h-title"]/text()').extract_first(default='').strip()
    item['pub_time'] = response.css('.h-time::text').extract_first()
    content = '\n'.join(
        response.xpath('//div[@id="p-detail"]//p//strong/text() | \
            //div[@id="p-detail"]//p//strong/font/text() | \
            //div[@id="p-detail"]//p/font/text() |\
            //div[@id="p-detail"]//p/text()| \
            //div[@id="p-detail"]//p/font/strong/text()| \
            //div[@id="p-detail"]//p/p/text() '
                       ).extract())
    # The original skipped this when content was exactly '\n'; that string
    # holds no U+3000, so replacing unconditionally gives the same result.
    item['content'] = content.replace('\u3000', '')
    item['url'] = response.url
    # NOTE(review): the time part is written with hyphens ('%H-%M-%S') --
    # confirm whether '%H:%M:%S' was intended.
    item['add_time'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H-%M-%S')
    item['web_id'] = 40
    item["keyword"] = keyword.get_keyword(item["content"])
    already_stored = session.query(NewsItemInfo).filter_by(
        url=item['url'], web_id=40).count()
    if not already_stored:
        yield item
def process_detail(self, response):
    """Yield the item for one article detail page (``web_id=65``).

    Source name and publication time are both picked out of the
    ``info_news`` line; either is left as ``''`` when it is absent.

    :param response: downloaded detail page; ``meta`` may carry ``web``.
    """
    item = FagaiweiItem()
    item['web_id'] = 65
    item['url'] = response.url
    item['title'] = response.xpath(
        '//div[@class="news_content"]/h1/text()').extract_first(default='')
    item['web'] = response.meta.get('web')
    # The trailing space lets the `\s` terminator in the regex match.
    news_about = response.xpath(
        '//div[@class="news_content"]/div[@class="info_news"]/text()'
    ).extract_first(default='') + ' '

    # The original called .group(1) on the search result directly and
    # raised AttributeError when the line had no source or no timestamp.
    source = re.search(r'来源:(.*?)\s', news_about)
    item['webname'] = source.group(1) if source else ''
    stamp = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', news_about)
    item['pub_time'] = stamp.group(1) if stamp else ''

    content = '\n'.join(
        response.xpath(
            '//div[@class="news_content"]/div[@class="content"]/p/text() | \
            //div[@class="news_content"]/div[@class="content"]/p/strong/text()'
        ).extract()).replace('\u2002', '').replace('\xa0', '')
    if not content:
        content = '这可能是图片或者文件,打开查看!'
    item['content'] = content
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def get_detail(self, response):
    """Build the item for one 海关总署 detail page (``web_id=11``).

    Date, title and site come from ``response.meta``; only the body text
    is read from the page.

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"``, ``"title"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    meta = response.meta
    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = meta["date"]
    item["title"] = meta["title"]
    item["webname"] = "海关总署"
    item["web"] = meta["laiyuan"]
    item["web_id"] = 11

    body_xpath = '//*[@id="easysiteText"]/p/text()|\
        //p[@class="p1"]/text()|\
        //p[@class="p1"]/span/text()|\
        //*[@id="easysiteText"]/p/strong/text()|\
        //*[@id="easysiteText"]/p/strong/text()'
    body = "".join(response.xpath(body_xpath).extract())
    if body == "":
        item["content"] = "可能是图片 请打开详情页查看"
    else:
        item["content"] = body.replace("\u3000", "").replace("\xa0", "")
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def process_detail(self, response):
    """Yield the item for one article detail page (``web_id=74``).

    The page shows some dates without a year ("月"/"日" notation); those
    get the current year prefixed.

    :param response: downloaded detail page; ``meta`` may carry ``web``.
    """
    # Local imports: the original block used neither name.
    import re
    from datetime import date

    item = FagaiweiItem()
    item['web_id'] = 74
    item['url'] = response.url
    item['title'] = ''.join(
        response.xpath('//div[@class="newscontent_right2"]/h1/text()| \
            //div[@class="newscontent_right"]/h1/text() '
                       ).extract())
    item['web'] = response.meta.get('web')
    item['webname'] = response.xpath(
        '//div[@class="newscontent_right2"]/div[@class="content_info clearfix"]/span[1]/i[@class="zhuoze"]/a/@title | \
        //div[@class="newscontent_right2"]/div[@class="content_info clearfix"]/span[1]/i[@class="zhuoze"]/text()'
    ).extract_first(default='全景网').strip()
    stamp = ''.join(
        response.xpath(
            '//div[@class="newscontent_right2"]/div[@class="content_info clearfix"]/span[1]/time/text()|\
            //div[contains(@class,"content_info")]/span[@class="left"]/text()'
        ).extract()).strip().replace('月', '-').replace('日', '')
    # The original tested for the literal '2018' and prefixed '2018-'
    # otherwise, which corrupts any date from another year (for example
    # '2019-01-05' became '2018-2019-01-05').  Treat any four-digit run as
    # a year and prefix the current year only when there is none.
    if stamp and not re.search(r'\d{4}', stamp):
        stamp = '{}-{}'.format(date.today().year, stamp)
    item['pub_time'] = stamp
    content = '\n'.join(
        response.xpath(
            '//div[@class="newscontent_right2"]/div[@class="article_content2"]//div/p/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]//div/p/a/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]//div/p/a/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]/div[2]/div/p/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]/div[2]/p/font/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]/div[2]/div/p//strong/text()| \
            //div[@class="newscontent_right2"]/div[@class="article_content2"]/div[2]/div/p//font/text()| \
            //div[@class="article_content"]/p/text()'
        ).extract())
    if not content:
        content = '这可能是图片或者文件,打开查看!'
    item['content'] = content.replace('\u3000', '')
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=5``).

    Source name and publication time are read from the first two ``<li>``
    entries of the header list, falling back to "最高人民法院新闻" and
    ``response.meta["date"]``.

    :param response: downloaded detail page; ``meta`` must carry
        ``"title"``, ``"laiyuan"`` and ``"date"``.
    :return: the populated ``FagaiweiItem``.
    """
    meta = response.meta
    item = FagaiweiItem()
    item["url"] = response.url
    item["title"] = meta["title"]
    item["web"] = meta["laiyuan"]

    body_xpath = '\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/p/span/span/text()|\
        //*[@id="zoom"]/p/span/strong/span/text()|\
        //*[@id="zoom"]/strong/text()|\
        //*[@id="zoom"]/span/text()|\
        //*[@id="zoom"]/div/span/text()|\
        //*[@id="zoom"]/text()'
    body = "".join(response.xpath(body_xpath).extract())
    item["content"] = body.replace("\u3000", "")

    source = "".join(
        response.xpath('//*[@id="container"]/div/div[2]/ul[1]/li[1]/text()'
                       ).extract())
    if source == "":
        source = "最高人民法院新闻"
    item["webname"] = source.replace("来源:", "")

    raw_date = "".join(
        response.xpath('//*[@id="container"]/div/div[2]/ul[1]/li[2]/text()'
                       ).extract())
    # Keep what follows the last "间", minus its first character.
    published = raw_date.split("间")[-1][1:]
    item["pub_time"] = meta["date"] if published == "" else published
    item["web_id"] = 5
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=18``).

    Date, title and site come from ``response.meta``; the source name is
    read from the ``right_md_laiy`` heading with a fixed default.

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"``, ``"title"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    meta = response.meta
    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = meta["date"]
    item["title"] = meta["title"]

    source = "".join(
        response.xpath('//div[@class="right_md_laiy"]/h4/text()').extract())
    item["webname"] = "国家粮食和物资储备局门户网站 " if source == "" else source
    item["web"] = meta["laiyuan"]
    item["web_id"] = 18

    body_xpath = '//div[@class="detail-pane search-help"]/table/tr/td/p/font/text()|\
        //ul[@class="lsj_spe_list"]/li/div/text()|\
        //*[@id="UCAP-CONTENT"]/p/text()|\
        //*[@id="UCAP-CONTENT"]/p/span/span/text()|\
        //div[@class="pages_content"]/p/text()|\
        //div[@class="pages_content"]/p/a/text()|\
        //div[@class="pages_content"]/div/p/text()|\
        //*[@id="UCAP-CONTENT"]/p/span/text()|\
        //ul[@class="lsj_spe_list"]/li/div/a/text()'
    body = "".join(response.xpath(body_xpath).extract())
    if body == "":
        item["content"] = "可能是图片 请打开详情页查看"
    else:
        for blank in ("\u3000", "\xa0", "\u2002"):
            body = body.replace(blank, "")
        item["content"] = body
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def process_detail(self, response):
    """Yield the item for one 龙腾网 post (``web_id=39``).

    The content is the post body followed by the comment block, each under
    its own heading.  The publication date is the first ``YYYY-MM-DD`` in
    the ``post-param`` block, or ``''`` when there is none.

    :param response: downloaded post page; ``meta`` may carry ``web``.
    """
    item = FagaiweiItem()
    item['web_id'] = 39
    item['url'] = response.url
    item['title'] = response.css('.post-title strong::text').extract_first(
        default='')
    item['web'] = response.meta.get('web')
    item['webname'] = "龙腾网"
    news_about = response.xpath(
        'string(//div[@class="post-param"])').extract_first(default='')
    # The original called .group(1) directly and raised AttributeError
    # when the block held no date.
    stamp = re.search(r'(\d{4}-\d{2}-\d{2})', news_about)
    item['pub_time'] = stamp.group(1) if stamp else ''
    content = ''.join(
        response.xpath(
            'string(//div[@class="post-content"])').extract()).replace(
                '\xa0', '').replace('\r', '')
    comment = ''.join(
        response.xpath(
            'string(//div[@class="post-comment"])').extract()).replace(
                '\xa0', '').replace('\r', '')
    item['content'] = "正文翻译:\n" + content + "评论翻译:\n" + comment
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def process_detail(self, response):
    """Yield the item for one 观察者网 article page (``web_id=96``).

    The raw ``time1`` span text is converted by ``self.process_time``.

    :param response: downloaded article page; ``meta`` may carry ``web``.
    """
    body_xpath = '//div[contains(@class,"article-txt")]/div/p/text() | \
        //div[contains(@class,"article-txt")]/div/p/strong/text() | \
        //div[contains(@class,"article-txt")]/p/text() | \
        //div[contains(@class,"article-txt")]/p/text() | \
        //div[contains(@class,"article-txt")]/p/a/text()'

    item = FagaiweiItem()
    item['web_id'] = 96
    item['url'] = response.url
    title_nodes = response.xpath('//div[@class="article-content"]/h1/text()')
    item['title'] = title_nodes.extract_first(default='')
    item['web'] = response.meta.get('web')

    raw_time = response.xpath(
        '//div[@class="article-content"]/div[contains(@class,"user-info-box")]//span[@class="time1"]/text()'
    ).extract_first()
    item['webname'] = '观察者网'
    item['pub_time'] = self.process_time(raw_time)

    body = '\n'.join(response.xpath(body_xpath).extract())
    body = body.replace('\r\n', '').replace('\xa0', '')
    item['content'] = body if body else '这可能是图片或者文件,打开查看!'
    item["keyword"] = keyword.get_keyword(item["content"])
    yield item
def get_detail(self, response):
    """Build the item for one detail page (``web_id=57``).

    Date, title and site come from ``response.meta``.  The body text is
    collected with one large XPath union that enumerates the element
    nestings found under the ``TRS_Editor``, ``cen_main``, ``ozoom`` and
    ``zoom`` containers and under ``font[@face="Calibri"]``.  Keywords are
    taken from the page's ``xl_guanjc`` list when present, otherwise
    computed by ``keyword.get_keyword``.  The source name is read from a
    ``var docsource="..."`` assignment in the page text and defaults to
    "中国期货业协会".

    :param response: downloaded detail page; ``meta`` must carry
        ``"date"``, ``"title"`` and ``"laiyuan"``.
    :return: the populated ``FagaiweiItem``.
    """
    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = response.meta["date"]
    item["title"] = response.meta["title"]
    # Each branch selects text nodes at one specific nesting; several
    # branches are repeated, which does not change the result of a union.
    contents = "".join(
        response.xpath('\
        //div[@class="TRS_Editor"]/font/font/span/p/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/span/text()|\
        //div[@class="TRS_Editor"]/span/font/text()|\
        //div[@class="TRS_Editor"]/span/span/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/span/h2/span/text()|\
        //div[@class="TRS_Editor"]/span/p/text()|\
        //div[@class="TRS_Editor"]/span/p/span/text()|\
        //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
        //div[@class="TRS_Editor"]/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/text()|\
        //div[@class="TRS_Editor"]/p/a/text()|\
        //div[@class="TRS_Editor"]/p/b/text()|\
        //div[@class="TRS_Editor"]/p/b/span/text()|\
        //div[@class="TRS_Editor"]/p/a/span/text()|\
        //div[@class="TRS_Editor"]/p/u/span/a/text()|\
        //div[@class="TRS_Editor"]/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/p/span/text()|\
        //div[@class="TRS_Editor"]/p/span/span/text()|\
        //div[@class="TRS_Editor"]/p/strong/text()|\
        //div[@class="TRS_Editor"]/p/strong/font/text()|\
        //div[@class="TRS_Editor"]/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/text()|\
        //div[@class="TRS_Editor"]/p/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/span/font/text()|\
        //div[@class="TRS_Editor"]/p/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/span/font/text()|\
        //div[@class="TRS_Editor"]/text()|\
        //div[@class="TRS_Editor"]/a/text()|\
        //div[@class="TRS_Editor"]/font/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/a/text()|\
        //div[@class="TRS_Editor"]/div/b/text()|\
        //div[@class="TRS_Editor"]/div/span/text()|\
        //div[@class="TRS_Editor"]/div/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
        //div[@class="TRS_Editor"]/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/text()|\
        //div[@class="TRS_Editor"]/div/font/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
        //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/text()|\
        //div[@class="TRS_Editor"]/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
        //div[@class="TRS_Editor"]/div/p/a/span/text()|\
        //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
        //div[@class="cen_main"]/div/h1/text()|\
        //div[@class="cen_main"]/div/div/p/text()|\
        //div[@class="cen_main"]/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/p/span/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
        //font[@face="Calibri"]/text()|\
        //font[@face="Calibri"]/span/text()|\
        //font[@face="Calibri"]/span/span/text()|\
        //*[@id="ozoom"]/p/text()|\
        //*[@id="zoom"]/div/p/text()|\
        //*[@id="zoom"]/div/p/span/text()|\
        //*[@id="zoom"]/strong/span/p/strong/text()|\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/a/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/span/p/text()|\
        //*[@id="zoom"]/span/p/a/text()|\
        //*[@id="zoom"]/span/p/a/font/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
        //*[@id="zoom"]/span/strong/span/span/p/strong/text()'
                       ).extract())
    # Placeholder text used when none of the branches matched anything.
    if contents == "":
        contents = "可能是图片或表格 打开原网站查看"
    # Strip ideographic spaces, non-breaking spaces and tabs.
    item["content"] = contents.replace("\u3000", "").replace("\xa0", "") \
        .replace("\t\n", "").replace("\t", "")  # .replace(" ", "")
    # Prefer the keywords published on the page itself.
    item["keyword"] = "".join(
        response.xpath('//dl[@class="xl_guanjc"]/dd/text()').extract())
    if item["keyword"] == "":
        item["keyword"] = keyword.get_keyword(item["content"])
    # form_s = "".join(response.xpath('//div[@class="xilan_nengr"]/h2/text()').extract())
    # print(form_s)
    # The source name is embedded in the page text as a JavaScript variable.
    webname = "".join(
        re.findall(re.compile(r'var docsource="(.*?)";'), response.text))
    if webname == "":
        webname = "中国期货业协会"
    else:
        webname = webname
    item["webname"] = webname
    item["web"] = response.meta["laiyuan"]
    item["web_id"] = 57
    return item
def get_detail(self, response):
    """Build the item for one article detail page (stored with web_id 17).

    ``pub_time``, ``title`` and ``web`` are copied from ``response.meta``
    (keys ``date``, ``title`` and ``laiyuan``).  The source name is scraped
    from the page and defaults to "中国科学院" when nothing is found.  The
    body is the concatenation of every text node matched by the XPath union
    below, or a fixed placeholder when nothing matches.

    :param response: detail-page response carrying the meta keys above.
    :return: the populated ``FagaiweiItem``.
    """
    # Hand-enumerated union of text-node paths; the continuation whitespace
    # inside the literal sits between XPath tokens and is ignored by XPath.
    body_xpath = '\
        //div[@class="TRS_Editor"]/font/font/span/p/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/span/text()|\
        //div[@class="TRS_Editor"]/span/span/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/span/h2/span/text()|\
        //div[@class="TRS_Editor"]/span/p/text()|\
        //div[@class="TRS_Editor"]/span/p/span/text()|\
        //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
        //div[@class="TRS_Editor"]/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/text()|\
        //div[@class="TRS_Editor"]/p/a/text()|\
        //div[@class="TRS_Editor"]/p/b/text()|\
        //div[@class="TRS_Editor"]/p/b/span/text()|\
        //div[@class="TRS_Editor"]/p/a/span/text()|\
        //div[@class="TRS_Editor"]/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/p/span/text()|\
        //div[@class="TRS_Editor"]/p/span/span/text()|\
        //div[@class="TRS_Editor"]/p/strong/text()|\
        //div[@class="TRS_Editor"]/p/strong/font/text()|\
        //div[@class="TRS_Editor"]/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/text()|\
        //div[@class="TRS_Editor"]/p/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/span/font/text()|\
        //div[@class="TRS_Editor"]/p/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/span/font/text()|\
        //div[@class="TRS_Editor"]/text()|\
        //div[@class="TRS_Editor"]/a/text()|\
        //div[@class="TRS_Editor"]/font/text()|\
        //div[@class="TRS_Editor"]/div/a/text()|\
        //div[@class="TRS_Editor"]/div/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/text()|\
        //div[@class="TRS_Editor"]/div/font/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
        //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/p/a/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/div/div/p/text()|\
        //font[@face="Calibri"]/text()|\
        //font[@face="Calibri"]/span/text()|\
        //font[@face="Calibri"]/span/span/text()|\
        //*[@id="p-detail"]/p/text()|\
        //*[@id="p-detail"]/p/font/text()|\
        //*[@id="p-detail"]/p/font/strong/text()|\
        //*[@id="p-detail"]/p/font/span/text()|\
        //*[@id="p-detail"]/div/p/text()|\
        //*[@id="ozoom"]/p/text()|\
        //*[@id="zoom"]/div/p/text()|\
        //*[@id="zoom"]/div/p/span/text()|\
        //*[@id="zoom"]/strong/span/p/strong/text()|\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/a/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/span/p/text()|\
        //*[@id="zoom"]/span/p/a/text()|\
        //*[@id="zoom"]/span/p/a/font/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
        //*[@id="zoom"]/span/strong/span/span/p/strong/text()'

    item = FagaiweiItem()
    item["url"] = response.url
    item["pub_time"] = response.meta["date"]
    item["title"] = response.meta["title"]

    # Source label: keep only the part before the first space and drop
    # every "一" character from it.
    source_name = "".join(
        response.xpath(
            '//*[@id="source"]/span/text()|//span[@class="aticle-src"]/text()'
        ).extract()
    ).split(" ")[0].replace("一", "")
    if source_name:
        # Delete carriage returns, newlines and spaces in one pass.
        item["webname"] = source_name.translate(str.maketrans("", "", "\r\n "))
    else:
        item["webname"] = "中国科学院"

    item["web"] = response.meta["laiyuan"]
    item["web_id"] = 17

    body_text = "".join(response.xpath(body_xpath).extract())
    if not body_text:
        # No text node matched; store the placeholder instead.
        body_text = "可能是图片或表格 打开原网站查看"
    # Strip ideographic spaces, non-breaking spaces and zero-width spaces.
    item["content"] = body_text.translate(
        str.maketrans("", "", "\u3000\xa0\u200b"))
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def get_detail(self, response):
    """Build the item for one 华尔街见闻 article page (stored with web_id 46).

    The publish time is the first line of the heading span text, or the
    current local time when that is empty.  ``title`` and ``web`` come from
    ``response.meta`` (keys ``title`` and ``laiyuan``).  The body is taken
    from the page; when the page yields no text, the article is requested
    from the JSON API using the last path segment of the URL as article id.
    If that request fails or yields no text, a placeholder is stored.

    :param response: detail-page response carrying the meta keys above.
    :return: the populated ``FagaiweiItem``, or ``None`` when the response
        URL equals the listing URL stored in ``meta["laiyuan"]``.
    """
    import logging  # local import: only needed on the API-failure path

    # Seconds to wait for the API; without a timeout ``requests.get`` can
    # block this callback indefinitely.
    api_timeout = 10

    item = FagaiweiItem()
    item["url"] = response.url
    date = "".join(
        response.xpath(
            '//div[@class="article__heading"]/div/div/span/text()'
        ).extract())
    pub_time = date.split("\n")[0]
    if pub_time:
        item["pub_time"] = pub_time
    else:
        item["pub_time"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time()))
    item["title"] = response.meta["title"]
    item["webname"] = "华尔街见闻"
    item["web"] = response.meta["laiyuan"]
    item["web_id"] = 46
    contents = "".join(
        response.xpath('//div[@class="article__content"]/div/div/text()|\
            //div[@class="article__content"]/div/div/p/text()|\
            //div[@class="article__content"]/div/div/p/strong/text()|\
            //div[@class="article__content"]/div/div/h2/text()|\
            //div[@class="article__content"]/div/div/h2/strong/text()|\
            //div[@class="article__content"]/div/p/text()|\
            //div[@class="article__content"]/div/p/strong/text()|\
            //div[@class="article__content"]/div/h2/text()|\
            //div[@class="pa-main__content preview"]/p/text()|\
            //div[@class="pa-main__content preview"]/p/strong/text()|\
            //div[@class="pa-main__content"]/p/text()|\
            //div[@class="pa-main__content"]/p/span/text()|\
            //div[@class="pa-main__content"]/p/span/span/text()|\
            //div[@class="pa-main__content"]/p/span/strong/text()|\
            //div[@class="pa-main__content"]/p/span/strong/span/text()|\
            //div[@class="article__content"]/div/div/h2/p/text()'
                       ).extract())
    if contents != "":
        item["content"] = contents.replace("\u3000", "").replace("\xa0", "")
    else:
        # The page itself produced no text: ask the JSON API for the article.
        article_id = response.url.split('/')[-1]
        api_url = "https://api-prod.wallstreetcn.com/apiv1/content/articles/{}?extract=0".format(
            article_id)
        api_text = ""
        try:
            res = requests.get(api_url, headers=DEFAULT_REQUEST_HEADERS,
                               timeout=api_timeout)
            api_html = res.json()["data"]["content"]
            # Guard against an empty payload before handing it to lxml.
            tree = etree.HTML(api_html) if api_html else None
            if tree is not None:
                api_text = "".join(
                    tree.xpath('//p/text()|//span/text()|//strong/text()'))
        except (requests.RequestException, etree.LxmlError,
                ValueError, KeyError, TypeError) as exc:
            # Network error, non-JSON body, unexpected JSON shape or
            # unparsable HTML: fall through to the placeholder below.
            logging.getLogger(__name__).warning(
                "article API fallback failed for %s: %s", api_url, exc)
        item["content"] = api_text or "可能是图片 请打开详情页查看"
    item["keyword"] = keyword.get_keyword(item["content"])
    if item["url"] != item["web"]:
        return item
    # The response is the listing page itself: emit nothing.
    return None
def get_detail(self, response):
    """Build the item for one article detail page (stored with web_id 8).

    ``title``, ``pub_time`` and ``web`` are copied from ``response.meta``
    (keys ``title``, ``date`` and ``laiyuan``).  The body is the
    concatenation of every text node matched by the XPath union below, or a
    fixed placeholder when nothing matches.  The source name is whatever
    sits between "来源:" and "分享" in the ``td.Gray12`` text, defaulting to
    "中华人民共和国自然资源部".

    :param response: detail-page response carrying the meta keys above.
    :return: the populated ``FagaiweiItem``.
    """
    # Hand-enumerated union of text-node paths; the continuation whitespace
    # inside the literal sits between XPath tokens and is ignored by XPath.
    body_xpath = '\
        //div[@class="TRS_Editor"]/font/font/span/p/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/span/text()|\
        //div[@class="TRS_Editor"]/span/font/text()|\
        //div[@class="TRS_Editor"]/span/span/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/span/h2/span/text()|\
        //div[@class="TRS_Editor"]/span/p/text()|\
        //div[@class="TRS_Editor"]/span/p/span/text()|\
        //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
        //div[@class="TRS_Editor"]/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/text()|\
        //div[@class="TRS_Editor"]/p/a/text()|\
        //div[@class="TRS_Editor"]/p/b/text()|\
        //div[@class="TRS_Editor"]/p/b/span/text()|\
        //div[@class="TRS_Editor"]/p/a/span/text()|\
        //div[@class="TRS_Editor"]/p/u/span/a/text()|\
        //div[@class="TRS_Editor"]/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/p/span/text()|\
        //div[@class="TRS_Editor"]/p/span/span/text()|\
        //div[@class="TRS_Editor"]/p/strong/text()|\
        //div[@class="TRS_Editor"]/p/strong/font/text()|\
        //div[@class="TRS_Editor"]/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/text()|\
        //div[@class="TRS_Editor"]/p/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/span/font/text()|\
        //div[@class="TRS_Editor"]/p/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/span/font/text()|\
        //div[@class="TRS_Editor"]/text()|\
        //div[@class="TRS_Editor"]/a/text()|\
        //div[@class="TRS_Editor"]/font/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/a/text()|\
        //div[@class="TRS_Editor"]/div/b/text()|\
        //div[@class="TRS_Editor"]/div/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
        //div[@class="TRS_Editor"]/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/text()|\
        //div[@class="TRS_Editor"]/div/font/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
        //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/text()|\
        //div[@class="TRS_Editor"]/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
        //div[@class="TRS_Editor"]/div/p/a/span/text()|\
        //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
        //div[@class="cen_main"]/div/h1/text()|\
        //div[@class="cen_main"]/div/div/p/text()|\
        //div[@class="cen_main"]/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/p/span/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
        //font[@face="Calibri"]/text()|\
        //font[@face="Calibri"]/span/text()|\
        //font[@face="Calibri"]/span/span/text()|\
        //*[@id="ozoom"]/p/text()|\
        //*[@id="zoom"]/div/p/text()|\
        //*[@id="zoom"]/div/p/span/text()|\
        //*[@id="zoom"]/strong/span/p/strong/text()|\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/a/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/span/p/text()|\
        //*[@id="zoom"]/span/p/a/text()|\
        //*[@id="zoom"]/span/p/a/font/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
        //*[@id="zoom"]/span/strong/span/span/p/strong/text()'

    item = FagaiweiItem()
    item["url"] = response.url
    item["title"] = response.meta["title"]
    item["pub_time"] = response.meta["date"]

    body_text = "".join(response.xpath(body_xpath).extract())
    if not body_text:
        # No text node matched; store the placeholder instead.
        body_text = "可能是图片或表格 打开原网站查看"
    # Strip ideographic spaces and non-breaking spaces.
    item["content"] = body_text.translate(str.maketrans("", "", "\u3000\xa0"))

    # Source label: delete newlines, carriage returns, tabs and spaces from
    # the grey footer text, then take what lies between the two markers.
    footer = "".join(response.xpath('//td[@class="Gray12"]/text()').extract())
    footer = footer.translate(str.maketrans("", "", "\n\r\t "))
    source_name = "".join(re.findall(r'来源:(.*?)分享', footer))
    item["webname"] = source_name or "中华人民共和国自然资源部"

    item["web"] = response.meta["laiyuan"]
    item["web_id"] = 8
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def parse(self, response):
    """Parse an nbd.com.cn page and yield new items or detail requests.

    Two page layouts are handled:

    * ``http://live.nbd.com.cn/``: the first ten ``ul.live-list`` entries
      are turned directly into ``FagaiweiItem`` objects (web_id 26).
    * any other URL: every list entry is turned into a ``scrapy.Request``
      for its link, handled by ``self.get_detail`` with the parsed date,
      the title and the listing URL passed along in ``meta``.

    In both layouts an entry is skipped when ``NewsItemInfo`` already holds
    a row with the same URL and web_id 26.  A timestamp that cannot be
    parsed is replaced by the current local time as a formatted string.

    :param response: the listing-page response.
    :return: generator of ``FagaiweiItem`` or ``scrapy.Request`` objects.
    """
    if response.url == "http://live.nbd.com.cn/":
        for message in response.xpath('//ul[@class="live-list"]/li')[:10]:
            item = FagaiweiItem()
            date = "".join(message.xpath('div[1]/p/span/text()').extract())
            content = "".join(message.xpath('div[2]/a/text()').extract())
            href = "".join(message.xpath('div[2]/a/@href').extract())
            # The entry text is prefixed with today's date before parsing.
            days = time.strftime('%Y-%m-%d', time.localtime(time.time()))
            date = days + " " + date
            try:
                dates = datetime.datetime.strptime(str(date),
                                                   '%Y-%m-%d %H:%M:%S')
            except Exception:
                # Was a bare ``except:``, which also trapped
                # KeyboardInterrupt/SystemExit; now matches the handler
                # used by the listing branch below.
                dates = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
            result = session.query(NewsItemInfo).filter_by(
                url=href, web_id=26).count()
            if result:
                continue  # already stored
            item["url"] = href
            item["pub_time"] = dates
            item["title"] = content[:30]
            item["webname"] = "每经网"
            item["web"] = response.url
            item["web_id"] = 26
            item["content"] = content
            item["keyword"] = keyword.get_keyword(item["content"])
            yield item
    else:
        message_list = response.xpath('//ul[@class="m-columnnews-list"]/li|\
            //ul[@class="mt-ul"]/li|\
            //ul[@class="u-news-list"]/li')
        # This path starts with ``//`` and is therefore evaluated against
        # the whole document, not the current entry, so the result is the
        # same for every entry: compute it once instead of per iteration.
        date_1 = "".join(
            response.xpath('//p[@class="u-channeltime"]/text()').extract())
        date_1 = date_1.replace(" ", "").replace("\n", "").replace(
            "\t", "").replace("\r", "")
        for message in message_list:
            date = "".join(message.xpath(
                'div/div/p/span[2]/text()|div/p[2]/text()|span/text()'
            ).extract())
            title = "".join(message.xpath(
                'div/div/a[1]/text()|div/a/text()|a/text()').extract())
            # Try the link locations from most to least nested.
            href = "".join(message.xpath('div/div/a/@href').extract())
            if not href:
                href = "".join(message.xpath('div/a/@href').extract())
            if not href:
                href = "".join(message.xpath('a/@href').extract())
            if not href:
                # scrapy.Request raises ValueError for a URL without a
                # scheme, which would abort this generator and drop every
                # remaining entry; skip the entry instead.
                continue
            date = date.replace(" ", "").replace("\n", "").replace(
                "\t", "").replace("\r", "")
            date = date_1 + " " + date
            try:
                # NOTE(review): this replace is a no-op as written; it may
                # originally have normalised a different dash character.
                date = datetime.datetime.strptime(
                    str(date).replace('-', '-'), '%Y-%m-%d %H:%M:%S')
            except Exception:
                date = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            url = href
            result = session.query(NewsItemInfo).filter_by(
                url=url.replace("#", ""), web_id=26).count()
            if result:
                continue  # already stored
            yield scrapy.Request(url=url, callback=self.get_detail,
                                 meta={"date": date, "title": title,
                                       "laiyuan": response.url})
def get_detail_cma(self, response):
    """Build the item for one article detail page (stored with web_id 3).

    ``title`` and ``web`` are copied from ``response.meta`` (keys ``title``
    and ``laiyuan``).  The body is the concatenation of every text node
    matched by the XPath union below, or a fixed placeholder when nothing
    matches.  The source name and publish date are read from the first and
    second span of ``div.news_textspan``; a date that cannot be parsed is
    replaced by the current local time as a formatted string.

    :param response: detail-page response carrying the meta keys above.
    :return: the populated ``FagaiweiItem``.
    """
    # Hand-enumerated union of text-node paths; the continuation whitespace
    # inside the literal sits between XPath tokens and is ignored by XPath.
    body_xpath = '\
        //div[@class="TRS_Editor"]/font/font/span/p/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/span/text()|\
        //div[@class="TRS_Editor"]/span/font/text()|\
        //div[@class="TRS_Editor"]/span/span/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
        //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/span/h2/span/text()|\
        //div[@class="TRS_Editor"]/span/p/text()|\
        //div[@class="TRS_Editor"]/span/p/span/text()|\
        //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
        //div[@class="TRS_Editor"]/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
        //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/text()|\
        //div[@class="TRS_Editor"]/p/a/text()|\
        //div[@class="TRS_Editor"]/p/b/text()|\
        //div[@class="TRS_Editor"]/p/b/span/text()|\
        //div[@class="TRS_Editor"]/p/a/span/text()|\
        //div[@class="TRS_Editor"]/p/u/span/a/text()|\
        //div[@class="TRS_Editor"]/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/p/span/text()|\
        //div[@class="TRS_Editor"]/p/span/span/text()|\
        //div[@class="TRS_Editor"]/p/strong/text()|\
        //div[@class="TRS_Editor"]/p/strong/font/text()|\
        //div[@class="TRS_Editor"]/p/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/text()|\
        //div[@class="TRS_Editor"]/p/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/span/font/text()|\
        //div[@class="TRS_Editor"]/p/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/text()|\
        //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
        //div[@class="TRS_Editor"]/p/span/font/text()|\
        //div[@class="TRS_Editor"]/text()|\
        //div[@class="TRS_Editor"]/a/text()|\
        //div[@class="TRS_Editor"]/font/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/a/text()|\
        //div[@class="TRS_Editor"]/div/b/text()|\
        //div[@class="TRS_Editor"]/div/span/text()|\
        //div[@class="TRS_Editor"]/div/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
        //div[@class="TRS_Editor"]/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/text()|\
        //div[@class="TRS_Editor"]/div/font/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
        //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/text()|\
        //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/text()|\
        //div[@class="TRS_Editor"]/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
        //div[@class="TRS_Editor"]/div/p/a/span/text()|\
        //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/a/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
        //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
        //div[@class="TRS_Editor"]/div/text()|\
        //div[@class="TRS_Editor"]/div/div/text()|\
        //div[@class="TRS_Editor"]/div/div/p/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/span/text()|\
        //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
        //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
        //div[@class="cen_main"]/div/h1/text()|\
        //div[@class="cen_main"]/div/div/p/text()|\
        //div[@class="cen_main"]/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/p/span/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/text()|\
        //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
        //font[@face="Calibri"]/text()|\
        //font[@face="Calibri"]/span/text()|\
        //font[@face="Calibri"]/span/span/text()|\
        //*[@id="ozoom"]/p/text()|\
        //*[@id="zoom"]/div/p/text()|\
        //*[@id="zoom"]/div/p/span/text()|\
        //*[@id="zoom"]/strong/span/p/strong/text()|\
        //*[@id="zoom"]/p/text()|\
        //*[@id="zoom"]/p/a/text()|\
        //*[@id="zoom"]/p/strong/text()|\
        //*[@id="zoom"]/p/span/text()|\
        //*[@id="zoom"]/span/p/text()|\
        //*[@id="zoom"]/span/p/a/text()|\
        //*[@id="zoom"]/span/p/a/font/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
        //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
        //*[@id="zoom"]/span/strong/span/span/p/strong/text()'

    item = FagaiweiItem()
    item["url"] = response.url
    item["title"] = response.meta["title"]

    body_text = "".join(response.xpath(body_xpath).extract())
    if not body_text:
        # No text node matched; store the placeholder instead.
        body_text = "可能是图片或表格 打开原网站查看"
    # Order matters: ideographic and non-breaking spaces are deleted first,
    # then tab+newline pairs, then any remaining tabs.
    body_text = body_text.translate(str.maketrans("", "", "\u3000\xa0"))
    item["content"] = body_text.replace("\t\n", "").replace("\t", "")

    source_name = "".join(
        response.xpath(
            '//div[@class="news_textspan"]/div/span[1]/text()').extract())
    item["webname"] = source_name.replace("来源:", "")
    item["web"] = response.meta["laiyuan"]

    raw_date = "".join(
        response.xpath(
            '//div[@class="news_textspan"]/div/span[2]/text()').extract())
    # Drop the last five characters and the label, then turn the
    # year/month/day markers (and any slashes) into dashes.
    raw_date = raw_date[:-5].replace("发布时间:", "")
    raw_date = raw_date.replace("年", "-").replace("月", "-").replace("日", "")
    try:
        pub_time = datetime.datetime.strptime(
            raw_date.replace('/', '-'), '%Y-%m-%d')
    except Exception:
        # Any parsing failure falls back to "now" as a formatted string.
        pub_time = time.strftime('%Y-%m-%d %H:%M:%S')
    item["pub_time"] = pub_time

    item["web_id"] = 3
    item["keyword"] = keyword.get_keyword(item["content"])
    return item
def parse_fagaiwei(response, item):
    """Build a ``FagaiweiItem`` (``web_id`` 2) from an article detail response.

    The article body is collected by evaluating a union of explicit XPath
    text-node paths and concatenating the matches.  When nothing matches,
    a placeholder message is stored as the content instead.

    Args:
        response: Response for the article page.  ``response.meta`` must
            contain the keys ``"date"``, ``"title"`` and ``"laiyuan"``.
        item: Ignored.  A fresh ``FagaiweiItem`` is always created; the
            parameter is kept only so existing callers keep working.

    Yields:
        The populated ``FagaiweiItem``.

    Raises:
        KeyError: If one of the required ``response.meta`` keys is missing.
    """
    item = FagaiweiItem()
    item["url"] = response.url

    # Every location where body text has been seen on the target pages.
    # The paths are OR-ed into a single XPath union, so matches come back
    # de-duplicated.  The expression must be a plain union of paths: a
    # leading operator such as a stray "-" would make XPath treat the whole
    # thing as a numeric expression and no text would ever be extracted.
    content_paths = (
        '//div[@class="TRS_Editor"]/font/font/span/p/text()',
        '//div[@class="TRS_Editor"]/font/font/span/p/span/text()',
        '//div[@class="TRS_Editor"]/font/font/span/p/span/span/text()',
        '//div[@class="TRS_Editor"]/span/span/p/font/text()',
        '//div[@class="TRS_Editor"]/span/span/p/font/span/text()',
        '//div[@class="TRS_Editor"]/span/span/p/font/span/span/text()',
        '//div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/font/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/span/font/text()',
        '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()',
        '//div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/span/text()',
        '//div[@class="TRS_Editor"]/span/span/text()',
        '//div[@class="TRS_Editor"]/span/font/p/span/font/text()',
        '//div[@class="TRS_Editor"]/span/font/p/span/font/span/text()',
        '//div[@class="TRS_Editor"]/span/h2/span/text()',
        '//div[@class="TRS_Editor"]/span/p/text()',
        '//div[@class="TRS_Editor"]/span/p/span/text()',
        '//div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()',
        '//div[@class="TRS_Editor"]/span/p/span/span/text()',
        '//div[@class="TRS_Editor"]/span/p/span/font/text()',
        '//div[@class="TRS_Editor"]/div/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/span/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/span/p/span/font/text()',
        '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/b/span/span/p/span/text()',
        '//div[@class="TRS_Editor"]/b/span/span/p/b/span/text()',
        '//div[@class="TRS_Editor"]/strong/font/p/text()',
        '//div[@class="TRS_Editor"]/strong/font/p/a/text()',
        '//div[@class="TRS_Editor"]/strong/font/p/a/font/text()',
        '//div[@class="TRS_Editor"]/p/text()',
        '//div[@class="TRS_Editor"]/p/a/text()',
        '//div[@class="TRS_Editor"]/p/b/text()',
        '//div[@class="TRS_Editor"]/p/b/span/text()',
        '//div[@class="TRS_Editor"]/p/a/span/text()',
        '//div[@class="TRS_Editor"]/p/span/a/span/text()',
        '//div[@class="TRS_Editor"]/p/span/text()',
        '//div[@class="TRS_Editor"]/p/span/span/text()',
        '//div[@class="TRS_Editor"]/p/strong/text()',
        '//div[@class="TRS_Editor"]/p/strong/font/text()',
        '//div[@class="TRS_Editor"]/p/a/font/text()',
        '//div[@class="TRS_Editor"]/p/font/text()',
        '//div[@class="TRS_Editor"]/p/font/a/text()',
        '//div[@class="TRS_Editor"]/p/font/span/text()',
        '//div[@class="TRS_Editor"]/p/font/span/font/text()',
        '//div[@class="TRS_Editor"]/p/font/strong/text()',
        '//div[@class="TRS_Editor"]/p/font/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/a/text()',
        '//div[@class="TRS_Editor"]/p/font/font/a/text()',
        '//div[@class="TRS_Editor"]/p/font/font/a/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/a/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/font/font/font/text()',
        '//div[@class="TRS_Editor"]/p/font/font/span/text()',
        '//div[@class="TRS_Editor"]/p/font/font/strong/text()',
        '//div[@class="TRS_Editor"]/p/span/font/text()',
        '//div[@class="TRS_Editor"]/text()',
        '//div[@class="TRS_Editor"]/a/text()',
        '//div[@class="TRS_Editor"]/font/text()',
        '//div[@class="TRS_Editor"]/div/a/text()',
        '//div[@class="TRS_Editor"]/div/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/font/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/font/text()',
        '//div[@class="TRS_Editor"]/div/font/span/text()',
        '//div[@class="TRS_Editor"]/div/font/span/sup/text()',
        '//div[@class="TRS_Editor"]/div/font/sup/span/text()',
        '//div[@class="TRS_Editor"]/div/font/p/span/text()',
        '//div[@class="TRS_Editor"]/div/font/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/font/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/font/span/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/p/b/span/text()',
        '//div[@class="TRS_Editor"]/div/p/b/span/span/text()',
        '//div[@class="TRS_Editor"]/div/p/text()',
        '//div[@class="TRS_Editor"]/div/p/font/span/text()',
        '//div[@class="TRS_Editor"]/div/p/font/span/span/text()',
        '//div[@class="TRS_Editor"]/div/p/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/p/font/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/p/font/font/span/span/text()',
        '//div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()',
        '//div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()',
        '//div[@class="TRS_Editor"]/div/p/span/text()',
        '//div[@class="TRS_Editor"]/div/p/a/span/text()',
        '//div[@class="TRS_Editor"]/div/p/span/a/text()',
        '//div[@class="TRS_Editor"]/div/p/span/font/text()',
        '//div[@class="TRS_Editor"]/div/p/span/font/span/text()',
        '//div[@class="TRS_Editor"]/div/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/p/span/span/span/text()',
        '//div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()',
        '//div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()',
        '//div[@class="TRS_Editor"]/div/text()',
        '//div[@class="TRS_Editor"]/div/div/text()',
        '//div[@class="TRS_Editor"]/div/div/p/text()',
        '//div[@class="TRS_Editor"]/div/div/p/span/text()',
        '//div[@class="TRS_Editor"]/div/div/p/span/span/text()',
        '//div[@class="TRS_Editor"]/div/div/p/span/font/text()',
        '//div[@class="TRS_Editor"]/div/div/p/font/span/text()',
        '//div[@class="TRS_Editor"]/div/div/p/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/div/span/text()',
        '//div[@class="TRS_Editor"]/div/div/span/span/text()',
        '//div[@class="TRS_Editor"]/div/div/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/div/div/p/span/text()',
        '//div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/div/div/span/p/text()',
        '//div[@class="TRS_Editor"]/div/div/div/span/p/span/text()',
        '//div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()',
        '//div[@class="TRS_Editor"]/div/div/div/div/span/p/text()',
        '//font[@face="Calibri"]/text()',
        '//font[@face="Calibri"]/span/text()',
        '//font[@face="Calibri"]/span/span/text()',
        '//div[@class="txt1"]/text()',
        '//div[@class="txt1"]/a/text()',
        '//div[@class="txt1"]/a/font/text()',
        '//*[@id="ozoom"]/p/text()',
        '//*[@id="zoom"]/div/p/text()',
        '//*[@id="zoom"]/div/p/span/text()',
        '//*[@id="zoom"]/strong/span/p/strong/text()',
        '//*[@id="zoom"]/p/text()',
        '//*[@id="zoom"]/p/a/text()',
        '//*[@id="zoom"]/p/strong/text()',
        '//*[@id="zoom"]/p/span/text()',
        '//*[@id="zoom"]/span/p/text()',
        '//*[@id="zoom"]/span/p/a/text()',
        '//*[@id="zoom"]/span/p/a/font/text()',
        '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()',
        '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()',
        '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()',
        '//*[@id="zoom"]/span/strong/span/span/p/strong/text()',
    )
    contents = "".join(response.xpath("|".join(content_paths)).extract())
    if not contents:
        # No text node matched any known layout; store a placeholder
        # instead of an empty body.
        contents = "可能是图片或表格 打开原网站查看"
    item["content"] = contents

    # Publication date and title are handed over by the requesting callback.
    item["pub_time"] = response.meta["date"]
    item["title"] = response.meta["title"]

    # Prefer the source name/link shown on the page; when the page shows no
    # source name, fall back to the default name and the URL from meta.
    from_s = "".join(
        response.xpath('//*[@id="dSourceText"]/a/text()').extract())
    from_s_url = "".join(
        response.xpath('//*[@id="dSourceText"]/a/@href').extract())
    if from_s == "":
        webname = "发改委"
        depart_url = response.meta["laiyuan"]
    else:
        webname = from_s
        depart_url = from_s_url
    item["webname"] = webname
    item["web"] = depart_url

    item["keyword"] = keyword.get_keyword(item["content"])
    item["web_id"] = 2
    yield item