def parse_item(self, response, url):
    """Parse a news detail page into a populated News item.

    Each field is extracted best-effort: if the page layout changed or an
    element is missing, the field falls back to '未知' ("unknown") so a
    single malformed page never aborts the crawl.

    :param response: scrapy/parsel-style response supporting ``.xpath()``
    :param url: the page URL, stored on the item unchanged
    :return: a News item with title, date, content, url, spider_name set
    """
    try:
        title = response.xpath('//h2[@class="titl"]/text()')[0].strip()
    except Exception:
        # deliberate broad catch: best-effort field extraction
        title = '未知'
    try:
        # first whitespace-separated token of the span text is the raw date;
        # arrow normalizes it, and the 'T' split keeps only YYYY-MM-DD
        date = response.xpath('//p[@class="Wh"]/span[1]/text()')[0].strip().split()[0]
        date = str(arrow.get(date)).split('T')[0]
    except Exception:
        date = '未知'
    try:
        con_list = response.xpath('//div[@class="detailCont"]/p')
        # NOTE(review): 'pasre_content' looks like a typo of 'parse_content',
        # but it is an external helper on this class — name kept as-is.
        content = self.pasre_content(con_list)
    except Exception:
        content = '未知'
    item = News()
    item.title = title
    item.date = date
    item.content = content
    item.url = url
    item.spider_name = 'jingji'
    return item
def parser_item(self, item):
    """Build a News record from an ``<a>`` element on the AMAC list page
    and append it to ``self.newslist``."""
    href = item.xpath('./@href')[0]
    entry = News()
    entry.spider_name = 'amac'
    entry.url = self.parser_url(href, 'http://www.amac.org.cn')
    entry.title = item.xpath('./text()')[0]
    self.newslist.append(entry)
def parser_item(self, item):
    """Build a News record from a MOHURD list entry and append it to
    ``self.newslist``.

    The publish date lives in the sibling following the link's parent,
    bracketed and dot-separated — brackets are sliced off and dots
    become dashes.
    """
    entry = News()
    entry.spider_name = 'mohurd'
    entry.url = item.xpath('./@href')[0]
    entry.title = item.xpath('./text()')[0]
    raw_date = item.getparent().getnext().xpath('./text()')[0]
    entry.date = raw_date[1:-1].replace('.', '-').strip()
    self.newslist.append(entry)
def toutiao_news_api(url):
    """Fetch one Toutiao feed page and convert its entries to News objects.

    Side effects: stores the decoded JSON response in the module-level
    ``data`` and the pagination cursor in ``max_behot_time`` — callers
    elsewhere read these globals to build the next request
    (TODO: refactor into return values).

    :param url: full Toutiao feed API URL to fetch
    :return: list of News objects; ads and media-account promos excluded
    """
    # Abuyun dynamic proxy tunnel (credentials redacted in source)
    proxy_host = "http-dyn.abuyun.com"
    proxy_port = "9020"
    proxy_user = "******"
    proxy_pass = "******"
    proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxy_host,
        "port": proxy_port,
        "user": proxy_user,
        "pass": proxy_pass,
    }
    proxies = {
        "http": proxy_meta,
        "https": proxy_meta,
    }
    ua = UserAgent(verify_ssl=False)
    headers = {
        'cookie': 'tt_webid=6825236887406953998; s_v_web_id=verify_ka17kc91_J51hfIgB_1Ujy_4F87_AQ77_v44SCeaZdYbb; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=ftj73c94a1589124278466; tt_webid=6825236887406953998; csrftoken=3bc73a541ff3c196706a5fa652baa10a; ttcid=93c87bb6d2c44204a824c060f2a0344b39; SLARDAR_WEB_ID=167cd898-158d-4682-84b7-515f808f9c49; tt_scid=nvrgh8BUDb5bfXypX.EbNgFcMiVjrSr7vdwnPAab2w2tEn2I8DLcdmqRb2aAGGvT6b9b',
        'user-agent': ua.random,
        'x-requested-with': 'XMLHttpRequest'
    }
    toutiao_data = requests.get(url, headers=headers, proxies=proxies).text
    global data
    data = json.loads(toutiao_data)
    global max_behot_time
    max_behot_time = data['next']['max_behot_time']
    news_list = []
    link_head = 'http://toutiao.com'
    for n in data['data']:
        # skip untitled rows, ads, and media-account promos
        if 'title' not in n or n['tag'] in ('ad', 'news_media'):
            continue
        news = News()
        news.title = n['title']
        print(news.title)
        news.tag = n['tag']
        news.source = n['source']
        # behot_time is a unix timestamp; render as local "YYYY-MM-DD HH:MM:SS"
        time_local = time.localtime(n['behot_time'])
        news.news_date = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        print(news.news_date)
        news.source_url = link_head + n['source_url']
        news_list.append(news)
    return news_list
def parser_item(self, item):
    """Build a News record from one row of the CSRC announcement table
    and append it to ``self.newslist``."""
    link = item.xpath('./li[@class="mc"]/div/a/@href')[0]
    raw_date = item.xpath('./li[@class="fbrq"]/text()')[0]
    entry = News()
    entry.spider_name = 'csrc'
    entry.url = self.parser_url(link, 'http://www.csrc.gov.cn/pub/zjhpublic')
    entry.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
    # normalize whatever date format the site uses to YYYY-MM-DD
    entry.date = arrow.get(raw_date).format('YYYY-MM-DD')
    self.newslist.append(entry)
def parser_item(self, item):
    """Build a News record from a gov.cn list entry and append it to
    ``self.newslist``.

    NOTE(review): spider_name is 'circ' even though the base URL is
    gov.cn — kept as-is to preserve behavior; confirm intent.
    """
    link = item.xpath('./a/@href')[0]
    entry = News()
    entry.spider_name = 'circ'
    entry.url = self.parser_url(link, 'http://www.gov.cn')
    entry.title = item.xpath('./a/text()')[0]
    entry.date = item.xpath('./span/text()')[0]
    self.newslist.append(entry)
def parser_item(self, item):
    """Build a News record from a CIRC list entry and append it to
    ``self.newslist``; entries linking to search pages are skipped."""
    link = item.xpath('./a/@href')[0]
    if 'search' in link:
        return
    entry = News()
    entry.spider_name = 'circ'
    entry.url = self.parser_url(link, 'http://www.circ.gov.cn')
    entry.title = item.xpath('./a/text()')[0]
    # the following sibling's text carries the date wrapped in one
    # character on each side — slice them off
    entry.date = item.getnext().xpath('./text()')[0][1:-1]
    self.newslist.append(entry)
def get_html(self, url):
    """Fetch a CBRC list page and append each linked article to
    ``self.newslist``.

    :param url: list-page URL to scrape
    :return: result of ``self.parser_url(self.newslist)``
             (NOTE(review): other call sites pass ``(url, base)`` to
             parser_url — confirm this single-argument usage is intended)
    """
    resp = requests.get(url, headers=self.get_news_header())
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    for anchor in tree.xpath('//a[@class="STYLE8"]'):
        entry = News()
        entry.spider_name = 'cbrc'
        entry.url = anchor.xpath('./@href')[0]
        entry.title = anchor.xpath('./@title')[0]
        # date text sits in the sibling following the anchor's parent cell
        entry.date = anchor.getparent().getnext().xpath('./text()')[0].strip()
        self.newslist.append(entry)
    return self.parser_url(self.newslist)
def keyword_search(keyword):
    """Search Toutiao for *keyword* and return News objects not yet stored.

    Deduplicates against source URLs already in the database, as returned
    by ``select_source_url_returnset()``.

    :param keyword: search term (e.g. a "两会" topic keyword)
    :return: list of newly-seen News objects
    """
    known_urls = select_source_url_returnset()
    # bug fix: the original literal was '&keyword= ' (stray space), which
    # sent a leading-space search term to the API
    url = ('http://www.toutiao.com/search_content/?offset=0&format=json'
           '&keyword=' + keyword + '&autoload=true&count=200&cur_tab=1')
    toutiao_data = requests.get(url).text
    data = json.loads(toutiao_data)
    news_list = []
    link_head = 'http://toutiao.com'
    for n in data['data']:
        if 'title' not in n:
            continue
        news = News()
        news.title = n['title']
        news.tag = n['tag']
        news.source = n['source']
        news.source_url = link_head + n['source_url']
        # 两会关键词: the keyword we searched with
        news.keyword = keyword
        # 今日头条自带关键词: Toutiao's own keywords for the article
        news.keywords = n['keywords']
        # skip records whose source_url already exists in the database
        if news.source_url in known_urls:
            print('数据库已有该记录!')
            continue
        print('新添加记录:', news.title)
        news_list.append(news)
    return news_list