def get_newsinfo(self, url):
    '''
    Request a single news detail page.
    :param url: detail page URL
    :return: News object, or 'timeout' / 'error' on failure
    '''
    t_sleep()
    log('Current URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if html.status_code != 200:
        log_line('Status code is not 200')
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)

    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'bjjrj'
    return news

def get_iteminfo(self, url):
    '''
    Visit a single news detail page.
    :param url: news link
    :return: News model, or 'timeout' / 'error' on failure
    '''
    t_sleep()
    log('Current URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'

    if html.status_code != 200:
        log('Bad response for URL!!!', url)
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)

    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'xinhua'
    return news

def get_newsinfo(self, url, parser_item_fuc):
    '''
    Request a single news detail page and parse it with the supplied parser.
    '''
    t_sleep()
    log('Current URL', url)

    html = self.get_html(url)
    if html == 'timeout':
        return 'error'

    log('Current URL', url, html.status_code)
    if html.status_code != 200:
        log('Bad response for URL!!!', url)
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = parser_item_fuc(response)

    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'pbc'
    return news

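# The get_newsinfo variant above relies on a self.get_html helper that is not
# shown in this section. A minimal sketch, assuming it mirrors the try/except
# pattern of the other fetch functions here (requests.get with a short timeout
# and the string 'timeout' returned on failure); this is an assumption for
# illustration, not the project's confirmed implementation.
def get_html(self, url):
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
        return html
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'
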
def parser_item(self, item):
    news = News()
    news.spider_name = 'amac'
    news.url = self.parser_url(
        item.xpath('./@href')[0], 'http://www.amac.org.cn')
    news.title = item.xpath('./text()')[0]
    self.newslist.append(news)

def get_newsinfo(self, url):
    '''
    Request a single news detail page.
    :param url: detail page URL
    :return: News object, or 'timeout' on failure
    '''
    t_sleep()
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'gbk'
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'

    response = etree.HTML(html.text)
    log('Current URL', url)
    title, date, content = self.parse_item(response)

    news = News(title=title, date=date, content=content, url=url)
    return news

def select_url(self):
    try:
        connect = get_connect()
        cursor = connect.cursor()
        print("connection")
        # Select every row from the first id whose content is still missing,
        # newest first.
        sql = ("SELECT id, source_url FROM toutiao_news "
               "WHERE id > (SELECT MIN(t.id) FROM "
               "(SELECT id FROM toutiao_news WHERE content IS NULL ORDER BY id) t) "
               "ORDER BY id DESC")
        cursor.execute(sql)
        result = cursor.fetchall()
        for row in result:
            news = News()
            news.id = row[0]
            news.source_url = row[1]
            self.news_list.append(news)
    except Exception as e:
        print(e)

def parse_item(self, response, url):
    try:
        title = (response.xpath('//h2[@class="titl"]/text()'))[0].strip()
    except Exception as e:
        title = '未知'  # 'unknown'
    try:
        date = (response.xpath('//p[@class="Wh"]/span[1]/text()'))[0].strip().split()[0]
        date = str(arrow.get(date)).split('T')[0]
    except Exception as e:
        date = '未知'  # 'unknown'
    try:
        con_list = response.xpath('//div[@class="detailCont"]/p')
        content = self.pasre_content(con_list)
    except Exception as e:
        content = '未知'  # 'unknown'

    item = News()
    item.title = title
    item.date = date
    item.content = content
    item.url = url
    item.spider_name = 'jingji'
    return item

def select_url():
    arrList = []
    try:
        connect = get_connect()
        cursor = connect.cursor()
        print("connection")
        sql = "SELECT id, source_url FROM toutiao_news WHERE id > 15855"
        cursor.execute(sql)
        result = cursor.fetchall()
        for row in result:
            news = News()
            news.id = row[0]
            news.source_url = row[1]
            arrList.append(news)
    except Exception as e:
        print(e)
    finally:
        return arrList

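# Both select_url variants rely on a get_connect helper that is not shown in
# this section. A minimal sketch, assuming a MySQL backend via pymysql; the
# host, credentials, and database name are placeholders, not project values.
import pymysql

def get_connect():
    return pymysql.connect(
        host='localhost',       # placeholder
        user='root',            # placeholder
        password='******',      # placeholder
        db='news',              # placeholder database name
        charset='utf8mb4'
    )
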
def toutiao_news_api(url):
    # Proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Proxy tunnel authentication
    proxyUser = "******"
    proxyPass = "******"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    ua = UserAgent(verify_ssl=False)
    headers = {
        'cookie': 'tt_webid=6825236887406953998; s_v_web_id=verify_ka17kc91_J51hfIgB_1Ujy_4F87_AQ77_v44SCeaZdYbb; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=ftj73c94a1589124278466; tt_webid=6825236887406953998; csrftoken=3bc73a541ff3c196706a5fa652baa10a; ttcid=93c87bb6d2c44204a824c060f2a0344b39; SLARDAR_WEB_ID=167cd898-158d-4682-84b7-515f808f9c49; tt_scid=nvrgh8BUDb5bfXypX.EbNgFcMiVjrSr7vdwnPAab2w2tEn2I8DLcdmqRb2aAGGvT6b9b',
        'user-agent': ua.random,
        'x-requested-with': 'XMLHttpRequest'
    }

    toutiao_data = requests.get(url, headers=headers, proxies=proxies).text
    global data
    data = json.loads(toutiao_data)
    global max_behot_time
    max_behot_time = data['next']['max_behot_time']
    items = data['data']

    news_list = []
    link_head = 'http://toutiao.com'
    for n in items:
        if 'title' in n and n['tag'] != 'ad' and n['tag'] != 'news_media':
            news = News()
            news.title = n['title']
            print(news.title)
            news.tag = n['tag']
            news.source = n['source']
            # Convert the behot_time timestamp to local time,
            # formatted like 2016-05-05 20:28:54.
            time_local = time.localtime(n['behot_time'])
            dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            news.news_date = dt
            print(news.news_date)
            news.source_url = link_head + n['source_url']
            news_list.append(news)
    return news_list

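# Hypothetical driver showing how toutiao_news_api might be called; the feed
# URL and its query parameters below are assumptions for illustration only.
# The function itself only needs a response whose JSON carries 'data' and
# 'next'['max_behot_time']; the latter is exposed through the module-level
# max_behot_time global so a caller can page through the feed.
def crawl_feed_pages(pages=3):
    base = 'https://www.toutiao.com/api/pc/feed/?max_behot_time=%s'  # assumed endpoint
    cursor = 0
    collected = []
    for _ in range(pages):
        collected.extend(toutiao_news_api(base % cursor))
        cursor = max_behot_time  # set globally inside toutiao_news_api
    return collected
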
def parser_item(self, item):
    news = News()
    news.spider_name = 'mohurd'
    news.url = item.xpath('./@href')[0]
    news.title = item.xpath('./text()')[0]
    news.date = item.getparent().getnext().xpath(
        './text()')[0][1:-1].replace('.', '-').strip()
    self.newslist.append(news)

def parser_item(self, item): url = item.xpath('./li[@class="mc"]/div/a/@href')[0] date = item.xpath('./li[@class="fbrq"]/text()')[0] news = News() news.spider_name = 'csrc' news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic') news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0] news.date = arrow.get(date).format('YYYY-MM-DD') # log(news.url, news.title, news.date) self.newslist.append(news)
def parser_item(self, item):
    url = item.xpath('./a/@href')[0]
    date = item.xpath('./span/text()')[0]

    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(url, 'http://www.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    news.date = date
    self.newslist.append(news)

def parser_item(self, item):
    url = item.xpath('./a/@href')[0]
    if 'search' in url:
        return
    date = item.getnext().xpath('./text()')[0][1:-1]

    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(url, 'http://www.circ.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    news.date = date
    self.newslist.append(news)

def get_html(self, url):
    html = requests.get(url, headers=self.get_news_header())
    html.encoding = 'utf-8'
    html = etree.HTML(html.text)

    items = html.xpath('//a[@class="STYLE8"]')
    for item in items:
        news = News()
        news.spider_name = 'cbrc'
        news.url = item.xpath('./@href')[0]
        news.title = item.xpath('./@title')[0]
        news.date = item.getparent().getnext().xpath('./text()')[0].strip()
        self.newslist.append(news)
    return self.parser_url(self.newslist)

def keyword_search(keyword):
    source_url_list = select_source_url_returnset()
    url = ('http://www.toutiao.com/search_content/?offset=0&format=json'
           '&keyword=' + keyword + '&autoload=true&count=200&cur_tab=1')
    toutiao_data = requests.get(url).text
    data = json.loads(toutiao_data)
    items = data['data']

    news_list = []
    link_head = 'http://toutiao.com'
    for n in items:
        if 'title' in n:
            news = News()
            news.title = n['title']
            news.tag = n['tag']
            news.source = n['source']
            news.source_url = link_head + n['source_url']
            # Search keyword ("Two Sessions" topic)
            news.keyword = keyword
            # Keywords supplied by Toutiao itself
            news.keywords = n['keywords']
            # Skip entries whose source_url is already in the database
            if news.source_url in source_url_list:
                print('Record already in database!')
                continue
            print('New record:', news.title)
            news_list.append(news)
    return news_list

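# keyword_search depends on a select_source_url_returnset helper that is not
# shown in this section. A minimal sketch, assuming it returns the set of
# source_url values already stored in the toutiao_news table (using the same
# get_connect helper and table as the select_url functions above); this is an
# assumed implementation, not the project's confirmed one.
def select_source_url_returnset():
    urls = set()
    try:
        connect = get_connect()
        cursor = connect.cursor()
        cursor.execute("SELECT source_url FROM toutiao_news")
        for row in cursor.fetchall():
            urls.add(row[0])
    except Exception as e:
        print(e)
    finally:
        return urls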