def __init__(self):
    super(STCNBase, self).__init__()
    self.table_name = "stcn_info"
    info = utils.org_tablecode_map.get(self.table_name)
    self.name, self.table_code = info[0], info[1]
    self.extractor = GeneralNewsExtractor()
    self.fields = ['pub_date', 'code', 'title', 'link', 'article']
def get_novels_content(url, **kwargs):
    headers = {
        'User-Agent': get_random_user_agent(),
        'Referer': url,
    }
    max_content = ''
    html, real_url, status = get_html_by_requests(headers=headers, url=url, **kwargs)
    netloc = get_netloc(real_url)
    if html:
        if netloc in RULES:
            soup = BeautifulSoup(html, 'html5lib')
            selector = RULES[netloc].content_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
            if content:
                max_content = content[0].get_text()
        else:
            extractor = GeneralNewsExtractor()
            result = extractor.extract(html, with_body_html=True)
            max_content = result.get('content', '')
    for key in CONTENT_REPLACE:
        max_content = max_content.replace(key, CONTENT_REPLACE[key])
    res_content = '\n'.join([i.strip() for i in max_content.split('\n') if i.strip()])
    return res_content, status
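# A minimal sketch of the rule structure get_novels_content() assumes above.
# The Rule type, the example netloc, and the CONTENT_REPLACE entry are all
# hypothetical, inferred only from how RULES[netloc].content_selector is used.
from collections import namedtuple

Rule = namedtuple('Rule', ['content_selector'])

RULES = {
    # keyed by netloc; content_selector carries one of 'id', 'class', or 'tag'
    'www.example-novels.com': Rule(content_selector={'id': 'chapter-content'}),
}

CONTENT_REPLACE = {'\u3000': ' '}  # e.g. normalize full-width spaces (illustrative)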
def __init__(self):
    super(FK, self).__init__()
    self.list_url = 'http://finance.takungpao.com/fk/'
    self.extractor = GeneralNewsExtractor()
    self.table = 'takungpao'
    self.name = '风口'
    self.fields = ['link', 'title', 'pub_date', 'article']
def parse_item(self, response):
    item = HyNewsItem()
    resp = response.text
    extractor = GeneralNewsExtractor()
    result = extractor.extract(resp, with_body_html=False)
    title = response.css('#zxwk_left_1 h2::text').extract_first()
    txt = result['content']
    p_time = result['publish_time']
    lyurl = response.url
    lyname = '石油在线'
    content_css = [
        '#zxwk_left_1',
    ]
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    item['title'] = title
    item['txt'] = txt
    item['p_time'] = get_times(p_time)
    item['content'] = content
    item['spider_name'] = 'HY_SYZX'
    item['module_name'] = '石油在线'
    item['cate'] = '石油'
    item['region'] = ''
    item['code'] = ''
    item['link'] = lyurl
    item['website'] = lyname
    if content:
        yield item
def parse_item(self, response):
    """Parse the detail page; adapt the parsing to the actual page structure."""
    item = XtyDataCollectItem()
    resp = response.text
    extractor = GeneralNewsExtractor()
    result = extractor.extract(resp, with_body_html=False)
    title = result['title']
    txt = result['content']
    p_time = result['publish_time']
    content = ''
    for css in get_content_css():
        try:
            content = ''.join(response.css(css).extract())
            if content:
                break
        except Exception:
            logging.warning('Failed to extract the article body, please check the selector')
    appendix, appendix_name = get_attachments(response)
    item['title'] = title
    item['txt'] = txt
    item['p_time'] = get_times(p_time)
    item['content'] = content
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    yield item
class GetPageContent(object):
    def __init__(self):
        self.headers = {}
        self.extractor = GeneralNewsExtractor()

    def static_page_dict(self, url: str) -> str:
        res = requests.get(url=url, headers=self.headers, timeout=15)
        detail_html = res.text
        return detail_html

    def extractor_html(self, detail_html):
        result = self.extractor.extract(detail_html)
        return result

    def extractor_html_code(self, detail_html):
        result = self.extractor.extract(detail_html, with_body_html=True)
        return result

    def extractor_html_abstract_path(self, detail_html, host=""):
        result = self.extractor.extract(detail_html, host=host)
        return result

    def extractor_html_noise_code(self, detail_html, title_xpath="", noise_node_list=""):
        result = self.extractor.extract(detail_html, title_xpath=title_xpath, noise_node_list=noise_node_list)
        return result
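# Usage sketch for GetPageContent, assuming requests and gne's
# GeneralNewsExtractor are importable as the class above requires;
# the URL is a placeholder, not one used by the original project.
fetcher = GetPageContent()
detail_html = fetcher.static_page_dict('https://example.com/article.html')  # hypothetical URL
result = fetcher.extractor_html(detail_html)
print(result.get('title'), result.get('publish_time'))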
def parse_items(self, response):
    lyurl = response.url
    extractor = GeneralNewsExtractor()
    resp = response.text
    result = extractor.extract(resp, with_body_html=False)
    title = response.css('#title::text').extract_first()
    txt = result['content']
    publish_time = response.css('#info::text').extract_first()
    time = get_times(publish_time)
    item = HyxhItem()
    content_css = [
        '#maininfo',
    ]
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['appendix'] = appendix
    item['source'] = '浙江省船舶行业协会'
    item['website'] = '浙江省船舶行业协会'
    item['link'] = lyurl
    item['appendix_name'] = appendix_name
    item['type'] = 2
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'zhejaing_Aship2'
    item['module_name'] = '行业协会'
    yield item
def __init__(self):
    super(qqStock, self).__init__()
    self.extractor = GeneralNewsExtractor()
    self.token = "8f6b50e1667f130c10f981309e1d8200"
    self.list_url = "https://pacaio.match.qq.com/irs/rcd?cid=52&token={}" \
                    "&ext=3911,3922,3923,3914,3913,3930,3915,3918,3908&callback=__jp1".format(self.token)
    self.fields = []
def parse_item(self, response):
    item = HyNewsItem()
    resp = response.text
    extractor = GeneralNewsExtractor()
    result = extractor.extract(resp, with_body_html=False)
    title = result['title']
    txt = result['content']
    p_time = result['publish_time']
    lyurl = response.url
    lyname = '生意宝'
    content_css = [
        '.zstexts',
    ]
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    classify, codes, region = get_category(txt)
    item['title'] = title
    item['txt'] = txt
    item['p_time'] = get_times(p_time)
    item['content'] = content
    item['spider_name'] = 'HY_SYB'
    item['module_name'] = '行业新闻'
    item['cate'] = classify
    item['region'] = region
    item['code'] = codes
    item['link'] = lyurl
    item['website'] = lyname
    if content:
        yield item
def parse_items(self, response):
    lyurl = response.url
    extractor = GeneralNewsExtractor()
    resp = response.text
    result = extractor.extract(resp, with_body_html=False)
    title = response.css('.nrBt01::text').extract_first()
    txt = result['content']
    publish_time = response.xpath(
        '//*[@id="ctl00_main_panel3"]/table[2]/tr/td[1]/text()'
    ).extract_first()
    time = get_times(publish_time)
    item = HyxhItem()
    content_css = ['.nrTxt02']
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['appendix'] = appendix
    item['source'] = '中国海洋工程咨询协会'
    item['website'] = '中国海洋工程咨询协会'
    item['link'] = lyurl
    item['appendix_name'] = appendix_name
    item['type'] = 1
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'china_ocean'
    item['module_name'] = '行业协会'
    yield item
def parse(self, response):
    item = YanbItem()
    resp = response.text
    extractor = GeneralNewsExtractor()
    result = extractor.extract(resp, with_body_html=False)
    title = result['title']
    txt = result['content']
    p_time = result['publish_time']
    content_css = ['.body']
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    appendix, appendix_name = get_attachments(response)
    tags, _, _ = get_category(txt + title)
    industry = ''
    item['title'] = title
    item['p_time'] = get_times(str(p_time))
    item['industry'] = industry
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['content'] = ''.join(content)
    item['pub'] = '链塔'
    item['ctype'] = 3
    item['website'] = '链塔'
    item['txt'] = ''.join(txt).strip()
    item['link'] = response.url
    item['spider_name'] = 'YB_LT'
    item['module_name'] = '研报'
    item['tags'] = tags
    if content:
        yield item
def parse_items(self, response):
    extractor = GeneralNewsExtractor()
    resp = response.text
    result = extractor.extract(resp, with_body_html=False)
    title = result['title']
    publish_time = result['publish_time']
    time = get_times(publish_time)
    item = TRUCKItem()
    content_css = ['.details']
    lyurl = response.url
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['source'] = '中国卡车网'
    item['website'] = '中国卡车网'
    item['link'] = lyurl
    item['type'] = 1
    item['time'] = time
    item['content'] = content
    item['spider_name'] = 'ZGKCW'
    item['module_name'] = '产销库'
    yield item
def __init__(self): self.table = "stcn_info" self.local = LOCAL self.check_dt = datetime.datetime.today() - datetime.timedelta(days=2) self.dt_fmt = '%Y-%m-%d' # if self.local: # conf = { # "host": LOCAL_MYSQL_HOST, # "port": LOCAL_MYSQL_PORT, # "user": LOCAL_MYSQL_USER, # "password": LOCAL_MYSQL_PASSWORD, # "db": LOCAL_MYSQL_DB, # } # else: # conf = { # "host": MYSQL_HOST, # "port": MYSQL_PORT, # "user": MYSQL_USER, # "password": MYSQL_PASSWORD, # "db": MYSQL_DB, # } # self.sql_pool = PyMysqlPoolBase(**conf) # 默认是不需要翻页的 self.pages = False self.extractor = GeneralNewsExtractor()
def __init__(self):
    super(Reference, self).__init__()
    self.index_url = 'http://www.jfinfo.com/reference'
    self.more_url = 'http://www.jfinfo.com/articles_categories/more?page={}&category_id=13'
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_eb1542e7fa53709d9037dcc8c652e026=1583204516; Hm_lpvt_eb1542e7fa53709d9037dcc8c652e026=1583205739; _jfinfo_session=SzdiRTlIeUw5QXdObkRSNG5kUGpVRDNCQld3NGVkcTcrWnVNR3dZdTA4TWxoRVd3VENkQlBTeHcxQkdGaS9nUG9qdDVEeFlqMEI1OFdQMmdYNXJLTyt0YzJjRkRVbEVKa25YOUQvWUl5RjZFTm5WbENuN1JLZ05RSFR4cEVYVW90alhpSGNHSldiYWlZMDNXR0NuK293PT0tLWJwd2UybVpjREltRHB1bUxMdUxBZ2c9PQ%3D%3D--4ef0e46e0b2629bbf61194ceefd60e8b6b398499',
        'Host': 'www.jfinfo.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.jfinfo.com/reference',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        'X-CSRF-Token': '9Kgn0ZaNQJWoqIht/pDIK9h97D5wuSFQ4gbSV8eB3eeXm3BVKjz7g8kDflyf0G4LssxDAOa0J297e6x5aKPndQ==',
        'X-Requested-With': 'XMLHttpRequest',
    }
    self.extractor = GeneralNewsExtractor()
    self.table_name = 'jfinfo'  # 巨丰资讯
    self.fields = ['link', 'title', 'pub_date', 'article']
    self.max_page = 2
    self.name = '巨丰内参'
def __init__(self): self.list_url = "http://money.163.com/special/00251G8F/news_json.js" self.extractor = GeneralNewsExtractor() self.headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36", } self.local = LOCAL if self.local: conf = { "host": LOCAL_MYSQL_HOST, "port": LOCAL_MYSQL_PORT, "user": LOCAL_MYSQL_USER, "password": LOCAL_MYSQL_PASSWORD, "db": LOCAL_MYSQL_DB, } self.db = LOCAL_MYSQL_DB else: conf = { "host": MYSQL_HOST, "port": MYSQL_PORT, "user": MYSQL_USER, "password": MYSQL_PASSWORD, "db": MYSQL_DB, } self.db = MYSQL_DB self.sql_pool = PyMysqlPoolBase(**conf) self.table = "netease_money" self.error_detail = []
def parse_items(self, response):
    extractor = GeneralNewsExtractor()
    resp = response.text
    result = extractor.extract(resp, with_body_html=False)
    title = response.css('h1::text').extract_first()
    txt = result['content']
    publish_time = result['publish_time']
    time = get_times(publish_time)
    item = HyxhItem()
    content_css = [
        '.wof',
    ]
    lyurl = response.url
    for content in content_css:
        content = ''.join(response.css(content).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}: no matching CSS selector, content not extracted')
    item['title'] = title
    appendix, appendix_name = get_attachments(response)
    item['appendix'] = appendix
    item['source'] = '中国机械工业联合会'
    item['website'] = '中国机械工业联合会'
    item['link'] = lyurl
    item['appendix_name'] = appendix_name
    item['type'] = 1
    item['tags'] = ''
    item['time'] = time
    item['content'] = content
    item['txt'] = txt
    item['spider_name'] = 'chinaFeature1'
    item['module_name'] = '行业协会'
    yield item
def __init__(self):
    super(CN4Hours, self).__init__()
    self.list_url = "http://app.cnstock.com/api/theme/get_theme_list?"
    self.extractor = GeneralNewsExtractor()
    self.table_name = "cn_stock"
    self.name = '上证四小时'
    self.fields = ['pub_date', 'title', 'link', 'article']
def __init__(self):
    super(HKStock_CJSS, self).__init__()
    self.page = 11
    self.name = '财经时事'
    self.first_url = 'http://finance.takungpao.com/hkstock/cjss/index.html'
    self.format_url = "http://finance.takungpao.com/hkstock/cjss/index_{}.html"
    self.extractor = GeneralNewsExtractor()
    self.table = 'takungpao'
    self.fields = ['link', 'title', 'pub_date', 'article']
def __init__(self):
    super(NetEaseMoney, self).__init__()
    self.list_url = "http://money.163.com/special/00251G8F/news_json.js"
    self.extractor = GeneralNewsExtractor()
    self.fields = ['pub_date', 'title', 'link', 'article']
    # self.name = '网易财经'
    self.table_name = "netease_money"
    info = utils.org_tablecode_map.get(self.table_name)
    self.name, self.table_code = info[0], info[1]
def __init__(self, format_url):
    super(Depth, self).__init__()
    self.this_last_dt = None
    self.name = '财联社-深度及题材'
    self.url_format = format_url
    self.table = 'cls_depth_theme'
    self.extractor = GeneralNewsExtractor()
    self.fields = ['title', 'link', 'pub_date', 'article']
    self.error_detail = []
def __init__(self):
    super(EconomicObserver, self).__init__()
    self.name = '经济观察家'
    self.first_url = 'http://www.takungpao.com/finance/236134/index.html'
    self.format_url = 'http://www.takungpao.com/finance/236134/{}.html'
    self.page = 10
    self.fields = ['pub_date', 'link', 'title', 'article']
    self.table = 'takungpao'
    self.extractor = GeneralNewsExtractor()
def __init__(self):
    super(CN4Hours, self).__init__()
    self.list_url = "http://app.cnstock.com/api/theme/get_theme_list?"
    self.extractor = GeneralNewsExtractor()
    self.table_name = "cn_stock"
    self.fields = ['pub_date', 'title', 'link', 'article']
    self.type = '上证四小时'
    info = utils.org_tablecode_map.get(self.table_name)
    self.name, self.table_code = info[0], info[1]
def __init__(self, name=None, urls=None, **kwargs):
    super().__init__(name, **kwargs)
    self.extractor = GeneralNewsExtractor()
    # self.urls = urls if urls is not None else ["https://tech.meituan.com/2013/12/04/yui3-practice.html"]
    if urls is None:
        raise ValueError("urls must not be empty")
    self.urls = urls
    self.count = 1
    MeituanArticleSpider.runing = True
def __init__(self):
    super(ZhongGuoJingJi, self).__init__()
    self.name = '中国经济'
    self.first_url = 'http://www.takungpao.com/finance/236132/index.html'
    self.format_url = 'http://www.takungpao.com/finance/236132/{}.html'
    self.page = 10
    self.fields = ['pub_date', 'link', 'title', 'article']
    self.table = 'takungpao'
    self.extractor = GeneralNewsExtractor()
def parse_from_html(html: str, url: str) -> dict:
    """Extract news content from the html; the url is only echoed back in the result (it is not processed)."""
    extractor = GeneralNewsExtractor()
    try:
        result = extractor.extract(html, with_body_html=True)
    except Exception as e:
        raise Exception(f'Html parsing error, reason: {e}')
    result['url'] = url
    return result
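# Usage sketch for parse_from_html; the inline HTML document is a made-up
# stand-in, not a page from the original project.
sample_html = """
<html><head><title>Example headline</title></head>
<body><h1>Example headline</h1><p>2020-01-01 Body text of the article.</p></body></html>
"""
result = parse_from_html(sample_html, url='https://example.com/news/1')
print(result['title'], result['url'])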
def __init__(self):
    super(CCTVFinance, self).__init__()
    self.web_url = 'https://jingji.cctv.com/index.shtml'
    self.url = 'https://news.cctv.com/2019/07/gaiban/cmsdatainterface/page/economy_1.jsonp?cb=economy'
    self.extractor = GeneralNewsExtractor()
    self.fields = ['title', 'keywords', 'pub_date', 'brief', 'link', 'article']
    self.table_name = 'cctvfinance'
    # self.name = '央视网-财经频道'
    info = utils.org_tablecode_map.get(self.table_name)
    self.name, self.table_code = info[0], info[1]
def __init__(
    self,
    keyword_list,
    is_save=True,
    url='http://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word={}'
):
    self.url = url
    self.is_save = is_save
    self.keyword_list = keyword_list
    self.extractor = GeneralNewsExtractor()
def __init__(self):
    # TODO refactor: https://new.qq.com/ch/finance/
    super(qqStock, self).__init__()
    self.extractor = GeneralNewsExtractor()
    self.token = "8f6b50e1667f130c10f981309e1d8200"
    self.list_url = "https://pacaio.match.qq.com/irs/rcd?cid=52&token={}" \
                    "&ext=3911,3922,3923,3914,3913,3930,3915,3918,3908&callback=__jp1".format(self.token)
    self.fields = []
    self.table_name = "qq_Astock_news"
    # self.name = '腾讯财经[A股]'
    info = utils.org_tablecode_map.get(self.table_name)
    self.name, self.table_code = info[0], info[1]
def fetch(url):
    headers = {
        "User-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers).content.decode('utf-8', "ignore")
        extractor = GeneralNewsExtractor()
        article_content = extractor.extract(response)
        article_content["url"] = url
    except Exception:
        return None
    return article_content
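# Usage sketch for fetch(); it returns None on any request or parsing
# failure, so guard the result before indexing. The URL is a placeholder.
article = fetch('https://example.com/news/42.html')
if article is not None:
    print(article['title'], article['url'])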
def extract(task: ExtractTask):
    extractor = GeneralNewsExtractor()
    try:
        result = extractor.extract(
            task.html,
            title_xpath=task.title_xpath,
            author_xpath=task.author_xpath,
            publish_time_xpath=task.publish_time_xpath,
            with_body_html=task.with_body_html,
            host=task.host,
            noise_node_list=task.noise_node_list,
        )
    except Exception as e:
        result = {'success': False, 'msg': str(e)}
    return result
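# A hypothetical shape for the ExtractTask consumed above, assuming a
# pydantic model as in a typical FastAPI service; the original definition is
# not shown, so the fields and defaults are inferred from the keyword
# arguments that gne's GeneralNewsExtractor.extract() accepts.
from typing import List, Optional
from pydantic import BaseModel

class ExtractTask(BaseModel):
    html: str
    title_xpath: str = ''
    author_xpath: str = ''
    publish_time_xpath: str = ''
    with_body_html: bool = False
    host: str = ''
    noise_node_list: Optional[List[str]] = None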