class GetPageContent(object):
    """Thin convenience wrapper around requests + GNE.

    Fetches a page and exposes several extraction flavours of
    ``GeneralNewsExtractor.extract`` as named methods.
    """

    def __init__(self):
        self.headers = {}
        self.extractor = GeneralNewsExtractor()

    def static_page_dict(self, url: str) -> str:
        """Download *url* and return the raw HTML text."""
        return requests.get(url=url, headers=self.headers, timeout=15).text

    def extractor_html(self, detail_html):
        """Default extraction (title / content / publish_time)."""
        return self.extractor.extract(detail_html)

    def extractor_html_code(self, detail_html):
        """Extraction that also keeps the body HTML (with_body_html=True)."""
        return self.extractor.extract(detail_html, with_body_html=True)

    def extractor_html_abstract_path(self, detail_html, host=""):
        """Extraction passing *host* through to GNE."""
        return self.extractor.extract(detail_html, host=host)

    def extractor_html_noise_code(self, detail_html, title_xpath="", noise_node_list=""):
        """Extraction with an explicit title XPath and noise nodes to strip."""
        return self.extractor.extract(
            detail_html,
            title_xpath=title_xpath,
            noise_node_list=noise_node_list,
        )
def parse_items(self, response):
    """Parse one 中国海洋工程咨询协会 detail page and yield a HyxhItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    page_url = response.url
    headline = response.css('.nrBt01::text').extract_first()
    body_text = result['content']
    raw_time = response.xpath(
        '//*[@id="ctl00_main_panel3"]/table[2]/tr/td[1]/text()'
    ).extract_first()
    pub_time = get_times(raw_time)
    # First selector that yields markup wins.
    content = ''
    for selector in ['.nrTxt02']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '中国海洋工程咨询协会'
    item['website'] = '中国海洋工程咨询协会'
    item['link'] = page_url
    item['type'] = 1
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'china_ocean'
    item['module_name'] = '行业协会'
    yield item
def parse_item(self, response):
    """实际解析页面 — parse one detail page and yield an XtyDataCollectItem.

    Fixes over the previous revision:
    * ``.extact()`` typo -> ``.extract()`` (the old call always raised
      AttributeError, the bare except swallowed it, and ``content`` was
      left unbound, crashing with NameError further down).
    * ``txt`` now carries the extracted body (``result['content']``)
      instead of duplicating the title.
    * ``content`` is initialised so the item is still built when no
      selector matches.
    * bare ``except`` narrowed to ``Exception``.
    """
    item = XtyDataCollectItem()
    extractor = GeneralNewsExtractor()
    result = extractor.extract(response.text, with_body_html=False)
    title = result['title']
    txt = result['content']
    p_tiem = result['publish_time']  # item field key is 'p_tiem' (kept for compatibility)
    content = ''
    for conte in get_content_css():
        try:
            content = response.css(conte).extract()
            if content:
                break
        except Exception:
            logging.warning('正文获取失败,请检查')
    appendix, appendix_name = get_attachments(response)
    item['title'] = title
    item['txt'] = txt
    item['p_tiem'] = get_times(p_tiem)
    item['content'] = content
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    yield item
def parse(self, response):
    """Parse one 链塔 research-report page and yield a YanbItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = result['title']
    body_text = result['content']
    raw_time = result['publish_time']
    # First selector that yields markup wins.
    content = ''
    for selector in ['.body']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    tags, _, _ = get_category(body_text + headline)
    item = YanbItem()
    item['title'] = headline
    item['p_time'] = get_times(str(raw_time))
    item['industry'] = ''
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['content'] = ''.join(content)
    item['pub'] = '链塔'
    item['ctype'] = 3
    item['website'] = '链塔'
    item['txt'] = ''.join(body_text).strip()
    item['link'] = response.url
    item['spider_name'] = 'YB_LT'
    item['module_name'] = '研报'
    item['tags'] = tags
    if content:
        yield item
def parse_items(self, response):
    """Parse one 中国机械工业联合会 page and yield a HyxhItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = response.css('h1::text').extract_first()
    body_text = result['content']
    pub_time = get_times(result['publish_time'])
    page_url = response.url
    # First selector that yields markup wins.
    content = ''
    for selector in ['.wof']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '中国机械工业联合会'
    item['website'] = '中国机械工业联合会'
    item['link'] = page_url
    item['type'] = 1
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'chinaFeature1'
    item['module_name'] = '行业协会'
    yield item
def get_novels_content(url, **kwargs):
    """Fetch a novel chapter page and return (cleaned_text, http_status)."""
    request_headers = {
        'User-Agent': get_random_user_agent(),
        'Referer': url
    }
    body = ''
    html, real_url, status = get_html_by_requests(headers=request_headers, url=url, **kwargs)
    netloc = get_netloc(real_url)
    if html:
        if netloc in RULES:
            # Site-specific rule: locate the content node with BeautifulSoup.
            soup = BeautifulSoup(html, 'html5lib')
            selector = RULES[netloc].content_selector
            if selector.get('id', None):
                nodes = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                nodes = soup.find_all(class_=selector['class'])
            else:
                nodes = soup.find_all(selector.get('tag'))
            if nodes:
                body = nodes[0].get_text()
        else:
            # No rule registered for this host: generic GNE extraction.
            body = GeneralNewsExtractor().extract(html, with_body_html=True).get('content', '')
    for old in CONTENT_REPLACE:
        body = body.replace(old, CONTENT_REPLACE[old])
    # Drop blank lines and strip each remaining line.
    cleaned = '\n'.join(line.strip() for line in body.split('\n') if line.strip())
    return cleaned, status
def parse_item(self, response):
    """Parse one 石油在线 article page and yield a HyNewsItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = response.css('#zxwk_left_1 h2::text').extract_first()
    body_text = result['content']
    pub_time = result['publish_time']
    page_url = response.url
    site_name = '石油在线'
    # First selector that yields markup wins.
    content = ''
    for selector in ['#zxwk_left_1']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    item = HyNewsItem()
    item['title'] = headline
    item['txt'] = body_text
    item['p_time'] = get_times(pub_time)
    item['content'] = content
    item['spider_name'] = 'HY_SYZX'
    item['module_name'] = '石油在线'
    item['cate'] = '石油'
    item['region'] = ''
    item['code'] = ''
    item['link'] = page_url
    item['website'] = site_name
    if content:
        yield item
def parse_item(self, response):
    """Parse one 生意宝 article page, classify it, and yield a HyNewsItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = result['title']
    body_text = result['content']
    pub_time = result['publish_time']
    page_url = response.url
    site_name = '生意宝'
    # First selector that yields markup wins.
    content = ''
    for selector in ['.zstexts']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    # Derive industry classification / stock codes / region from the body.
    classify, codes, region = get_category(body_text)
    item = HyNewsItem()
    item['title'] = headline
    item['txt'] = body_text
    item['p_time'] = get_times(pub_time)
    item['content'] = content
    item['spider_name'] = 'HY_SYB'
    item['module_name'] = '行业新闻'
    item['cate'] = classify
    item['region'] = region
    item['code'] = codes
    item['link'] = page_url
    item['website'] = site_name
    if content:
        yield item
def parse_items(self, response):
    """Parse one 浙江省船舶行业协会 page and yield a HyxhItem."""
    page_url = response.url
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = response.css('#title::text').extract_first()
    body_text = result['content']
    raw_time = response.css('#info::text').extract_first()
    pub_time = get_times(raw_time)
    # First selector that yields markup wins.
    content = ''
    for selector in ['#maininfo']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '浙江省船舶行业协会'
    item['website'] = '浙江省船舶行业协会'
    item['link'] = page_url
    item['type'] = 2
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'zhejaing_Aship2'
    item['module_name'] = '行业协会'
    yield item
def parse_items(self, response):
    """Parse one 中国卡车网 article page and yield a TRUCKItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = result['title']
    pub_time = get_times(result['publish_time'])
    page_url = response.url
    # First selector that yields markup wins.
    content = ''
    for selector in ['.details']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    # Attachments are collected here, but TRUCKItem carries no appendix fields.
    appendix, appendix_name = get_attachments(response)
    item = TRUCKItem()
    item['title'] = headline
    item['source'] = '中国卡车网'
    item['website'] = '中国卡车网'
    item['link'] = page_url
    item['type'] = 1
    item['time'] = pub_time
    item['content'] = content
    item['spider_name'] = 'ZGKCW'
    item['module_name'] = '产销库'
    yield item
def parse_from_html(html: str, url: str) -> dict:
    """Extract news fields from *html* with GNE; *url* is only echoed back in the result.

    :param html: raw page HTML
    :param url: source URL, stored under result['url'] (not fetched)
    :return: GNE result dict, including body HTML, plus the 'url' key
    :raises Exception: wraps any extractor failure, chaining the original cause
    """
    extractor = GeneralNewsExtractor()
    try:
        result = extractor.extract(html, with_body_html=True)
    except Exception as e:
        # `from e` keeps the original traceback instead of discarding it.
        raise Exception(f'Html parsing error, reason: {e}') from e
    result['url'] = url
    return result
def fetch(url):
    """Download *url*, run GNE extraction, and return the result dict with 'url' added.

    Returns None on any failure (network error, decode error, extraction error).
    Changes: a 15s timeout so the request cannot hang forever, and the bare
    ``except`` narrowed to ``Exception`` so KeyboardInterrupt etc. still propagate.
    """
    headers = {
        "User-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=15).content.decode(
            'utf-8', "ignore")
        extractor = GeneralNewsExtractor()
        article_content = extractor.extract(response)
        article_content["url"] = url
    except Exception:
        # Best-effort contract preserved: callers treat None as "fetch failed".
        return None
    return article_content
def extract(task: ExtractTask):
    """Run GNE extraction for one ExtractTask.

    On failure, returns ``{'success': False, 'msg': <error text>}`` instead
    of raising, so the caller always gets a dict.
    """
    extractor = GeneralNewsExtractor()
    try:
        return extractor.extract(
            task.html,
            title_xpath=task.title_xpath,
            author_xpath=task.author_xpath,
            publish_time_xpath=task.publish_time_xpath,
            with_body_html=task.with_body_html,
            host=task.host,
            noise_node_list=task.noise_node_list,
        )
    except Exception as e:
        return {'success': False, 'msg': str(e)}
class FK(Base):
    """Crawler for the 大公报 finance '风口' column: list page -> detail pages -> save."""

    def __init__(self):
        super(FK, self).__init__()
        self.list_url = 'http://finance.takungpao.com/fk/'
        self.extractor = GeneralNewsExtractor()
        self.table = 'takungpao'  # destination DB table
        self.name = '风口'  # column display name
        self.fields = ['link', 'title', 'pub_date', 'article']

    def _parse_detail(self, body):
        # GNE pulls the article body out of the raw detail-page HTML.
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _start(self):
        # Fetch the single list page, walk every news entry, fetch each
        # detail page, and save the accumulated items in one batch.
        resp = self.get(self.list_url)
        if resp:
            body = resp.text
            doc = html.fromstring(body)
            news_list = doc.xpath(
                '//div[@class="wrap-l js-list fl_dib"]/ul/li/div[@class="list-text fr_dib"]'
            )
            # print(len(news_list))
            items = []
            for news in news_list:
                item = {}
                # print(news.text_content().split("\r\n"))
                title = news.xpath('./h1/a')[0].text_content()
                # print(title)
                item['title'] = title
                link = news.xpath('./h1/a/@href')[0]
                # print(link)
                item['link'] = link
                pub_date = news.xpath('./div[@class="date"]')[0].text_content()
                # print(pub_date)
                item['pub_date'] = pub_date
                detail_resp = self.get(link)
                if detail_resp:
                    detail_page = detail_resp.text
                    article = self._parse_detail(detail_page)
                    if article:
                        item['article'] = article
                print(item)
                items.append(item)
            self.save(items)
async def parse_new(self, id):
    """Open news page *id* in a pyppeteer browser, extract the article, save and sort it.

    Skips saving when the target file already exists on disk.
    """
    browser = await launch(headless=False, args=['--disable-infobars'])
    page = await browser.newPage()
    ulls = self.url + id
    await page.goto(ulls)
    extractor = GeneralNewsExtractor()
    result = extractor.extract(await page.content())
    intab = '?/|\.><:*"'  # characters Windows rejects in filenames
    title = result['title']
    # Strip any filename-illegal character from the title before it is
    # used as a file name (Windows would error otherwise).
    for s in intab:
        if s in title:
            title = title.replace(s, '')
    # Drop everything from the trailing "来源" (source credit) onwards.
    contend = result['content'].split('来源')[0]
    path = 'E:\\biyejinri\\result\\' + title + '.txt'
    if not os.path.exists(path):
        self.save_data("result/" + title + ".txt", contend)
        self.sort_data(contend)
    await browser.close()
class MeituanArticleSpider(scrapy.Spider):
    """Crawl tech.meituan.com articles, following the 'next' pagination link up to ~20 pages."""

    runing = False  # class-level flag: True while a crawl is in progress
    name = "meituan_article"

    def __init__(self, name=None, urls=None, **kwargs):
        super().__init__(name, **kwargs)
        self.extractor = GeneralNewsExtractor()
        # self.urls = urls if urls is not None else ["https://tech.meituan.com/2013/12/04/yui3-practice.html"]
        if urls is None:
            raise ValueError("url 不能为空")
        self.urls = urls
        self.count = 1  # pages visited so far; bounds the pagination follow
        MeituanArticleSpider.runing = True

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response: scrapy.http.response.Response):
        # Follow pagination first, capped at 20 pages total.
        next_page = response.xpath(
            '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
        ).get()
        if next_page:
            print(next_page)
            self.count += 1
            if self.count < 20:
                yield response.follow(next_page, callback=self.parse)
        # NOTE(review): `desc` is computed but never used.
        desc = response.xpath('//meta[@name="description"]/@content').get()
        tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
        res = self.extractor.extract(response.text)
        yield MeituanArticleSpiderItem(url=response.url,
                                       title=res['title'],
                                       content=res['content'],
                                       tags=tags,
                                       author=res['author'],
                                       publish_time=res['publish_time'])

    @staticmethod
    def close(spider, reason):
        # Zero-arg super() in a staticmethod binds to the first argument
        # (the spider instance), mirroring scrapy's own Spider.close signature.
        MeituanArticleSpider.runing = False
        return super().close(spider, reason)
def parse_item(self, response):
    '''
    Parse one article page into a NewsItem.
    :param response: scrapy Response; meta carries title/url/source_media from the list page
    :return: yields a populated NewsItem
    '''
    exetractor = GeneralNewsExtractor()
    # logging.info(response.text)
    # NOTE(review): '//h5/text' selects a child element named <text>, not the
    # h5's text node — this likely should be '//h5/text()'; confirm before changing.
    newInfo = exetractor.extract(response.text, title_xpath='//h5/text')
    item = NewsItem()
    item['title'] = response.meta['title']
    item['publish_date'] = newInfo['publish_time']
    item['url'] = response.meta['url']
    item['source_media'] = response.meta['source_media']
    item['spread_media'] = ''
    # Collapse the body to a single line by stripping newlines.
    item['content'] = newInfo['content'].replace('\n', '')
    item['content_html'] = ''
    item['spider_data_file'] = self.spiderDataFile
    yield item
def parse_items(self, response):
    """Parse a 江苏省新材料产业协会 detail page (the paginated list URL itself is skipped)."""
    page_url = response.url
    # Only detail pages are parsed; the list URL is ignored.
    if 'http://jamia.org.cn/index.php?g=&m=contents&a=index&term_id=31&page=' not in page_url:
        result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
        headline = response.css('h2::text').extract_first()
        body_text = result['content']
        pub_time = get_times(result['publish_time'])
        # First XPath that yields markup wins.
        content = ''
        for selector in [
                '/html/body/div[1]/div[4]/table/tbody/tr/td[2]/table/tbody/tr[2]/td/table/tbody/tr[4]'
        ]:
            content = ''.join(response.xpath(selector).extract())
            if content:
                break
        if not content:
            logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
        appendix, appendix_name = get_attachments(response)
        item = HyxhItem()
        item['title'] = headline
        item['appendix'] = appendix
        item['appendix_name'] = appendix_name
        item['source'] = '江苏省新材料产业协会'
        item['website'] = '江苏省新材料产业协会'
        item['link'] = page_url
        item['type'] = 1
        item['tags'] = ''
        item['time'] = pub_time
        item['content'] = content
        item['txt'] = body_text
        item['spider_name'] = 'jiangsu'
        item['module_name'] = '行业协会'
        yield item
def parse_items(self, response):
    """Parse one 深圳市医疗器械行业协会 page and yield a HyxhItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = result['title']
    body_text = result['content']
    pub_time = get_times(result['publish_time'])
    print(response.url)
    page_url = response.url
    # First selector that yields markup wins.
    content = ''
    for selector in ['.MsoNormal', '#rightcol p']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '深圳市医疗器械行业协会'
    item['website'] = '深圳市医疗器械行业协会'
    item['link'] = page_url
    item['type'] = 1
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'shenzhen'
    item['module_name'] = '行业协会'
    yield item
def main():
    """Scrape one WeChat article via Selenium, interleave its images with the text, write demo.docx.

    Fixes over the previous revision:
    * ``insert_factor`` could be 0 (article shorter than its image count) or the
      division could raise ZeroDivisionError when there were no images — guarded.
    * ``content_list.index(content)`` returned the FIRST occurrence for duplicate
      paragraphs, repeating the same picture slot — replaced with ``enumerate``.
    """
    driver = webdriver.Chrome('/home/mi/tools/chromedriver')
    driver.get(
        'https://mp.weixin.qq.com/s?__biz=MzUyNDM4NDc4NA==&mid=2247501581&idx=1&sn=f26c6238cae565804512fe637f55fca5&chksm=fa2ca872cd5b2164800820778d22305eb77bb91f2213cd9f7ecc97e33818a8ea9b07f01ec158&scene=21#wechat_redirect'
    )
    time.sleep(3)  # let the page render before grabbing page_source
    extractor = GeneralNewsExtractor()
    result = extractor.extract(driver.page_source, with_body_html=True)
    print(result)
    # json.dumps(): 对数据进行编码。
    # json.loads(): 对数据进行解码。
    res = json.dumps(result)
    data = json.loads(res)
    title = data['title']
    content = data['content']
    images = data['images']
    print("images = : " + str(len(images)))
    for image in images:
        print(image)
    content_list = content.split('\n')
    # Guard: at least 1 so modulo/floor-division below never divides by zero.
    insert_factor = max(1, len(content_list) // len(images)) if images else len(content_list)
    print("insert_factor = : " + str(insert_factor))
    print("content_list = : " + str(len(content_list)))
    write_title(title)
    dir_name = title.replace(' ', '')
    save_file_to_local(dir_name, images)
    for index, paragraph in enumerate(content_list):
        picture_index = index // insert_factor
        # Insert the next image every `insert_factor` paragraphs.
        if index > 0 and index % insert_factor == 0 and picture_index < len(images):
            write_picture(str(picture_index))
        print(paragraph)
        write_paragraph(paragraph)
    driver.quit()
    document.save('demo.docx')
def parse_items(self, response):
    """Parse one 上海市生物医药协会 page and yield a HyxhItem."""
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = response.xpath(
        '/html/body/div[4]/div/div/div[2]/div[3]/div[1]/span/text()'
    ).extract_first()
    body_text = result['content']
    raw_time = ''.join(
        response.xpath('/html/body/div[4]/div/div/div[2]/div[3]/div[2]/text()').extract())
    pub_time = get_times(raw_time)
    page_url = response.url
    # First XPath that yields markup wins.
    content = ''
    for selector in ['/html/body/div[4]/div/div/div[2]/div[3]/div[4]']:
        content = ''.join(response.xpath(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '上海市生物医药协会'
    item['website'] = '上海市生物医药协会'
    item['link'] = page_url
    item['type'] = 1
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'shanghai_sbia'
    item['module_name'] = '行业协会'
    yield item
def parse_items(self, response):
    """Parse one 上海市船舶与海洋工程学会 page and yield a HyxhItem."""
    page_url = response.url
    result = GeneralNewsExtractor().extract(response.text, with_body_html=False)
    headline = response.css('h2::text').extract_first()
    body_text = result['content']
    raw_time = response.xpath(
        '//div[@class="main2 ma clear"]/span[@class="riqi_1"][1]/text()'
    ).extract_first()
    # publish_time = response.xpath('/html/body/div[4]/span[1]/text()').extract_first()
    print('publish_time:' + str(raw_time))
    pub_time = get_times(raw_time)
    # First selector that yields markup wins.
    content = ''
    for selector in ['.para.ma']:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')
    appendix, appendix_name = get_attachments(response)
    item = HyxhItem()
    item['title'] = headline
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['source'] = '上海市船舶与海洋工程学会'
    item['website'] = '上海市船舶与海洋工程学会'
    item['link'] = page_url
    item['type'] = 1
    item['tags'] = ''
    item['time'] = pub_time
    item['content'] = content
    item['txt'] = body_text
    item['spider_name'] = 'shanghai_Aship'
    item['module_name'] = '行业协会'
    yield item
class ZhongGuoJingJi(Base):
    """Crawler for the 大公报 '中国经济' column: paginated list pages -> detail pages -> save."""

    def __init__(self):
        super(ZhongGuoJingJi, self).__init__()
        self.name = '中国经济'
        self.first_url = 'http://www.takungpao.com/finance/236132/index.html'
        self.format_url = 'http://www.takungpao.com/finance/236132/{}.html'
        self.page = 10  # number of list pages to walk
        self.fields = ['pub_date', 'link', 'title', 'article']
        self.table = 'takungpao'
        self.extractor = GeneralNewsExtractor()

    def _process_pub_dt(self, pub_date):
        # Normalize the site's various relative date formats to one absolute style.
        current_dt = datetime.datetime.now()
        yesterday_dt_str = (datetime.datetime.now() -
                            datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        after_yesterday_dt_str = (
            datetime.datetime.now() -
            datetime.timedelta(days=2)).strftime("%Y-%m-%d")
        if "小时前" in pub_date:
            # eg. 20小时前
            hours = int(pub_date.replace('小时前', ''))
            pub_date = (
                current_dt -
                datetime.timedelta(hours=hours)).strftime("%Y-%m-%d %H:%M:%S")
        elif "昨天" in pub_date:
            # eg. 昨天04:24
            pub_date = pub_date.replace('昨天', '')
            pub_date = " ".join([yesterday_dt_str, pub_date])
        elif '前天' in pub_date:
            # eg. 前天11:33
            pub_date = pub_date.replace("前天", '')
            pub_date = " ".join([after_yesterday_dt_str, pub_date])
        else:
            # eg. 02-29 04:24 — prepend the current year
            pub_date = str(current_dt.year) + '-' + pub_date
        # print(pub_date)
        return pub_date

    def _parse_detail(self, link):
        # Fetch the detail page and let GNE pull out the article body.
        detail_resp = self.get(link)
        if detail_resp:
            body = detail_resp.text
            result = self.extractor.extract(body)
            content = result.get("content")
            return content

    def _parse_list(self, list_url):
        # Parse one list page into item dicts (title/link/pub_date/article).
        items = []
        list_resp = self.get(list_url)
        if list_resp:
            list_page = list_resp.text
            doc = html.fromstring(list_page)
            news_list = doc.xpath(
                '//div[@class="wrap_left"]/dl[@class="item clearfix"]')
            for news in news_list:
                item = {}
                link = news.xpath('./dd[@class="intro"]/a/@href')[0]
                # print(link)
                item['link'] = link
                title = news.xpath("./dd/a/@title")[0]
                # print(title)
                item['title'] = title
                pub_date = news.xpath("./dd[@class='sort']/text()")[0]
                pub_date = self._process_pub_dt(pub_date)
                # print(pub_date)
                item['pub_date'] = pub_date
                article = self._parse_detail(link)
                if article:
                    article = self._process_content(article)
                    item['article'] = article
                print(item)
                items.append(item)
        return items

    def _start(self):
        # Walk all list pages (page 1 has a distinct URL) and save each batch.
        for page in range(1, self.page + 1):
            print("page >>>", page)
            if page == 1:
                list_url = self.first_url
            else:
                list_url = self.format_url.format(page)
            items = self._parse_list(list_url)
            self.save(items)
class BaseSpider(object):
    """Shared crawler base: page fetching (selenium or the PBC anti-bot JS bypass),
    GNE detail parsing, content sanitising, and MySQL persistence.

    Fixes over the previous revision:
    * ``_check_selenium_status``: the failure counter ``i`` was reset to 0 on
      every lap of ``while True``, so ``i > 10`` could never fire and a dead
      selenium endpoint looped forever — the counter is now hoisted.
    * ``fetch_page``: the recursive retry re-entered with a fresh ``retry = 2``,
      making retries unbounded — the remaining-attempts count is now threaded
      through a backward-compatible default parameter.
    """

    def __init__(self):
        # Default request headers.
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        # Whether we are running locally (switches DB config).
        self.local = LOCAL
        # shodua
        self.use_js = True
        # Selenium Chrome setup, only when the JS bypass is not used.
        if not self.use_js:
            if self.local:
                self.browser = webdriver.Chrome()
            else:
                self._check_selenium_status()
                self.browser = webdriver.Remote(
                    command_executor="http://chrome:4444/wd/hub",
                    desired_capabilities=DesiredCapabilities.CHROME)
            self.browser.implicitly_wait(5)
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_client = PyMysqlBase(**conf)
        self.extractor = GeneralNewsExtractor()
        self.error_list = []    # list-page URLs that failed entirely
        self.error_detail = []  # detail-page links that failed

    def _check_selenium_status(self):
        """Poll the selenium hub until it answers; give up after 10 failures."""
        i = 0  # hoisted out of the loop — it used to reset every iteration
        while True:
            try:
                resp = requests.get("http://chrome:4444/wd/hub/status",
                                    timeout=0.5)
            except:
                i += 1
                if i > 10:
                    raise
            else:
                logger.info(resp.text)
                break

    def __del__(self):
        # Best-effort browser cleanup; browser may never have been created.
        try:
            self.browser.close()
        except:
            pass

    def fetch_page(self, url, retry=2):
        """Fetch *url*; returns the page source or None after `retry` extra attempts."""
        if self.use_js:
            return self.js_get_page(url)
        try:
            self.browser.get(url)
            return self.browser.page_source
        except:
            if retry <= 0:
                return None
            print('Crawling Failed', url)
            print('try to fetch again')
            time.sleep(3)
            # Thread the decremented budget through — previously each recursion
            # restarted with a full budget, so retries never terminated.
            return self.fetch_page(url, retry - 1)

    def _get_refer_url(self, body):
        """Solve the PBC WZWS anti-bot challenge and return the redirect URL."""
        doc = html.fromstring(body)
        script_content = doc.xpath("//script")[0].text_content()
        re_str = r"var(.+?).split"
        ret = re.findall(re_str, script_content)[0]
        # print("regex result: ", ret)
        ret = ret.lstrip("|(")
        ret = ret.rstrip("')")
        ret_lst = ret.split("|")
        # Alternate entries are name/value pairs.
        names = ret_lst[0::2]
        params = ret_lst[1::2]
        info = dict(zip(names, params))
        factor = sum([ord(ch) for ch in info.get("wzwsquestion")]) * int(
            info.get("wzwsfactor")) + 0x1b207
        raw = f'WZWS_CONFIRM_PREFIX_LABEL{factor}'
        refer_url = info.get(
            "dynamicurl") + '?wzwschallenge=' + base64.b64encode(
                raw.encode()).decode()
        return "http://www.pbc.gov.cn" + refer_url

    def js_get_page(self, url):
        """Fetch a pbc.gov.cn page by replaying the JS challenge redirect."""
        s = requests.Session()
        h1 = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.pbc.gov.cn',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        }
        resp1 = s.get(url, headers=h1)
        cookie1 = resp1.headers.get("Set-Cookie").split(";")[0]
        origin_text = resp1.text
        redirect_url = self._get_refer_url(origin_text)
        h1.update({
            'Cookie': cookie1,
            'Referer': 'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/11040/index1.html',
        })
        resp2 = s.get(redirect_url, headers=h1)
        text = resp2.text.encode("ISO-8859-1").decode("utf-8")
        return text

    def gne_parse_detail(self, page):
        """Extract the article body from a detail page with GNE."""
        result = self.extractor.extract(page)
        content = result.get("content")
        return content

    def contract_sql(self, to_insert):
        """Build a parameterized INSERT statement + value tuple from a dict."""
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db,
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _process_item(self, item):
        # Hook for subclasses; default is a no-op pass-through.
        return item
        # item.update({"article": self._process_content(item.get("article"))})
        # return item

    def _process_content(self, vs):
        # Strip 4-byte UTF-8 characters — they break inserts into utf8 MySQL columns.
        try:
            # UCS-4 python build
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2 python build
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        params = list()
        for v in vs:
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _filter_char(self, test_str):
        # Remove special whitespace/control characters.
        # '\u200b' is \xe2\x80\x8b
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # \xa0 -> ordinary space
        return test_str

    def save(self, item):
        """Insert one item; returns 1 on duplicate, the client's result otherwise."""
        to_insert = self._process_item(item)
        insert_sql, values = self.contract_sql(to_insert)
        try:
            ret = self.sql_client.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            logger.warning("重复", to_insert)
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _get_page_url(self, page_num):
        # Subclasses define start_url or first_url/format_url.
        if self.start_url:
            return self.start_url.format(page_num)
        elif page_num == 1:
            return self.first_url
        else:
            return self.format_url.format(page_num)

    def process_list(self, page_num):
        """Fetch + parse one list page, with 2 retries; None on failure."""
        page_url = self._get_page_url(page_num)
        list_retry = 2
        while True:
            try:
                list_page = self.fetch_page(page_url)
                if list_page:
                    items = self._parse_list_page(list_page)
                else:
                    raise
            except:
                print("list page {} retry ".format(page_num))
                list_retry -= 1
                if list_retry < 0:
                    return
                # self.process_list(page_num)
            else:
                return items

    def process_detail(self, link):
        """Fetch + parse one detail page, with 2 retries; None on failure."""
        detail_retry = 2
        while True:
            try:
                detail_page = self.fetch_page(link)
                if detail_page:
                    article = self._parse_detail_page(detail_page)
                else:
                    raise
            except:
                print("detail page {} retry ".format(link))
                detail_retry -= 1
                if detail_retry < 0:
                    return
            else:
                return article

    def _start(self, page_num):
        """Crawl one list page and all its detail pages, recording failures."""
        print("page num is {}\n".format(page_num))
        items = self.process_list(page_num)
        print(items)
        if items:
            for item in items:
                link = item["link"]
                article = self.process_detail(link)
                if article:
                    item['article'] = article
                    # print(item)
                    ret = self.save(item)
                    if not ret:
                        self.error_detail.append(item.get("link"))
                else:
                    self.error_detail.append(link)
        else:
            self.error_list.append(self._get_page_url(page_num))
import json
import glob

from gne import GeneralNewsExtractor

if __name__ == '__main__':
    html_list = glob.glob('**/*/*.html', recursive=True)
    # One extractor instance reused for every file — it was previously
    # re-instantiated inside the loop for no benefit.
    extractor = GeneralNewsExtractor()
    for html_file in html_list:
        with open(html_file, encoding='utf-8') as f:
            html = f.read()
        result = extractor.extract(html,
                                   host='https://www.xxx.com',
                                   noise_node_list=[
                                       '//div[@class="comment-list"]',
                                       '//*[@style="display:none"]',
                                   ])
        print(f'>>>>>>>>>>>>>{html_file}>>>>>>>>>>>>>')
        print(json.dumps(result, indent=2, ensure_ascii=False))
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def get_data_dict(news_url):
    """Download *news_url* and return the GNE extraction result dict."""
    page_html = get_response_text(news_url)
    return GeneralNewsExtractor().extract(page_html)
class CN4Hours(SpiderBase):
    """Crawler for 上证四小时 (cnstock theme feed): JSONP list API -> digest page -> detail page."""

    def __init__(self):
        super(CN4Hours, self).__init__()
        self.list_url = "http://app.cnstock.com/api/theme/get_theme_list?"
        self.extractor = GeneralNewsExtractor()
        self.table_name = "cn_stock"
        self.name = '上证四小时'
        self.fields = ['pub_date', 'title', 'link', 'article']

    def _create_table(self):
        # Idempotent table creation for the destination table.
        self._spider_init()
        sql = '''
        CREATE TABLE IF NOT EXISTS `{}` ( `id` int(11) NOT NULL AUTO_INCREMENT, `pub_date` datetime NOT NULL COMMENT '发布时间', `title` varchar(64) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章标题', `link` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章详情页链接', `article` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '详情页内容', `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP, `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`id`), UNIQUE KEY `link` (`link`), KEY `pub_date` (`pub_date`), KEY `update_time` (`UPDATETIMEJZ`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='上海证券报';
        '''.format(self.table_name)
        self.spider_client.insert(sql)
        self.spider_client.end()

    def make_query_params(self, latest_id):
        """Build the dynamic JSONP query parameters for one list request."""
        query_params = {
            'maxid': str(0),
            'minid': str(latest_id),  # larger id == newer content
            'size': 5,
            'callback': 'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 20)),
                str(int(time.time() * 1000))),
            '_': str(int(time.time() * 1000)),
        }
        return query_params

    def get_zhaiyao(self, url):
        # Digest page -> the real detail-page link; None on any failure.
        try:
            page = req.get(url, headers=self.headers).text
            doc = html.fromstring(page)
            detail_link = doc.xpath("//div[@class='tcbhd-r']//h1/a/@href")[0]
            return detail_link
        except:
            return None

    def get_detail(self, detail_url):
        # Detail page -> article body via GNE; None on any failure.
        try:
            page = req.get(detail_url, headers=self.headers).text
            result = self.extractor.extract(page)
            content = result.get("content")
            return content
        except:
            return None

    def get_count(self):
        # The newest entry's "order" field tells us how many entries exist.
        params = self.make_query_params(0)
        url = self.list_url + urlencode(params)
        ret = req.get(url, headers=self.headers).text
        # Unwrap the JSONP callback to get the JSON payload.
        json_data = re.findall(r'jQuery\d{20}_\d{13}\((\{.*?\})\)', ret)[0]
        py_data = json.loads(json_data)
        count = py_data.get("item")[0].get("order")
        return count + 1

    def get_list(self):
        """Page backwards through the feed (5 at a time) and collect raw items."""
        count = self.get_count()
        print("网页个数: ", count)
        items = []
        for latest_id in range(count, 0, -5):
            params = self.make_query_params(latest_id)
            url = self.list_url + urlencode(params)
            ret = req.get(url, headers=self.headers).text
            logger.info(ret)
            json_data = re.findall(r'jQuery\d{20}_\d{13}\((\{.*?\})\)', ret)[0]
            py_data = json.loads(json_data)
            datas = py_data.get("item")
            if not datas:
                break
            for one in datas:
                item = dict()
                item['pub_date'] = one.get("datetime")
                item['title'] = one.get("title")
                item[
                    'zhaiyao'] = 'http://news.cnstock.com/theme,{}.html'.format(
                        one.get("id"))
                items.append(item)
        return items

    def start(self):
        """Full run: ensure table, list the feed, resolve + fetch details, batch save."""
        self._create_table()
        self._spider_init()
        items = self.get_list()
        nitems = []
        for item in items:
            zhaiyao_link = item.get('zhaiyao')
            detail_url = self.get_zhaiyao(zhaiyao_link)
            if detail_url:
                item['link'] = detail_url
                item['article'] = self.get_detail(detail_url)
                item.pop("zhaiyao")
                print(item)
                nitems.append(item)
        print("数据量 : ", len(nitems))
        ret = self._batch_save(self.spider_client, nitems, self.table_name,
                               self.fields)
        print("插入个数: ", ret)
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: lms
@file: gne.py
@time: 2020/2/27 21:50
@desc: Extract a locally saved news page with GNE and print every result field.
"""
from gne import GeneralNewsExtractor

# news_url: https://news.163.com/20/0222/19/F610K69R00019B3E.html
# `with` closes the handle (the old `open(...).read()` leaked it);
# 'r' instead of 'r+' because the file is only read.
with open('news.html', 'r', encoding='utf-8') as f:
    html = f.read()

extractor = GeneralNewsExtractor()
result = extractor.extract(html, with_body_html=True)
# res_json = json.dumps(result, ensure_ascii=False)
for k, v in result.items():
    print(k, v)
import os
from gne import GeneralNewsExtractor

# Read the list of HTML file paths, one per line.
# `with` blocks replace the hand-managed handles of the old version,
# so files are closed even if extraction raises.
filename = []
with open("/home/ianliu/develope/maintex_Extraction/mytest.txt", "r") as fp:
    for line in fp.readlines():
        filename.append(line.split("\n")[0])

extractor = GeneralNewsExtractor()
for f in filename:
    print(f)
    with open(f, "r") as html:
        result = extractor.extract(html.read())
    '''print(f.split(".")[0].split("/") )'''
    # NOTE(review): index [8] assumes a fixed directory depth for the input
    # paths — confirm against mytest.txt before reusing elsewhere.
    out_path = ("/home/ianliu/develope/maintex_Extraction/python/output/"
                + f.split(".")[0].split("/")[8] + ".txt")
    with open(out_path, "w") as wf:
        wf.write(result['content'])
class Reference(SpiderBase):
    """巨丰资讯 "内参" (jfinfo reference) spider.

    Scrapes the article list from the index page plus up to ``max_page``
    pages of the "more" AJAX endpoint, fetches each detail page, extracts
    the article body with GNE and batch-saves rows into the ``jfinfo`` table.
    """

    def __init__(self):
        super(Reference, self).__init__()
        self.index_url = 'http://www.jfinfo.com/reference'
        self.more_url = 'http://www.jfinfo.com/articles_categories/more?page={}&category_id=13'
        # NOTE(review): the Cookie session and X-CSRF-Token below are captured
        # from a live browser session and will expire — requests will start
        # failing once they do; they need periodic refreshing.
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'Hm_lvt_eb1542e7fa53709d9037dcc8c652e026=1583204516; Hm_lpvt_eb1542e7fa53709d9037dcc8c652e026=1583205739; _jfinfo_session=SzdiRTlIeUw5QXdObkRSNG5kUGpVRDNCQld3NGVkcTcrWnVNR3dZdTA4TWxoRVd3VENkQlBTeHcxQkdGaS9nUG9qdDVEeFlqMEI1OFdQMmdYNXJLTyt0YzJjRkRVbEVKa25YOUQvWUl5RjZFTm5WbENuN1JLZ05RSFR4cEVYVW90alhpSGNHSldiYWlZMDNXR0NuK293PT0tLWJwd2UybVpjREltRHB1bUxMdUxBZ2c9PQ%3D%3D--4ef0e46e0b2629bbf61194ceefd60e8b6b398499',
            'Host': 'www.jfinfo.com',
            'Pragma': 'no-cache',
            'Referer': 'http://www.jfinfo.com/reference',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
            'X-CSRF-Token': '9Kgn0ZaNQJWoqIht/pDIK9h97D5wuSFQ4gbSV8eB3eeXm3BVKjz7g8kDflyf0G4LssxDAOa0J297e6x5aKPndQ==',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.extractor = GeneralNewsExtractor()
        self.table_name = 'jfinfo'  # 巨丰资讯
        self.fields = ['link', 'title', 'pub_date', 'article']
        self.max_page = 2
        self.name = '巨丰内参'

    def get(self, url):
        """GET *url* with this spider's fixed headers.

        Fix: a timeout is now passed — without one requests can block forever
        on a stalled connection.
        """
        return requests.get(url, headers=self.headers, timeout=30)

    def _create_table(self):
        """Create the destination table if it does not exist (idempotent DDL)."""
        self._spider_init()
        sql = '''
        CREATE TABLE IF NOT EXISTS `{}`(
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `pub_date` datetime NOT NULL COMMENT '发布时间',
          `title` varchar(64) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章标题',
          `link` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章详情页链接',
          `article` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '详情页内容',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `link` (`link`),
          KEY `pub_date` (`pub_date`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨丰财经';
        '''.format(self.table_name)
        # NOTE(review): DDL goes through the client's insert() — presumably the
        # project client just executes arbitrary SQL there; verify.
        self.spider_client.insert(sql)
        self.spider_client.end()

    def _parse_detail(self, body):
        """Extract the main article text of a detail page via GNE."""
        result = self.extractor.extract(body)
        return result.get("content")

    def _collect_items(self, news_list, date_xpath):
        """Shared item builder for the index page and the "more" fragments.

        *news_list* is a list of lxml nodes each containing one article teaser;
        *date_xpath* differs between the two page layouts. Returns dicts with
        title/link/pub_date (+ article when the detail fetch succeeds).
        """
        items = []
        for news in news_list:
            item = {}
            item['title'] = news.xpath(".//a[@class='f20']/text()")[0].strip()
            link = news.xpath(".//a[@class='f20']/@href")[0]
            item['link'] = link
            # The link embeds the publish date, e.g. news/20161218/... — take
            # the first 4 digits as the year; some links have no date segment.
            _year = None
            try:
                _year = re.findall(r"news/(\d+)/", link)[0][:4]
            except IndexError:
                # was a bare except: — only the missing-match case is expected
                pass
            pub_date = news.xpath(date_xpath)[0].strip()
            item['pub_date'] = self._process_pub_dt(pub_date, _year)
            detail_resp = self.get(link)
            if detail_resp:
                item['article'] = self._parse_detail(detail_resp.text)
            items.append(item)
        return items

    def _parse_index(self, index_page):
        """Parse the HTML index page into a list of item dicts."""
        doc = html.fromstring(index_page)
        news_list = doc.xpath("//div[@class='m-contentl left']//dl")
        return self._collect_items(news_list, ".//dd/span/text()")

    def _parse_more(self, more_page):
        """Parse one "more" AJAX response into a list of item dicts.

        The endpoint returns JavaScript of the form::

            $(".m-newsYaow").append('<div class="slide"> ... </div>');

        so the quoted HTML fragment is pulled out of the append(...) call and
        parsed as a document of div.slide teaser nodes.
        """
        import ast  # local import: only this method needs it

        # Fix: was eval() on scraped content — ast.literal_eval parses the
        # quoted string literal safely without executing arbitrary code.
        append_datas = ast.literal_eval(
            re.findall(r"append\((.*?)\);", more_page)[0])
        doc = html.fromstring(append_datas)
        news_list = doc.xpath(".//div[@class='slide']")
        return self._collect_items(news_list, ".//span/text()")

    def start(self):
        """Run the spider: index page first, then the "more" pages 1..max_page."""
        self._spider_init()
        self._create_table()
        index_resp = self.get(self.index_url)
        if index_resp and index_resp.status_code == 200:
            index_page = index_resp.text
            index_items = self._parse_index(index_page)
            page_save_num = self._batch_save(self.spider_client, index_items,
                                             self.table_name, self.fields)
            logger.info(f"首页入库的个数是 {page_save_num}")
        for num in range(1, self.max_page + 1):
            more_url = self.more_url.format(num)
            more_resp = self.get(more_url)
            if more_resp and more_resp.status_code == 200:
                more_page = more_resp.text
                items = self._parse_more(more_page)
                page_save_num = self._batch_save(self.spider_client, items,
                                                 self.table_name, self.fields)
                logger.info(f"当前页 {num} 入库的个数是 {page_save_num}")