Example #1
class GetPageContent(object):
    def __init__(self):
        self.headers = {}
        self.extractor = GeneralNewsExtractor()

    def static_page_dict(self, url: str) -> str:
        res = requests.get(url=url, headers=self.headers, timeout=15)
        detail_html = res.text
        return detail_html

    def extractor_html(self, detail_html):
        result = self.extractor.extract(detail_html)
        return result

    def extractor_html_code(self, detail_html):
        result = self.extractor.extract(detail_html, with_body_html=True)
        return result

    def extractor_html_abstract_path(self, detail_html, host=""):
        result = self.extractor.extract(detail_html, host=host)
        return result

    def extractor_html_noise_code(self,
                                  detail_html,
                                  title_xpath="",
                                  noise_node_list=""):
        result = self.extractor.extract(detail_html,
                                        title_xpath=title_xpath,
                                        noise_node_list=noise_node_list)
        return result
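
A minimal usage sketch for the wrapper class above; the URL and host below are placeholders, and the requests / gne imports are assumed to be in scope:

# Hypothetical driver; the URL and host are illustrative only.
if __name__ == '__main__':
    page = GetPageContent()
    detail_html = page.static_page_dict('https://example.com/news/1.html')
    print(page.extractor_html(detail_html))           # title / author / publish_time / content / images
    print(page.extractor_html_code(detail_html))      # additionally returns the extracted body HTML
    print(page.extractor_html_abstract_path(detail_html, host='https://example.com'))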
Example #2
 def parse_items(self, response):
     lyurl = response.url
     extractor = GeneralNewsExtractor()
     resp = response.text
     result = extractor.extract(resp, with_body_html=False)
     title = response.css('.nrBt01::text').extract_first()
     txt = result['content']
     publish_time = response.xpath(
         '//*[@id="ctl00_main_panel3"]/table[2]/tr/td[1]/text()'
     ).extract_first()
     time = get_times(publish_time)
     item = HyxhItem()
     content_css = ['.nrTxt02']
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     item['title'] = title
     appendix, appendix_name = get_attachments(response)
     item['appendix'] = appendix
     item['source'] = '中国海洋工程咨询协会'
     item['website'] = '中国海洋工程咨询协会'
     item['link'] = lyurl
     item['appendix_name'] = appendix_name
     item['type'] = 1
     item['tags'] = ''
     item['time'] = time
     item['content'] = content
     item['txt'] = txt
     item['spider_name'] = 'china_ocean'
     item['module_name'] = '行业协会'
     yield item
Example #3
 def parse_item(self, response):
     '''
     Parse the detail page according to the page's actual structure.
     '''
     item = XtyDataCollectItem()
     resp = response.text
     extractor = GeneralNewsExtractor()
     result = extractor.extract(resp, with_body_html=False)
     title = result['title']
     txt = result['content']
     p_tiem = result['publish_time']
     content = ''
     for conte in get_content_css():
         try:
             content = response.css(conte).extract()
             if content:
                 break
         except Exception:
             logging.warning('failed to extract the article body, please check')
     appendix, appendix_name = get_attachments(response)
     item['title'] = title
     item['txt'] = txt
     item['p_tiem'] = get_times(p_tiem)
     item['content'] = content
     item['appendix'] = appendix
     item['appendix_name'] = appendix_name
     yield item
Example #4
 def parse(self, response):
     item = YanbItem()
     resp = response.text
     extractor = GeneralNewsExtractor()
     result = extractor.extract(resp, with_body_html=False)
     title = result['title']
     txt = result['content']
     p_time = result['publish_time']
     content_css = ['.body']
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     appendix, appendix_name = get_attachments(response)
     tags, _, _ = get_category(txt + title)
     industry = ''
     item['title'] = title
     item['p_time'] = get_times(str(p_time))
     item['industry'] = industry
     item['appendix'] = appendix
     item['appendix_name'] = appendix_name
     item['content'] = ''.join(content)
     item['pub'] = '链塔'
     item['ctype'] = 3
     item['website'] = '链塔'
     item['txt'] = ''.join(txt).strip()
     item['link'] = response.url
     item['spider_name'] = 'YB_LT'
     item['module_name'] = '研报'
     item['tags'] = tags
     if content:
         yield item
Example #5
    def parse_items(self, response):
        extractor = GeneralNewsExtractor()
        resp = response.text
        result = extractor.extract(resp, with_body_html=False)

        title = response.css('h1::text').extract_first()
        txt = result['content']
        publish_time = result['publish_time']
        time = get_times(publish_time)
        item = HyxhItem()
        content_css = [
            '.wof'
        ]
        lyurl = response.url
        for content in content_css:
            content = ''.join(response.css(content).extract())
            if content:
                break
            if not content:
                logging.warning(f'{response.url} no content selector matched; content not extracted')
        item['title'] = title
        appendix, appendix_name = get_attachments(response)
        item['appendix'] = appendix
        item['source'] = '中国机械工业联合会'
        item['website'] = '中国机械工业联合会'
        item['link'] = lyurl
        item['appendix_name'] = appendix_name
        item['type'] = 1
        item['tags'] = ''
        item['time'] = time
        item['content'] = content
        item['txt'] = txt
        item['spider_name'] = 'chinaFeature1'
        item['module_name'] = '行业协会'
        yield item
Example #6
def get_novels_content(url, **kwargs):
    headers = {
        'User-Agent': get_random_user_agent(),
        'Referer': url
    }

    max_content = ''
    html, real_url, status = get_html_by_requests(headers=headers, url=url, **kwargs)
    netloc = get_netloc(real_url)
    if html:
        if netloc in RULES:
            soup = BeautifulSoup(html, 'html5lib')
            selector = RULES[netloc].content_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
            if content:
                max_content = content[0].get_text()
        else:
            extractor = GeneralNewsExtractor()
            result = extractor.extract(html, with_body_html=True)
            max_content = result.get('content', '')
        for key in CONTENT_REPLACE:
            max_content = max_content.replace(key, CONTENT_REPLACE[key])
    res_content = '\n'.join([i.strip() for i in max_content.split('\n') if i.strip()])
    return res_content, status
Example #7
 def parse_item(self, response):
     item = HyNewsItem()
     resp = response.text
     extractor = GeneralNewsExtractor()
     result = extractor.extract(resp, with_body_html=False)
     title = response.css('#zxwk_left_1 h2::text').extract_first()
     txt = result['content']
     p_time = result['publish_time']
     lyurl = response.url
     lyname = '石油在线'
     content_css = [
         '#zxwk_left_1',
     ]
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     item['title'] = title
     item['txt'] = txt
     item['p_time'] = get_times(p_time)
     item['content'] = content
     item['spider_name'] = 'HY_SYZX'
     item['module_name'] = '石油在线'
     item['cate'] = '石油'
     item['region'] = ''
     item['code'] = ''
     item['link'] = lyurl
     item['website'] = lyname
     if content:
         yield item
Example #8
 def parse_item(self, response):
     item = HyNewsItem()
     resp = response.text
     extractor = GeneralNewsExtractor()
     result = extractor.extract(resp, with_body_html=False)
     title = result['title']
     txt = result['content']
     p_time = result['publish_time']
     lyurl = response.url
     lyname = '生意宝'
     content_css = [
         '.zstexts',
     ]
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     classify, codes, region = get_category(txt)
     item['title'] = title
     item['txt'] = txt
     item['p_time'] = get_times(p_time)
     item['content'] = content
     item['spider_name'] = 'HY_SYB'
     item['module_name'] = '行业新闻'
     item['cate'] = classify
     item['region'] = region
     item['code'] = codes
     item['link'] = lyurl
     item['website'] = lyname
     if content:
         yield item
Example #9
 def parse_items(self, response):
     lyurl = response.url
     extractor = GeneralNewsExtractor()
     resp = response.text
     result = extractor.extract(resp, with_body_html=False)
     title = response.css('#title::text').extract_first()
     txt = result['content']
     publish_time = response.css('#info::text').extract_first()
     time = get_times(publish_time)
     item = HyxhItem()
     content_css = [
         '#maininfo'
     ]
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     item['title'] = title
     appendix, appendix_name = get_attachments(response)
     item['appendix'] = appendix
     item['source'] = '浙江省船舶行业协会'
     item['website'] = '浙江省船舶行业协会'
     item['link'] = lyurl
     item['appendix_name'] = appendix_name
     item['type'] = 2
     item['tags'] = ''
     item['time'] = time
     item['content'] = content
     item['txt'] = txt
     item['spider_name'] = 'zhejaing_Aship2'
     item['module_name'] = '行业协会'
     yield item
Example #10
    def parse_items(self, response):
        extractor = GeneralNewsExtractor()
        resp = response.text
        result = extractor.extract(resp, with_body_html=False)

        title = result['title']
        publish_time = result['publish_time']
        time = get_times(publish_time)
        item = TRUCKItem()
        content_css = ['.details']
        lyurl = response.url
        for content in content_css:
            content = ''.join(response.css(content).extract())
            if content:
                break
            if not content:
                logging.warning(f'{response.url} no content selector matched; content not extracted')
        item['title'] = title
        appendix, appendix_name = get_attachments(response)
        item['source'] = '中国卡车网'
        item['website'] = '中国卡车网'
        item['link'] = lyurl
        item['type'] = 1
        item['time'] = time
        item['content'] = content
        item['spider_name'] = 'ZGKCW'
        item['module_name'] = '产销库'
        yield item
Example #11
def parse_from_html(html: str, url: str) -> dict:
    """根据 html 提取新闻内容, 该 url 仅用于返回(不做处理)"""
    extractor = GeneralNewsExtractor()
    try:
        result = extractor.extract(html, with_body_html=True)
    except Exception as e:
        raise Exception(f'Html parsing error, reason: {e}')
    result['url'] = url
    return result
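
A hedged usage sketch for the helper above; fetching the page with requests and the example URL are assumptions, while the url key is simply the value passed in:

# Hypothetical caller; the URL is a placeholder.
import requests

url = 'https://example.com/news/1.html'
html = requests.get(url, timeout=15).text
result = parse_from_html(html, url)
print(result['title'], result['publish_time'])
print(result['url'])  # the same url, attached to the result by parse_from_html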
Example #12
File: web.py Project: alexlint/PRJ7
def fetch(url):
    headers = {
        "User-agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers).content.decode(
            'utf-8', "ignore")
        extractor = GeneralNewsExtractor()
        article_content = extractor.extract(response)
        article_content["url"] = url
    except:
        return None
    return article_content
Example #13
def extract(task: ExtractTask):
    extractor = GeneralNewsExtractor()
    try:
        result = extractor.extract(
            task.html,
            title_xpath=task.title_xpath,
            author_xpath=task.author_xpath,
            publish_time_xpath=task.publish_time_xpath,
            with_body_html=task.with_body_html,
            host=task.host,
            noise_node_list=task.noise_node_list
        )
    except Exception as e:
        result = {'success': False, 'msg': str(e)}
    return result
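
The ExtractTask model itself is not shown in this example; a minimal sketch of what it might look like, assuming pydantic, with the field names taken directly from the call above and permissive defaults that can be passed straight through to extract():

# Hypothetical model; only the attribute names are grounded in the example above.
from typing import List, Optional
from pydantic import BaseModel

class ExtractTask(BaseModel):
    html: str
    title_xpath: str = ''
    author_xpath: str = ''
    publish_time_xpath: str = ''
    with_body_html: bool = False
    host: str = ''
    noise_node_list: Optional[List[str]] = None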
Example #14
class FK(Base):
    def __init__(self):
        super(FK, self).__init__()
        self.list_url = 'http://finance.takungpao.com/fk/'
        self.extractor = GeneralNewsExtractor()
        self.table = 'takungpao'
        self.name = '风口'
        self.fields = ['link', 'title', 'pub_date', 'article']

    def _parse_detail(self, body):
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _start(self):
        resp = self.get(self.list_url)
        if resp:
            body = resp.text
            doc = html.fromstring(body)
            news_list = doc.xpath(
                '//div[@class="wrap-l js-list fl_dib"]/ul/li/div[@class="list-text fr_dib"]'
            )
            # print(len(news_list))
            items = []
            for news in news_list:
                item = {}
                # print(news.text_content().split("\r\n"))
                title = news.xpath('./h1/a')[0].text_content()
                # print(title)
                item['title'] = title
                link = news.xpath('./h1/a/@href')[0]
                # print(link)
                item['link'] = link
                pub_date = news.xpath('./div[@class="date"]')[0].text_content()
                # print(pub_date)
                item['pub_date'] = pub_date

                detail_resp = self.get(link)
                if detail_resp:
                    detail_page = detail_resp.text
                    article = self._parse_detail(detail_page)
                    if article:
                        item['article'] = article
                        print(item)
                        items.append(item)
            self.save(items)
Example #15
File: ttspider.py Project: chonyie/spiders
 async def parse_new(self, id):
     browser = await launch(headless=False, args=['--disable-infobars'])
     page = await browser.newPage()
     ulls = self.url + id
     await page.goto(ulls)
     extractor = GeneralNewsExtractor()
     result = extractor.extract(await page.content())
     intab = '?/|\.><:*"'
     title = result['title']  # characters in intab are reserved in Windows file names and would break saving
     for s in intab:
         if s in title:
             title = title.replace(s, '')
     contend = result['content'].split('来源')[0]
     path = 'E:\\biyejinri\\result\\' + title + '.txt'
     if not os.path.exists(path):
         self.save_data("result/" + title + ".txt", contend)
         self.sort_data(contend)
     await browser.close()
Example #16
class MeituanArticleSpider(scrapy.Spider):
    runing = False
    name = "meituan_article"

    def __init__(self, name=None, urls=None, **kwargs):
        super().__init__(name, **kwargs)
        self.extractor = GeneralNewsExtractor()
        # self.urls = urls if urls is not None else ["https://tech.meituan.com/2013/12/04/yui3-practice.html"]
        if urls is None:
            raise ValueError("url 不能为空")
        self.urls = urls
        self.count = 1
        MeituanArticleSpider.runing = True

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response: scrapy.http.response.Response):
        next_page = response.xpath(
            '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
        ).get()
        if next_page:
            print(next_page)
            self.count += 1
            if self.count < 20:
                yield response.follow(next_page, callback=self.parse)

        desc = response.xpath('//meta[@name="description"]/@content').get()
        tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
        res = self.extractor.extract(response.text)
        yield MeituanArticleSpiderItem(url=response.url,
                                       title=res['title'],
                                       content=res['content'],
                                       tags=tags,
                                       author=res['author'],
                                       publish_time=res['publish_time'])

    @staticmethod
    def close(spider, reason):
        MeituanArticleSpider.runing = False
        return super().close(spider, reason)
Example #17
    def parse_item(self, response):
        '''
        Parse the article content.
        :param response:
        :return: the populated NewsItem object
        '''

        extractor = GeneralNewsExtractor()
        # logging.info(response.text)
        newInfo = extractor.extract(response.text, title_xpath='//h5/text()')
        item = NewsItem()
        item['title'] = response.meta['title']
        item['publish_date'] = newInfo['publish_time']
        item['url'] = response.meta['url']
        item['source_media'] = response.meta['source_media']
        item['spread_media'] = ''
        item['content'] = newInfo['content'].replace('\n', '')
        item['content_html'] = ''
        item['spider_data_file'] = self.spiderDataFile
        yield item
Example #18
    def parse_items(self, response):
        lyurl = response.url
        if lyurl.find(
                'http://jamia.org.cn/index.php?g=&m=contents&a=index&term_id=31&page='
        ) < 0:
            extractor = GeneralNewsExtractor()
            resp = response.text
            result = extractor.extract(resp, with_body_html=False)

            title = response.css('h2::text').extract_first()
            txt = result['content']
            publish_time = result['publish_time']
            time = get_times(publish_time)
            item = HyxhItem()
            content_css = [
                '/html/body/div[1]/div[4]/table/tbody/tr/td[2]/table/tbody/tr[2]/td/table/tbody/tr[4]'
            ]
            for content in content_css:
                content = ''.join(response.xpath(content).extract())
                if content:
                    break
                if not content:
                    logging.warning(f'{response.url} no content selector matched; content not extracted')
            item['title'] = title
            appendix, appendix_name = get_attachments(response)
            item['appendix'] = appendix
            item['source'] = '江苏省新材料产业协会'
            item['website'] = '江苏省新材料产业协会'
            item['link'] = lyurl
            item['appendix_name'] = appendix_name
            item['type'] = 1
            item['tags'] = ''
            item['time'] = time
            item['content'] = content
            item['txt'] = txt
            item['spider_name'] = 'jiangsu'
            item['module_name'] = '行业协会'
            yield item
Example #19
    def parse_items(self, response):
        extractor = GeneralNewsExtractor()
        resp = response.text
        result = extractor.extract(resp, with_body_html=False)

        title = result['title']
        txt = result['content']
        publish_time = result['publish_time']
        time = get_times(publish_time)
        item = HyxhItem()
        print(response.url)
        content_css = [
            '.MsoNormal',
            '#rightcol p',
        ]
        lyurl = response.url
        for content in content_css:
            content = ''.join(response.css(content).extract())
            if content:
                break
            if not content:
                logging.warning(f'{response.url} no content selector matched; content not extracted')
        item['title'] = title
        appendix, appendix_name = get_attachments(response)
        item['appendix'] = appendix
        item['source'] = '深圳市医疗器械行业协会'
        item['website'] = '深圳市医疗器械行业协会'
        item['link'] = lyurl
        item['appendix_name'] = appendix_name
        item['type'] = 1
        item['tags'] = ''
        item['time'] = time
        item['content'] = content
        item['txt'] = txt
        item['spider_name'] = 'shenzhen'
        item['module_name'] = '行业协会'
        yield item
Example #20
def main():
    driver = webdriver.Chrome('/home/mi/tools/chromedriver')
    driver.get(
        'https://mp.weixin.qq.com/s?__biz=MzUyNDM4NDc4NA==&mid=2247501581&idx=1&sn=f26c6238cae565804512fe637f55fca5&chksm=fa2ca872cd5b2164800820778d22305eb77bb91f2213cd9f7ecc97e33818a8ea9b07f01ec158&scene=21#wechat_redirect'
    )
    time.sleep(3)
    extractor = GeneralNewsExtractor()
    result = extractor.extract(driver.page_source, with_body_html=True)
    print(result)
    # json.dumps(): encodes the data to a JSON string.
    # json.loads(): decodes the JSON string back into Python data.
    res = json.dumps(result)
    data = json.loads(res)
    title = data['title']
    content = data['content']
    images = data['images']
    print("images = : " + str(len(images)))
    for image in images:
        print(image)
    content_list = content.split('\n')
    insert_factor = len(content_list) // len(images)
    print("insert_factor = : " + str(insert_factor))
    print("content_list = : " + str(len(content_list)))
    write_title(title)
    dir_name = title.replace(' ', '')
    save_file_to_local(dir_name, images)
    for content in content_list:
        index = content_list.index(content)
        picture_index = index // insert_factor
        if index > 0 and index % insert_factor == 0 and picture_index < len(
                images):
            write_picture(str(picture_index))
        print(content)
        write_paragraph(content)

    driver.quit()
    document.save('demo.docx')
Example #21
 def parse_items(self, response):
     extractor = GeneralNewsExtractor()
     resp = response.text
     result = extractor.extract(resp, with_body_html=False)
     title = response.xpath(
         '/html/body/div[4]/div/div/div[2]/div[3]/div[1]/span/text()'
     ).extract_first()
     txt = result['content']
     publish_time = ''.join(
         response.xpath(
             '/html/body/div[4]/div/div/div[2]/div[3]/div[2]/text()').
         extract())
     time = get_times(publish_time)
     item = HyxhItem()
     content_css = ['/html/body/div[4]/div/div/div[2]/div[3]/div[4]']
     lyurl = response.url
     for content in content_css:
         content = ''.join(response.xpath(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     item['title'] = title
     appendix, appendix_name = get_attachments(response)
     item['appendix'] = appendix
     item['source'] = '上海市生物医药协会'
     item['website'] = '上海市生物医药协会'
     item['link'] = lyurl
     item['appendix_name'] = appendix_name
     item['type'] = 1
     item['tags'] = ''
     item['time'] = time
     item['content'] = content
     item['txt'] = txt
     item['spider_name'] = 'shanghai_sbia'
     item['module_name'] = '行业协会'
     yield item
Example #22
 def parse_items(self, response):
     lyurl = response.url
     extractor = GeneralNewsExtractor()
     resp = response.text
     result = extractor.extract(resp, with_body_html=False)
     title = response.css('h2::text').extract_first()
     txt = result['content']
     publish_time = response.xpath(
         '//div[@class="main2 ma clear"]/span[@class="riqi_1"][1]/text()'
     ).extract_first()
     # publish_time = response.xpath('/html/body/div[4]/span[1]/text()').extract_first()
     print('publish_time:' + str(publish_time))
     time = get_times(publish_time)
     item = HyxhItem()
     content_css = ['.para.ma']
     for content in content_css:
         content = ''.join(response.css(content).extract())
         if content:
             break
         if not content:
             logging.warning(f'{response.url} no content selector matched; content not extracted')
     item['title'] = title
     appendix, appendix_name = get_attachments(response)
     item['appendix'] = appendix
     item['source'] = '上海市船舶与海洋工程学会'
     item['website'] = '上海市船舶与海洋工程学会'
     item['link'] = lyurl
     item['appendix_name'] = appendix_name
     item['type'] = 1
     item['tags'] = ''
     item['time'] = time
     item['content'] = content
     item['txt'] = txt
     item['spider_name'] = 'shanghai_Aship'
     item['module_name'] = '行业协会'
     yield item
Example #23
class ZhongGuoJingJi(Base):
    def __init__(self):
        super(ZhongGuoJingJi, self).__init__()
        self.name = '中国经济'
        self.first_url = 'http://www.takungpao.com/finance/236132/index.html'
        self.format_url = 'http://www.takungpao.com/finance/236132/{}.html'
        self.page = 10
        self.fields = ['pub_date', 'link', 'title', 'article']
        self.table = 'takungpao'
        self.extractor = GeneralNewsExtractor()

    def _process_pub_dt(self, pub_date):
        # normalize the various pub_date time formats
        current_dt = datetime.datetime.now()
        yesterday_dt_str = (datetime.datetime.now() -
                            datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        after_yesterday_dt_str = (
            datetime.datetime.now() -
            datetime.timedelta(days=2)).strftime("%Y-%m-%d")
        if "小时前" in pub_date:  # eg. 20小时前
            hours = int(pub_date.replace('小时前', ''))
            pub_date = (
                current_dt -
                datetime.timedelta(hours=hours)).strftime("%Y-%m-%d %H:%M:%S")
        elif "昨天" in pub_date:  # eg. 昨天04:24
            pub_date = pub_date.replace('昨天', '')
            pub_date = " ".join([yesterday_dt_str, pub_date])
        elif '前天' in pub_date:  # eg. 前天11:33
            pub_date = pub_date.replace("前天", '')
            pub_date = " ".join([after_yesterday_dt_str, pub_date])
        else:  # eg. 02-29 04:24
            pub_date = str(current_dt.year) + '-' + pub_date
        # print(pub_date)
        return pub_date

    def _parse_detail(self, link):
        detail_resp = self.get(link)
        if detail_resp:
            body = detail_resp.text
            result = self.extractor.extract(body)
            content = result.get("content")
            return content

    def _parse_list(self, list_url):
        items = []
        list_resp = self.get(list_url)
        if list_resp:
            list_page = list_resp.text
            doc = html.fromstring(list_page)
            news_list = doc.xpath(
                '//div[@class="wrap_left"]/dl[@class="item clearfix"]')
            for news in news_list:
                item = {}
                link = news.xpath('./dd[@class="intro"]/a/@href')[0]
                # print(link)
                item['link'] = link

                title = news.xpath("./dd/a/@title")[0]
                # print(title)
                item['title'] = title

                pub_date = news.xpath("./dd[@class='sort']/text()")[0]
                pub_date = self._process_pub_dt(pub_date)
                # print(pub_date)
                item['pub_date'] = pub_date

                article = self._parse_detail(link)
                if article:
                    article = self._process_content(article)
                    item['article'] = article
                    print(item)
                    items.append(item)
        return items

    def _start(self):

        for page in range(1, self.page + 1):
            print("page >>>", page)
            if page == 1:
                list_url = self.first_url
            else:
                list_url = self.format_url.format(page)
            items = self._parse_list(list_url)
            self.save(items)
Example #24
class BaseSpider(object):
    def __init__(self):
        # request headers
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        # whether we are running locally
        self.local = LOCAL
        # whether to fetch pages via the requests-based JS-challenge flow instead of selenium
        self.use_js = True

        # selenium Chrome configuration
        if not self.use_js:
            if self.local:
                self.browser = webdriver.Chrome()
            else:
                self._check_selenium_status()
                self.browser = webdriver.Remote(
                    command_executor="http://chrome:4444/wd/hub",
                    desired_capabilities=DesiredCapabilities.CHROME)
            self.browser.implicitly_wait(5)
        if self.local:
            conf = {
                "host": LOCAL_MYSQL_HOST,
                "port": LOCAL_MYSQL_PORT,
                "user": LOCAL_MYSQL_USER,
                "password": LOCAL_MYSQL_PASSWORD,
                "db": LOCAL_MYSQL_DB,
            }
            self.db = LOCAL_MYSQL_DB
        else:
            conf = {
                "host": MYSQL_HOST,
                "port": MYSQL_PORT,
                "user": MYSQL_USER,
                "password": MYSQL_PASSWORD,
                "db": MYSQL_DB,
            }
            self.db = MYSQL_DB
        self.sql_client = PyMysqlBase(**conf)
        self.extractor = GeneralNewsExtractor()

        self.error_list = []
        self.error_detail = []

    def _check_selenium_status(self):
        """
        Check the status of the selenium server.
        """
        i = 0
        while True:
            try:
                resp = requests.get("http://chrome:4444/wd/hub/status",
                                    timeout=0.5)
            except:
                i += 1
                if i > 10:
                    raise
            else:
                logger.info(resp.text)
                break

    def __del__(self):
        try:
            self.browser.close()
        except:
            pass

    def fetch_page(self, url, retry=2):
        # retry is threaded through the recursive call so the limit actually applies
        if self.use_js:
            return self.js_get_page(url)

        try:
            self.browser.get(url)
            page = self.browser.page_source
        except Exception:
            retry -= 1
            if retry < 0:
                return
            print('Crawling Failed', url)
            print('try to fetch again')
            time.sleep(3)
            return self.fetch_page(url, retry)
        else:
            return page

    def _get_refer_url(self, body):
        """获取重定向之后的网址"""
        doc = html.fromstring(body)
        script_content = doc.xpath("//script")[0].text_content()
        re_str = r"var(.+?).split"
        ret = re.findall(re_str, script_content)[0]
        # print("正则结果: ", ret)
        ret = ret.lstrip("|(")
        ret = ret.rstrip("')")
        ret_lst = ret.split("|")
        names = ret_lst[0::2]
        params = ret_lst[1::2]
        info = dict(zip(names, params))
        factor = sum([ord(ch) for ch in info.get("wzwsquestion")]) * int(
            info.get("wzwsfactor")) + 0x1b207
        raw = f'WZWS_CONFIRM_PREFIX_LABEL{factor}'
        refer_url = info.get(
            "dynamicurl") + '?wzwschallenge=' + base64.b64encode(
                raw.encode()).decode()
        return "http://www.pbc.gov.cn" + refer_url

    def js_get_page(self, url):
        s = requests.Session()
        h1 = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control':
            'no-cache',
            'Connection':
            'keep-alive',
            'Host':
            'www.pbc.gov.cn',
            'Pragma':
            'no-cache',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        }
        resp1 = s.get(url, headers=h1)
        cookie1 = resp1.headers.get("Set-Cookie").split(";")[0]
        origin_text = resp1.text
        redirect_url = self._get_refer_url(origin_text)
        h1.update({
            'Cookie':
            cookie1,
            'Referer':
            'http://www.pbc.gov.cn/goutongjiaoliu/113456/113469/11040/index1.html',
        })
        resp2 = s.get(redirect_url, headers=h1)
        text = resp2.text.encode("ISO-8859-1").decode("utf-8")
        return text

    def gne_parse_detail(self, page):
        result = self.extractor.extract(page)
        content = result.get("content")
        return content

    def contract_sql(self, to_insert):
        ks = []
        vs = []
        for k in to_insert:
            ks.append(k)
            vs.append(to_insert.get(k))
        fields_str = "(" + ",".join(ks) + ")"
        values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
        base_sql = '''INSERT INTO `{}`.`{}` '''.format(
            self.db,
            self.table) + fields_str + ''' values ''' + values_str + ''';'''
        return base_sql, tuple(vs)

    def _process_item(self, item):
        return item

        # item.update({"article": self._process_content(item.get("article"))})
        # return item

    def _process_content(self, vs):
        # Strip 4-byte UTF-8 characters, otherwise the MySQL insert fails
        try:
            # handling for UCS-4 Python builds
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # handling for UCS-2 Python builds
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')

        params = list()
        for v in vs:
            # clean up the value before inserting
            nv = highpoints.sub(u'', v)
            nv = self._filter_char(nv)
            params.append(nv)
        return "".join(params)

    def _filter_char(self, test_str):
        # strip special whitespace characters
        # '\u200b' is the UTF-8 sequence \xe2\x80\x8b
        for cha in [
                '\n',
                '\r',
                '\t',
                '\u200a',
                '\u200b',
                '\u200c',
                '\u200d',
                '\u200e',
                '\u202a',
                '\u202b',
                '\u202c',
                '\u202d',
                '\u202e',
        ]:
            test_str = test_str.replace(cha, '')
        test_str = test_str.replace(u'\xa0', u' ')  # replace \xa0 with an ordinary space
        return test_str

    def save(self, item):
        to_insert = self._process_item(item)
        insert_sql, values = self.contract_sql(to_insert)

        try:
            ret = self.sql_client.insert(insert_sql, values)
        except pymysql.err.IntegrityError:
            logger.warning("重复", to_insert)
            return 1
        except:
            traceback.print_exc()
        else:
            return ret

    def _get_page_url(self, page_num):
        if self.start_url:
            return self.start_url.format(page_num)
        elif page_num == 1:
            return self.first_url
        else:
            return self.format_url.format(page_num)

    def process_list(self, page_num):
        page_url = self._get_page_url(page_num)
        list_retry = 2
        while True:
            try:
                list_page = self.fetch_page(page_url)
                if list_page:
                    items = self._parse_list_page(list_page)
                else:
                    raise
            except:
                print("list page {} retry ".format(page_num))
                list_retry -= 1
                if list_retry < 0:
                    return
                # self.process_list(page_num)
            else:
                return items

    def process_detail(self, link):
        detail_retry = 2
        while True:
            try:
                detail_page = self.fetch_page(link)
                if detail_page:
                    article = self._parse_detail_page(detail_page)
                else:
                    raise
            except:
                print("detail page {} retry ".format(link))
                detail_retry -= 1
                if detail_retry < 0:
                    return
            else:
                return article

    def _start(self, page_num):
        print("page num is {}\n".format(page_num))
        items = self.process_list(page_num)
        print(items)
        if items:
            for item in items:
                link = item["link"]
                article = self.process_detail(link)
                if article:
                    item['article'] = article
                    # print(item)
                    ret = self.save(item)
                    if not ret:
                        self.error_detail.append(item.get("link"))
                else:
                    self.error_detail.append(link)
        else:
            self.error_list.append(self._get_page_url(page_num))
Example #25
import json
import glob
from gne import GeneralNewsExtractor

if __name__ == '__main__':
    html_list = glob.glob('**/*/*.html', recursive=True)
    for html_file in html_list:
        with open(html_file, encoding='utf-8') as f:
            html = f.read()
        extractor = GeneralNewsExtractor()
        result = extractor.extract(html,
                                   host='https://www.xxx.com',
                                   noise_node_list=[
                                       '//div[@class="comment-list"]',
                                       '//*[@style="display:none"]',
                                   ])
        print(f'>>>>>>>>>>>>>{html_file}>>>>>>>>>>>>>')
        print(json.dumps(result, indent=2, ensure_ascii=False))
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Example #26
def get_data_dict(news_url):
    extractor = GeneralNewsExtractor()
    html = get_response_text(news_url)
    result = extractor.extract(html)
    return result
Example #27
class CN4Hours(SpiderBase):
    # applies to the 上证四小时 (SSE four-hour) feed
    def __init__(self):
        super(CN4Hours, self).__init__()
        self.list_url = "http://app.cnstock.com/api/theme/get_theme_list?"
        self.extractor = GeneralNewsExtractor()
        self.table_name = "cn_stock"
        self.name = '上证四小时'
        self.fields = ['pub_date', 'title', 'link', 'article']

    def _create_table(self):
        self._spider_init()
        sql = '''
        CREATE TABLE  IF NOT EXISTS `{}` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `pub_date` datetime NOT NULL COMMENT '发布时间',
          `title` varchar(64) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章标题',
          `link` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章详情页链接',
          `article` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '详情页内容',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `link` (`link`),
          KEY `pub_date` (`pub_date`),
          KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='上海证券报'; 
        '''.format(self.table_name)
        self.spider_client.insert(sql)
        self.spider_client.end()

    def make_query_params(self, latest_id):
        """
        Build the dynamic request query parameters.
        """
        query_params = {
            'maxid':
            str(0),
            'minid':
            str(latest_id),  # larger ids correspond to newer content
            'size':
            5,
            'callback':
            'jQuery{}_{}'.format(
                ''.join(random.choice(string.digits) for i in range(0, 20)),
                str(int(time.time() * 1000))),
            '_':
            str(int(time.time() * 1000)),
        }
        return query_params

    def get_zhaiyao(self, url):
        try:
            page = req.get(url, headers=self.headers).text
            doc = html.fromstring(page)
            detail_link = doc.xpath("//div[@class='tcbhd-r']//h1/a/@href")[0]
            return detail_link
        except:
            return None

    def get_detail(self, detail_url):
        try:
            page = req.get(detail_url, headers=self.headers).text
            result = self.extractor.extract(page)
            content = result.get("content")
            return content
        except:
            return None

    def get_count(self):
        params = self.make_query_params(0)
        url = self.list_url + urlencode(params)
        ret = req.get(url, headers=self.headers).text
        json_data = re.findall(r'jQuery\d{20}_\d{13}\((\{.*?\})\)', ret)[0]
        py_data = json.loads(json_data)
        count = py_data.get("item")[0].get("order")
        return count + 1

    def get_list(self):
        count = self.get_count()
        print("网页个数: ", count)
        items = []
        for latest_id in range(count, 0, -5):
            params = self.make_query_params(latest_id)
            url = self.list_url + urlencode(params)
            ret = req.get(url, headers=self.headers).text
            logger.info(ret)
            json_data = re.findall(r'jQuery\d{20}_\d{13}\((\{.*?\})\)', ret)[0]
            py_data = json.loads(json_data)
            datas = py_data.get("item")
            if not datas:
                break
            for one in datas:
                item = dict()
                item['pub_date'] = one.get("datetime")
                item['title'] = one.get("title")
                item['zhaiyao'] = 'http://news.cnstock.com/theme,{}.html'.format(one.get("id"))
                items.append(item)
        return items

    def start(self):
        self._create_table()

        self._spider_init()
        items = self.get_list()
        nitems = []
        for item in items:
            zhaiyao_link = item.get('zhaiyao')
            detail_url = self.get_zhaiyao(zhaiyao_link)
            if detail_url:
                item['link'] = detail_url
                item['article'] = self.get_detail(detail_url)
                item.pop("zhaiyao")
                print(item)
                nitems.append(item)

        print("数据量 : ", len(nitems))
        ret = self._batch_save(self.spider_client, nitems, self.table_name,
                               self.fields)
        print("插入个数: ", ret)
Example #28
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: lms
@file: gne.py
@time: 2020/2/27 21:50
@desc:
"""
from gne import GeneralNewsExtractor

# news_url: https://news.163.com/20/0222/19/F610K69R00019B3E.html
html = open('news.html', 'r+', encoding='utf-8').read()

extractor = GeneralNewsExtractor()
result = extractor.extract(html, with_body_html=True)
# res_json = json.dumps(result, ensure_ascii=False)
for k, v in result.items():
    print(k, v)
Example #29
import os
from gne import GeneralNewsExtractor

filename = []
fp = open("/home/ianliu/develope/maintex_Extraction/mytest.txt", "r")
lines = fp.readlines()

for line in lines:
    line = line.split("\n")[0]
    filename.append(line)


extractor = GeneralNewsExtractor()
for f in filename:
    html = open(f, "r")
    print(f)
    result = extractor.extract(html.read())
    '''print(f.split(".")[0].split("/") )'''
    wf = open("/home/ianliu/develope/maintex_Extraction/python/output/" +  f.split(".")[0].split("/")[8]  + ".txt", "w")
    wf.write(result['content'])
    wf.close()
    html.close()
Example #30
class Reference(SpiderBase):
    def __init__(self):
        super(Reference, self).__init__()
        self.index_url = 'http://www.jfinfo.com/reference'
        self.more_url = 'http://www.jfinfo.com/articles_categories/more?page={}&category_id=13'
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie':
            'Hm_lvt_eb1542e7fa53709d9037dcc8c652e026=1583204516; Hm_lpvt_eb1542e7fa53709d9037dcc8c652e026=1583205739; _jfinfo_session=SzdiRTlIeUw5QXdObkRSNG5kUGpVRDNCQld3NGVkcTcrWnVNR3dZdTA4TWxoRVd3VENkQlBTeHcxQkdGaS9nUG9qdDVEeFlqMEI1OFdQMmdYNXJLTyt0YzJjRkRVbEVKa25YOUQvWUl5RjZFTm5WbENuN1JLZ05RSFR4cEVYVW90alhpSGNHSldiYWlZMDNXR0NuK293PT0tLWJwd2UybVpjREltRHB1bUxMdUxBZ2c9PQ%3D%3D--4ef0e46e0b2629bbf61194ceefd60e8b6b398499',
            'Host': 'www.jfinfo.com',
            'Pragma': 'no-cache',
            'Referer': 'http://www.jfinfo.com/reference',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
            'X-CSRF-Token':
            '9Kgn0ZaNQJWoqIht/pDIK9h97D5wuSFQ4gbSV8eB3eeXm3BVKjz7g8kDflyf0G4LssxDAOa0J297e6x5aKPndQ==',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.extractor = GeneralNewsExtractor()
        self.table_name = 'jfinfo'  # 巨丰资讯
        self.fields = ['link', 'title', 'pub_date', 'article']
        self.max_page = 2
        self.name = '巨丰内参'

    def get(self, url):
        return requests.get(url, headers=self.headers)

    def _create_table(self):
        self._spider_init()
        sql = '''
        CREATE TABLE IF NOT EXISTS `{}`(
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `pub_date` datetime NOT NULL COMMENT '发布时间',
          `title` varchar(64) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章标题',
          `link` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '文章详情页链接',
          `article` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '详情页内容',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `link` (`link`),
          KEY `pub_date` (`pub_date`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨丰财经';
        '''.format(self.table_name)
        self.spider_client.insert(sql)
        self.spider_client.end()

    def _parse_detail(self, body):
        result = self.extractor.extract(body)
        content = result.get("content")
        return content

    def _parse_index(self, index_page):
        doc = html.fromstring(index_page)
        news_list = doc.xpath("//div[@class='m-contentl left']//dl")
        items = []
        for news in news_list:
            item = {}
            title = news.xpath(".//a[@class='f20']/text()")[0]
            item['title'] = title
            link = news.xpath(".//a[@class='f20']/@href")[0]
            item['link'] = link

            _year = None
            try:
                _year = re.findall(r"news/(\d+)/", link)[0][:4]  # 20161218
            except:
                pass

            pub_date = news.xpath(".//dd/span/text()")[0]
            pub_date = self._process_pub_dt(pub_date, _year)
            item['pub_date'] = pub_date
            detail_resp = self.get(link)
            if detail_resp:
                detail_page = detail_resp.text
                article = self._parse_detail(detail_page)
                item['article'] = article
                items.append(item)
                # print(item)
        return items

    def _parse_more(self, more_page):
        '''
        if(!$("#bottom_load_error").data("block")){
            $("#page_num").val("2");
            $(".m-newsYaow").append('<div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2765134\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3153843/cfb2d9fa-0cbc-4a0e-99cb-be9232aad421.jpg\" alt=\"Cfb2d9fa 0cbc 4a0e 99cb be9232aad421\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2765134\">巨·个股 | 市场规模940亿!两大核心龙头已被保险巨头看中,互联网医疗大机遇<\/a>\n      <p class=\"cGray\">字字真言的选股、操作理念;价值千金的市场热点、龙头捕捉技法!应有尽有。学会炒股、洞悉热点。捕捉龙头,一份“巨·个股”就够了!导读:互联网医疗行业景气度大大提升,产业链哪些个股受益,还有没有参与的...<\/p>\n    <\/dt>\n    <dd><i>巨丰日刊<\/i><span>昨天15:30<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2764986\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3153695/b8bb5b8d-fc85-4c43-b607-a11c3960bd04.jpg\" alt=\"B8bb5b8d fc85 4c43 b607 a11c3960bd04\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2764986\">巨丰数据赢|百强席位操盘手法解密(一百四十七):同样是打板 有些席位就要回避<\/a>\n      <p class=\"cGray\">【巨丰投顾】大数据研发团队依托四大数据库,针对市场百强龙虎榜营业部全面进行数据回测分析,帮助分析师、投资顾问、专业投资者解析各路一线游资操盘手法,助您看清对手“底牌”,数据回测告诉我们,一旦一线...<\/p>\n    <\/dt>\n    <dd><i>百强席位<\/i><span>昨天15:19<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2764983\"><img src=\"https://asset3.tougub.com/assets/default/article/cover/77-7c3d31472851be786c76384e4d30292eb77133f949a42fc573c212f8901240fc.jpg\" alt=\"77\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2764983\">巨·财经 | A股“宏观因子指标”探明底部?<\/a>\n      <p class=\"cGray\">追踪最新鲜的财经事件,探寻热点背后的投资机会。巨丰投顾“巨·财经”为您专业解读财经事件背后的投资秘密。导读:2月财新中国制造业PMI降至40.3,为有数据以来最低,A股“宏观因子指标”探明底部?...<\/p>\n    <\/dt>\n    <dd><i>巨丰日刊<\/i><span>昨天15:00<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2764979\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3153686/91349942-0a8b-48be-8eb0-f0efd797dc1b.png\" alt=\"91349942 0a8b 48be 8eb0 f0efd797dc1b\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2764979\">头号研报:中长期贷款增加 大基建投资提速带来结构性机会<\/a>\n      <p class=\"cGray\">“研报也要做头号”——巨丰投顾最新栏目“头号研报”正式上线。在众多研报之中,我们通过层层对比和筛选分析,经过提炼、加工,每周精选3-6篇有质量的文章,以“带你读研报”为目的,力争通过研报学习,挖掘市场投资机。\n<\/p>\n    <\/dt>\n    <dd><i>头号研报<\/i><span>昨天13:26<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2764261\"><img src=\"https://asset3.tougub.com/assets/default/article/cover/12-501b4dfb44d6a1942b6c9ea4df1253500c9b3b3f9f58291e4821d791a3464af8.jpg\" alt=\"12\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2764261\">巨丰投顾独家解读:非洲猪瘟疫苗创制成功<\/a>\n      <p class=\"cGray\">事件:中国农业科学院哈尔滨兽医研究所在《中国科学:生命科学》英文版在线发表研究论文,报道了一株人工缺失七个基因的非洲猪瘟弱毒活疫苗对家猪具有良好的安全性和有效性。巨丰投顾指出,非洲猪瘟疫苗创制成...<\/p>\n    <\/dt>\n    <dd><i>独家解读<\/i><span>昨天10:58<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2764260\"><img src=\"https://asset2.tougub.com/assets/default/article/cover/63-d2a497ff78220446101ea781908b645df896ec412cae8f785a4a517c5dd5cd1e.jpg\" alt=\"63\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" 
href=\"https://www.jfinfo.com/news/20200302/2764260\">巨丰投顾独家解读:沪指涨逾2% 水泥等板块指数涨逾5%<\/a>\n      <p class=\"cGray\">事件:沪指涨逾2%,创业板指涨近1.2%,水泥、医废处理、建筑装饰等板块指数涨逾5%。巨丰投顾指出,基建板块大涨主要源于疫情对经济冲击之下基建稳增长预期。最新PMI数据显示2月财新制造业PMI降...<\/p>\n    <\/dt>\n    <dd><i>独家解读<\/i><span>昨天10:55<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763710\"><img src=\"https://asset3.tougub.com/assets/default/article/cover/12-501b4dfb44d6a1942b6c9ea4df1253500c9b3b3f9f58291e4821d791a3464af8.jpg\" alt=\"12\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763710\">巨丰投顾独家解读:2月财新中国制造业PMI40.3 为有数据以来最低 <\/a>\n      <p class=\"cGray\">事件:2月财新中国制造业PMI降至40.3 为有数据以来最低。巨丰投顾指出,各行业受影响程度存在差异,保证民生需求的农副食品加工、食品及酒饮料精制茶等行业PMI明显高于制造业整体水平。为减少疫情...<\/p>\n    <\/dt>\n    <dd><i>独家解读<\/i><span>昨天10:15<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763709\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3152414/e1fa1fb0-8f1e-401b-96a6-4ccff235a1f8.jpg\" alt=\"E1fa1fb0 8f1e 401b 96a6 4ccff235a1f8\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763709\">外盘头条:国际油价跌跌不休?得看OPEC出不出手<\/a>\n      <p class=\"cGray\">全球财经媒体周末共同关注的头条新闻主要有:1、特朗普政府考虑减税措施 施压美联储增加降息可能2、央行研究人员:全球经济“V”型复苏看起来“非常不现实”3、美银美林:衰退忧虑升级 投资者逃离股市转...<\/p>\n    <\/dt>\n    <dd><i>海外观察<\/i><span>昨天09:46<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763462\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3152166/884186f3-5c1d-47ae-af01-15ff367b365d.jpg\" alt=\"884186f3 5c1d 47ae af01 15ff367b365d\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763462\">大参考:美股半导体板块引领市场 全球半导体出货有望再超1万亿件<\/a>\n      <p class=\"cGray\">今日导读1、新证券法3月1日起正式实施。3月1日起公司债券公开发行实行注册制,沪深交易所表示,加快制定公司债券实施注册制配套规则。2、截至北京时间3月1日20时,中国以外共61个国家和地区报告新...<\/p>\n    <\/dt>\n    <dd><i>大参考<\/i><span>昨天09:21<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763461\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3152165/7448a268-08b4-494e-9de2-dc579c2d48a5.jpg\" alt=\"7448a268 08b4 494e 9de2 dc579c2d48a5\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763461\">巨丰数据赢|北上资金持仓曝光 上周主力加仓个股出炉<\/a>\n      <p class=\"cGray\">北上资金增持个股数量上周五小幅回升。其中上海主板个股被北上资金减持占比最大。从板块方面看,建筑装饰和农业板块被资金小幅增持。而轻工制造和家电板块被资金减持力度较为大,需注意。 从上一周数据统计看...<\/p>\n    <\/dt>\n    <dd><i>巨丰数据赢<\/i><span>昨天09:19<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763458\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3152161/d9d8aaf9-450b-40be-a450-5904578f2b58.jpg\" alt=\"D9d8aaf9 450b 40be a450 5904578f2b58\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763458\">巨丰数据赢|北上资金上周流出 主力却逆市买入这些股<\/a>\n      <p class=\"cGray\">北上资金上周五流出24.26亿元(沪深股通使用额度),昨日成交净额(买入额-卖出额)约为-51.37亿元,继续以流出为主。从沪深港股通十大活跃股表现看,被北上资金买入金额靠前的个股主要有闻泰科技...<\/p>\n    <\/dt>\n    <dd><i>巨丰数据赢<\/i><span>昨天09:14<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a 
href=\"https://www.jfinfo.com/news/20200302/2763453\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3152155/a77bf6ac-3537-42e2-b439-4e635fb20bb0.jpg\" alt=\"A77bf6ac 3537 42e2 b439 4e635fb20bb0\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763453\">金股预测早间版:8股有望开启估值修复<\/a>\n      <p class=\"cGray\">根据A股前一交易日的市场表现,以及沪深交易所的公告信息,财务报表以及市场热点等多方面内容,巨丰投顾甄选出近期市场强势热门股,以供投资者参考。公告掘金1、三夫户外(002780):子公司收购得清纳...<\/p>\n    <\/dt>\n    <dd><i>金股早间版<\/i><span>昨天08:38<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200302/2763233\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3151933/a2ca6466-ebe7-4b02-a6bc-eb6bc3893b5b.jpg\" alt=\"A2ca6466 ebe7 4b02 a6bc eb6bc3893b5b\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200302/2763233\">巨丰早参:新证券法实施首日 “债券注册制”落地<\/a>\n      <p class=\"cGray\">巨丰今日策略巨丰投顾认为经过连续上涨,创业板春节后已经走出一波技术性牛市。政策利好以及流动性是推动市场不断上涨的主要因素。市场连续上涨后积累了大量的获利筹码,尤其是创业板短线涨幅接近30%,有调...<\/p>\n    <\/dt>\n    <dd><i>巨丰早参<\/i><span>昨天07:35<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200301/2762886\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3151586/ce4a8769-37e4-4693-ae91-c8528732b257.jpg\" alt=\"Ce4a8769 37e4 4693 ae91 c8528732b257\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200301/2762886\">财闻点金:多地加快互联网医院建设 供需两旺推动行业爆发<\/a>\n      <p class=\"cGray\">要闻精选1.国务院办公厅近日发布通知,明确在不同市场和板块分步骤实施股票公开发行注册制。相关板块/市场注册制改革正式落地前,仍继续实施核准制。2.3月1日,发改委、证监会、沪深交易所等部门均发布...<\/p>\n    <\/dt>\n    <dd><i>财闻点金<\/i><span>03-01 22:15<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200301/2762681\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3151381/8aba9c10-3af6-4374-b968-d7b1af196536.jpeg\" alt=\"8aba9c10 3af6 4374 b968 d7b1af196536\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200301/2762681\">3月1日晚间上市公司十大重磅公告<\/a>\n      <p class=\"cGray\">3月1日晚间,沪深两市多家上市公司发布重要公告:晨光生物回购资金总额上调为不低于2亿元且不超4亿元;格力电器拟注册发行债务融资工具 额度合计不超180亿元;傲农生物2月生猪销售量7.34万头 销售量环比增长77.04%。\n\n<\/p>\n    <\/dt>\n    <dd><i>晚间十大公告<\/i><span>03-01 19:03<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200229/2761494\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3150192/a3f8aee7-2933-4021-8b88-7a07aa82cf47.jpg\" alt=\"A3f8aee7 2933 4021 8b88 7a07aa82cf47\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200229/2761494\">巨丰访谈:外围市场重创 下周谁将成为人气王?<\/a>\n      <p class=\"cGray\">本周外围市场重创,A股冲高回落,而市场交投热情爆表,连续8天日成交额超万亿,下周3月开局会如何演绎?央行副行长表示对普惠金融服务达标的银行择机定向降准,定向降准对市场将产生何种影响?.........<\/p>\n    <\/dt>\n    <dd><i>巨丰访谈<\/i><span>02-29 09:04<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200228/2760430\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3149128/4f1c3c57-79d0-44bf-b473-5f7a1848ae4a.png\" alt=\"4f1c3c57 79d0 44bf b473 5f7a1848ae4a\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" 
href=\"https://www.jfinfo.com/news/20200228/2760430\">云丰晚报:中国经济总量逼近100万亿大关<\/a>\n      <p class=\"cGray\"><\/p>\n    <\/dt>\n    <dd><i>港股资讯<\/i><span>02-28 20:01<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200228/2760428\"><img src=\"https://asset2.tougub.com/assets/default/article/cover/51-3270b5ba9d90b46ff3c1df0e21b0cf4ca7c6e9e9a5c32580e39725a8b691bd73.jpg\" alt=\"51\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200228/2760428\">2月28日晚间上市公司十大重磅公告<\/a>\n      <p class=\"cGray\">2月28日晚间,沪深两市多家上市公司发布重要公告:万集科技受益ETC建设,2019年净利同比增逾125倍;音飞储存控股股东筹划股权转让事项,或导致公司控制权变更;新宙邦2月11日起陆续复工,目前...<\/p>\n    <\/dt>\n    <dd><i>晚间十大公告<\/i><span>02-28 19:21<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200228/2760426\"><img src=\"https://asset2.tougub.com/assets/default/article/cover/41-f71e7e8f2419e4a1b6634dfcbbd11c4b654f914a3ffa78d616bf0f48bc140e5c.jpg\" alt=\"41\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200228/2760426\">新的导火索出现!布油跌破50美元关口,欧股暴跌逾4%<\/a>\n      <p class=\"cGray\">周五午后,欧洲股市、贵金属和原油加速下跌。全球疫情升级、土俄方面的新消息都在加剧市场的恐慌情绪。CBOE恐慌指数VIX日内涨幅逾20%,与欧债危机时期相当,已高出2015和2018年抛售期间的高...<\/p>\n    <\/dt>\n    <dd><i>海外观察<\/i><span>02-28 18:15<\/span><\/dd>\n  <\/dl>\n<\/div><div class=\"slide\">\n  <div class=\"img\"><a href=\"https://www.jfinfo.com/news/20200228/2760425\"><img src=\"https://jfinfo.oss-cn-beijing.aliyuncs.com/file/upload/article/cover/3149123/5a8ca9e4-d530-4539-b261-e92c627ccf7c.png\" alt=\"5a8ca9e4 d530 4539 b261 e92c627ccf7c\" /><\/a><\/div>\n  <dl>\n    <dt><a class=\"f20\" target=\"_blank\" href=\"https://www.jfinfo.com/news/20200228/2760425\">金股预测晚间版:​南卫股份等3股后市备受关注<\/a>\n      <p class=\"cGray\">南卫股份(603880) 技术突破 ★★★投资要点:江苏南方卫材医药股份有限公司主要从事透皮产品、医用胶布胶带及绷带、运动保护产品、急救包、护理产品等产品的研发、生产和销售。目前已形成创可贴、贴...<\/p>\n    <\/dt>\n    <dd><i>金股晚间版<\/i><span>02-28 17:57<\/span><\/dd>\n  <\/dl>\n<\/div>');
            $("#bottom_load").data("value", true);
        }
        '''
        append_datas = eval(re.findall(r"append\((.*?)\);", more_page)[0])
        doc = html.fromstring(append_datas)
        news_list = doc.xpath(".//div[@class='slide']")
        items = []
        for news in news_list:
            item = {}
            title = news.xpath(".//a[@class='f20']/text()")[0].strip()
            item['title'] = title
            link = news.xpath(".//a[@class='f20']/@href")[0]
            item['link'] = link
            # derive the year from the link based on its URL pattern
            _year = None
            try:
                _year = re.findall(r"news/(\d+)/", link)[0][:4]  # 20161218
            except:
                pass
            pub_date = news.xpath(".//span/text()")[0].strip()
            pub_date = self._process_pub_dt(pub_date, _year)
            item['pub_date'] = pub_date
            detail_resp = self.get(link)
            if detail_resp:
                detail_page = detail_resp.text
                article = self._parse_detail(detail_page)
                item['article'] = article
                items.append(item)
                # print(item)
        return items

    def start(self):
        self._spider_init()
        self._create_table()
        index_resp = self.get(self.index_url)
        if index_resp and index_resp.status_code == 200:
            index_page = index_resp.text
            index_items = self._parse_index(index_page)
            page_save_num = self._batch_save(self.spider_client, index_items,
                                             self.table_name, self.fields)
            logger.info(f"首页入库的个数是 {page_save_num}")

        for num in range(1, self.max_page + 1):
            more_url = self.more_url.format(num)
            more_resp = self.get(more_url)
            if more_resp and more_resp.status_code == 200:
                more_page = more_resp.text
                items = self._parse_more(more_page)
                page_save_num = self._batch_save(self.spider_client, items,
                                                 self.table_name, self.fields)
                logger.info(f"当前页 {num} 入库的个数是 {page_save_num}")