Example #1
    def get_content(self, url):
        '''
        Fetch the detail page of a single news item
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="union"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip()
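
A note on the XPath above: descendant-or-self::*/text() gathers the text nodes of the div and of every element nested inside it. The conventional shorthand for the same node set is the // abbreviation:

    con_list = response.xpath('//div[@class="union"]//text()')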
Example #2
    def parser_data(self, url):
        '''
        Fetch the detail page of a single news item
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1
            return 'timeout', 'timeout'

        if html.status_code != 200:
            return 'error', 'error'

        response = etree.HTML(html.text)

        con_list = response.xpath(
            '//div[@class="ldContent"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        date = response.xpath('//div[@class="ldDate"]/text()')[0]
        # Split on the first colon only, in case the value itself contains one
        date = date.split(':', 1)[1]
        return date, content
Example #3
    def get_newsinfo(self, url):
        '''
        Fetch the detail page of a single news item
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log_line('Response status is not 200')
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'bjjrj'
        return news
Example #4
    def get_lunbo(self):
        '''
        Finance section
        :return:
        '''
        url = 'http://www.news.cn/fortune/'
        html = requests.get(url, headers=self.get_caijing_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="swiper-slide"]/a/@href')

        year = arrow.now().date().year

        news_list = []

        for url in urls:
            if str(year) in url:
                log('Carousel URL to visit', url)
                find_one = self.mgr.find_one('url', url)
                if find_one is not None:
                    log_line('URL already exists, no need to request')
                    log(url)
                    continue
                news = self.get_iteminfo(url)
                if news in ('timeout', 'error'):
                    continue
                news_list.append(news)
        return news_list
Example #5
    @classmethod
    def re_send(cls):

        if cls.retry != -1 and cls.retry_flag == -1:
            log_line('Some news requests failed; visiting them again')
            log('Spider class being re-run', cls)
            cls.retry_flag = 1
            cls().run()
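
re_send relies on class-level retry and retry_flag attributes that this snippet does not show. A minimal sketch of how the spider classes presumably declare them (the BaseSpider name and the defaults are inferred from the -1 checks above):

    class BaseSpider(object):
        # -1 means 'no failed requests yet' / 'not retried yet'.
        # A failed request sets retry = 1 (see the except blocks in the
        # other examples); re_send flips retry_flag so the crawl is
        # re-run at most once.
        retry = -1
        retry_flag = -1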
Example #6
    def get_content(self, url):
        '''
        Fetch the detail page of a single news item
        :param url:
        :return:
        '''
        t_sleep()
        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)

        return self.parse_item(response)
Example #7
    def get_iteminfo(self, url):
        '''
        Visit a single news detail page
        :param url: news link
        :return: News model
        '''
        t_sleep()

        log('Currently visiting URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)

        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'xinhua'
        return news
Example #8
    def get_itemlist(self, page='1'):
        '''
        Fetch all Xinhua Finance news details
        :return: list of News models
        '''

        # Xinhua Finance - news list
        url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)

        html = requests.get(url, headers=self.get_newlist_header())
        items = json.loads(html.text[1:-1])
        items = items['data']['list']

        news_list = []

        for item in items:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', item['LinkUrl'])
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(item['LinkUrl'])
                continue

            news = self.get_iteminfo(item['LinkUrl'])
            if news in ('timeout', 'error'):
                continue
            news_list.append(news)
        return news_list
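
A note on the json.loads(html.text[1:-1]) line above: the qc.wa.news.cn list endpoint apparently wraps its JSON payload in a one-character envelope on each side (a JSONP-style pair of parentheses), so the slice strips the wrapper before parsing. A slightly more defensive unwrapping, assuming that envelope format:

    import json

    def unwrap_json(text):
        # Strip a '(...)' envelope if present, then parse the JSON body.
        text = text.strip()
        if text.startswith('(') and text.endswith(')'):
            text = text[1:-1]
        return json.loads(text)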
Example #9
    def get_newsinfo(self, urls):
        '''
        Visit each news detail page
        :param urls: collection of news links
        :return: None; parsed items are inserted into MongoDB
        '''
        for url in urls:
            t_sleep()
            log('Currently visiting URL', url)

            try:
                html = requests.get(url, timeout=3)
                html.encoding = 'utf-8'
            except Exception as e:
                log_line('Request failed')
                print(e)
                self.__class__.retry = 1

                continue

            if html.status_code != 200:
                continue

            response = etree.HTML(html.text)

            item = self.parse_item(response, html.url)
            MogoMgr().insert(item)
Example #10
File: gzjrj.py Project: NickLeeCoder/ttj
    def get_newsinfo(self, url):
        '''
        Fetch the detail page of a single news item
        :param url:
        :return:
        '''
        t_sleep()

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('Request failed')
            print(e)
            return 'timeout'

        response = etree.HTML(html.text)
        log('Currently visiting URL', url)

        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        return news
Example #11
File: gzjrj.py Project: NickLeeCoder/ttj
    def parser_url(self, urls):
        base_url = 'http://www.gzjr.gov.cn/'
        new_urls = []
        for url in urls:
            if str(url).endswith('.pdf'):
                continue

            url = base_url + url.split('../../')[1]
            log('Joined URL', url)
            new_urls.append(url)
        return new_urls
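
The manual splicing above only handles links of the exact '../../' shape. The standard library's urllib.parse.urljoin resolves relative links of any depth; a minimal sketch (the page URL used as the base is an assumption):

    from urllib.parse import urljoin

    def parser_url(self, urls, page_url='http://www.gzjr.gov.cn/a/b/index.html'):
        # urljoin resolves '../../x.html', './x.html', absolute paths, etc.
        # against the page the links were scraped from.
        return [urljoin(page_url, url) for url in urls
                if not str(url).endswith('.pdf')]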
Example #12
File: gzjrj.py Project: NickLeeCoder/ttj
    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue
            news = self.get_newsinfo(url)
            # get_newsinfo returns 'timeout' on failure; don't collect it
            if news == 'timeout':
                continue
            news_list.append(news)
        return news_list
Example #13
File: gzjrj.py Project: NickLeeCoder/ttj
    def get_html(self, url):
        '''
        Fetch the list page and extract the news links
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="mainContent"]/ul/li/a/@href')
        log('Extracted URLs', urls)

        return self.parser_url(urls)
Example #14
    def send_request(self, urls):

        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue
            date, content = self.parser_data(url)
            if content in ('error', 'timeout'):
                continue
            self.update_news(url, content, date)
Example #15
    def send_request(self, urls):

        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue
            content = self.get_content(url)
            if content in ('error', 'timeout'):
                continue
            self.update_content(url, content)
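
The 'skip if the URL is already stored' guard is repeated verbatim in most send_request variants above. A small helper on the shared spider base would remove the duplication; a sketch, assuming every spider exposes the same self.mgr interface:

    def seen(self, url):
        # True if the URL is already in MongoDB; log it and let the caller skip.
        if self.mgr.find_one('url', url) is not None:
            log_line('URL already exists, no need to request')
            log(url)
            return True
        return False

Each loop body then collapses to: if self.seen(url): continue.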
Example #16
    def run(self):
        log_line('CbrcSpider started!!!')

        urls = self.get_html(self.start_url)
        self.send_request(urls)

        for news in self.newslist:
            find_one = self.mgr.find_one('url', news.url)
            if find_one is not None:
                log_line('URL already exists, no need to insert')
                log(news.url)
                continue
            self.mgr.insert(news)

        self.__class__.re_send()
Example #17
    def send_request(self, urls):
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue

            content = self.get_content(url)
            if content in ('timeout', 'error'):
                continue

            for news in self.newslist:
                if news.url == url:
                    news.content = content
Example #18
    def get_newslist(self):
        '''
        Get all news links from the home page
        :return: collection of news links
        '''

        url = 'http://www.21jingji.com/'

        html = requests.get(url)
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)
        news_list = html.xpath('//a[@class="listTit"]/@href')

        log(len(news_list))

        return news_list
Example #19
File: circ.py Project: NickLeeCoder/ttj
    def run(self):

        log_line('CircSpider started!!!')

        for url in self.start_urls:
            self.get_html(url)
            self.send_request(self.get_newsUrls())

            for news in self.newslist:
                find_one = self.mgr.find_one('url', news.url)
                if find_one is not None:
                    log_line('URL already exists, no need to insert')
                    log(news.url)
                    continue
                self.mgr.insert(news)

        self.__class__.re_send()
Example #20
File: pbc.py Project: NickLeeCoder/ttj
    def get_newsinfo(self, url, parser_item_fuc):
        '''
        Fetch the detail page of a single news item
        '''
        t_sleep()

        log('Currently visiting URL', url)

        html = self.get_html(url)
        if html == 'timeout':
            return 'timeout'

        response = etree.HTML(html.text)
        log('Response status for URL', url, html.status_code)

        if html.status_code != 200:
            log('Error visiting URL!!!', url)
            return 'error'

        title, date, content = parser_item_fuc(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'pbc'
        return news
Example #21
    def get_money(self):
        '''
        Money section
        :return:
        '''
        url = 'http://www.xinhuanet.com/money/index.htm'
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'

        html = etree.HTML(html.text)

        urls_all = []

        urls_1 = html.xpath('//li[@class="clearfix"]/h3/a/@href')

        # Only process the news lists
        urls_2 = html.xpath('//li[@class="imp"]/a/@href')
        urls_3 = html.xpath('//div[@class="swiper-slide"]/a/@href')

        urls_all.extend(urls_1)
        urls_all.extend(urls_2)
        urls_all.extend(urls_3)

        news_list = []

        for url in urls_all:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue

            news = self.get_iteminfo(url)
            if news in ('timeout', 'error'):
                continue
            news_list.append(news)
        return news_list
Example #22
File: pbc.py Project: NickLeeCoder/ttj
    def send_request(self, urls, parser_item_fuc):
        '''
        Request each individual news link
        :param urls: news detail URLs
        :param parser_item_fuc: function used to parse each news detail page
        :return: list of parsed News objects
        '''
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue

            news = self.get_newsinfo(url, parser_item_fuc)

            if news in ('error', 'timeout'):
                continue

            news_list.append(news)
        return news_list
Example #23
    def __init__(self, model, input_shape, train_data_path=None):
        # model is the user-defined network model; it should be an MXNet nn.Block subclass
        # Model initialization
        self.__net = model
        self.input_shape = input_shape
        self.__ctx = Tools.utils.try_all_gpus()
        self.__random_init_model()
        # Local gradient bookkeeping
        self.local_gradient = {"weight": [], "bias": []}
        self.__init_gradient_list()
        # Path to local training data
        self.train_data_path = train_data_path
        # Logger
        self.log = log(path_base + "\\Fed_Client\\log")
        print("-Client Data Handler initialization complete-")
        print(self.__net)
Example #24
    def insert(self, item):
        item = item.__dict__
        if has_keywords(item):
            item['show_sended'] = '1'
            log('Contains sensitive keywords')

        log('Inserting data')
        try:
            self.sheet.insert(item)
        except DuplicateKeyError:
            log('Duplicate data, skipping insert', item['url'])
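
DuplicateKeyError in insert is only raised when the collection carries a unique index on url. A one-time setup sketch with pymongo (the database and collection names are assumptions):

    from pymongo import MongoClient

    client = MongoClient()
    sheet = client['news']['news']  # assumed database/collection names
    # Without this unique index, duplicate URLs insert silently instead of
    # raising DuplicateKeyError.
    sheet.create_index('url', unique=True)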
Example #25
    def __init__(self, model, input_shape, train_data):
        # Network communication
        with open(path_base + "\\Fed_Client\\client_config.json", 'r') as f:
            json_data = json.load(f)
        self.server_addr = (json_data['server_ip'], json_data['server_port'])
        self.recv_model_savepath = json_data[
            'default_path']  # recv_model.params
        self.client_sock = socket.socket()
        # Model handler
        self.data_handler = Client_data_handler(model,
                                                input_shape=input_shape,
                                                train_data_path=train_data)
        # Training mode, synced from the server
        self.train_mode = ""
        self.learning_rate = None
        self.batch_size = None
        self.epoch = None
        self.__param_sync()  # Sync parameters
        # Logger
        self.log = log(path_base + "\\Fed_Client\\log")
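
Both federated-learning snippets hard-code Windows-style '\\' separators into their paths. os.path.join builds the same paths portably; a minimal sketch, reusing the snippet's path_base:

    import os

    log_dir = os.path.join(path_base, 'Fed_Client', 'log')
    config_path = os.path.join(path_base, 'Fed_Client', 'client_config.json')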
Example #26
File: zqrb.py Project: NickLeeCoder/ttj
    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Avoid duplicate requests
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, no need to request')
                log(url)
                continue

            news = self.get_newsinfo(url)

            if news == 'error':
                log('News item not found; moving on to the next URL')
                continue
            if news == 'timeout':
                log('News request timed out; skipping for now')
                continue

            news_list.append(news)
        return news_list
Example #27
    def parser_url(self, url):
        log(url)
        return 'http://www.cs.com.cn' + url[1:]
Example #28
File: run.py Project: NickLeeCoder/ttj
        # szjrj.SzJrjSpider().run,
        # xinhua.XinHuaSpider().run,
        # zqrb.ZqrbSpider().run,
    ]

    start = time.time()

    threads = []
    for index, target in enumerate(targets):
        t = threading.Thread(target=target)
        threads.append(t)
        t.daemon = True
        t.start()

    for t in threads:
        t.join()

    # Send email
    log('Preparing to send email')
    manager = EmailManager(Setting.SERVER_USER, Setting.SERVER_PASSWORD)
    manager.send(manager.get_emails())
    log('Email sent')
    cost = time.time() - start
    log('Time elapsed', cost)
    '''
    The email body still needs:
    the site each news item came from
    the news keywords
    '''
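
A side note on the threading in Example #28: the threads are marked as daemons and then immediately joined, which makes the daemon flag redundant. The same fan-out-and-wait reads more directly with concurrent.futures; a sketch over the same targets list:

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=len(targets) or 1) as pool:
        futures = [pool.submit(target) for target in targets]
        for future in futures:
            future.result()  # re-raises any exception from a spider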