Exemplo n.º 1
0
    def get_newsinfo(self, url):
        '''
        Request a single news detail page and build a ``News`` record from it.

        :param url: address of the news detail page
        :return: a ``News`` object whose ``spider_name`` is ``'bjjrj'``;
            the string ``'timeout'`` if the request raised an exception;
            the string ``'error'`` if the HTTP status code is not 200
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            # Recorded on the class, not the instance; presumably read by the
            # caller to decide whether to retry -- TODO confirm.
            self.__class__.retry = 1

            return 'timeout'

        if html.status_code != 200:
            log_line('请求状态不是200')
            return 'error'

        response = etree.HTML(html.text)

        # Parse the page exactly once. The previous version called
        # parse_item twice and threw the first result away.
        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'bjjrj'
        return news
Exemplo n.º 2
0
    def parser_data(self, url):
        '''
        Request a single news detail page and extract its date and body text.

        :param url: address of the news detail page
        :return: ``(date, content)`` on success;
            ``('timeout', 'timeout')`` if the request raised an exception;
            ``('error', 'error')`` if the HTTP status code is not 200 or the
            date could not be extracted from the page
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            # Recorded on the class, not the instance; presumably read by the
            # caller to decide whether to retry -- TODO confirm.
            self.__class__.retry = 1
            return 'timeout', 'timeout'

        if html.status_code != 200:
            return 'error', 'error'

        response = etree.HTML(html.text)

        # Body text: every text node under the content container, joined.
        con_list = response.xpath(
            '//div[@class="ldContent"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()

        # The date text is expected to look like "<label>:<date>"; the date is
        # the piece right after the first colon. A page without the date node,
        # or without a colon in it, used to raise IndexError here -- report it
        # through the existing 'error' sentinel instead of crashing.
        date_nodes = response.xpath('//div[@class="ldDate"]/text()')
        date_parts = date_nodes[0].split(':') if date_nodes else []
        if len(date_parts) < 2:
            log('日期解析失败', url)
            return 'error', 'error'

        return date_parts[1], content
Exemplo n.º 3
0
    def get_newsinfo(self, urls):
        '''
        Visit every news detail page in ``urls`` and store the parsed result.

        A URL whose request raises, or whose HTTP status code is not 200, is
        skipped. Each successfully fetched page is parsed with ``parse_item``
        and the result is passed to ``MogoMgr().insert``.

        :param urls: iterable of news detail page addresses
        :return: None
        '''
        for url in urls:
            t_sleep()
            log('当前访问的URL', url)

            try:
                resp = requests.get(url, timeout=3)
                resp.encoding = 'utf-8'
            except Exception as exc:
                log_line('访问出错')
                print(exc)
                # Class-level flag; move on to the next URL.
                self.__class__.retry = 1
            else:
                if resp.status_code == 200:
                    tree = etree.HTML(resp.text)
                    # resp.url is the final URL reported by the response.
                    record = self.parse_item(tree, resp.url)
                    MogoMgr().insert(record)
Exemplo n.º 4
0
    def get_newsinfo(self, url, parser_item_fuc):
        '''
        Request a single news detail page and build a ``News`` record from it.

        :param url: address of the news detail page
        :param parser_item_fuc: callable that takes the parsed HTML tree and
            returns a ``(title, date, content)`` triple
        :return: a ``News`` object whose ``spider_name`` is ``'pbc'``, or the
            string ``'error'`` if ``get_html`` reported ``'timeout'`` or the
            HTTP status code is not 200
        '''
        t_sleep()

        log('当前访问的URL', url)

        html = self.get_html(url)
        if html == 'timeout':
            return 'error'

        log('当前访问的URL', url, html.status_code)

        # Validate the status before parsing: the body of a failed response
        # is never used, so there is no point building a tree from it (and an
        # error page cannot make the parser fail before the error is logged).
        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)

        title, date, content = parser_item_fuc(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'pbc'
        return news
Exemplo n.º 5
0
    def get_iteminfo(self, url):
        '''
        Fetch a single news detail page and build a ``News`` record from it.

        :param url: address of the news detail page
        :return: a ``News`` object whose ``spider_name`` is ``'xinhua'``;
            the string ``'timeout'`` if the request raised an exception;
            the string ``'error'`` if the HTTP status code is not 200
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            page = requests.get(url, headers=self.get_news_header(), timeout=3)
            page.encoding = 'utf-8'
        except Exception as exc:
            log_line('访问出错')
            print(exc)
            return 'timeout'

        if page.status_code == 200:
            tree = etree.HTML(page.text)
            title, date, content = self.parse_item(tree)

            item = News(title=title, date=date, content=content, url=url)
            item.spider_name = 'xinhua'
            return item

        # Any non-200 answer is reported through the 'error' sentinel.
        log('访问的URL出错!!!', url)
        return 'error'
Exemplo n.º 6
0
    def get_content(self, url):
        '''
        Fetch a single news detail page and return its body text.

        :param url: address of the news detail page
        :return: the stripped concatenation of every text node found under
            ``div[@class="union"]``; the string ``'timeout'`` if the request
            raised an exception; the string ``'error'`` if the HTTP status
            code is not 200
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            page = requests.get(url, headers=self.get_news_header(), timeout=3)
            page.encoding = 'utf-8'
        except Exception as exc:
            log_line('访问出错')
            print(exc)
            # Class-level flag set whenever the request fails.
            self.__class__.retry = 1
            return 'timeout'

        if page.status_code == 200:
            tree = etree.HTML(page.text)
            fragments = tree.xpath(
                '//div[@class="union"]/descendant-or-self::*/text()')
            return ''.join(fragments).strip()

        log('访问的URL出错!!!', url)
        return 'error'
Exemplo n.º 7
0
    def get_content(self, url):
        '''
        Fetch a single news detail page and return what ``parse_item`` extracts.

        :param url: address of the news detail page
        :return: the result of ``self.parse_item`` applied to the parsed HTML
            tree; the string ``'timeout'`` if the request raised an exception;
            the string ``'error'`` if the HTTP status code is not 200
        '''
        t_sleep()
        log('当前访问的URL', url)

        try:
            page = requests.get(url, headers=self.get_news_header(), timeout=3)
            page.encoding = 'utf-8'
        except Exception as exc:
            log_line('访问出错')
            print(exc)
            # Class-level flag set whenever the request fails.
            self.__class__.retry = 1
            return 'timeout'

        if page.status_code == 200:
            tree = etree.HTML(page.text)
            return self.parse_item(tree)

        return 'error'
Exemplo n.º 8
0
    def get_newsinfo(self, url):
        '''
        Fetch a single news detail page (decoded as GBK) and build a ``News``
        record from it.

        NOTE(review): the HTTP status code is not checked here, so a non-200
        page is parsed like any other -- confirm this is intended.

        :param url: address of the news detail page
        :return: a ``News`` object, or the string ``'timeout'`` if the request
            raised an exception
        '''
        t_sleep()

        try:
            page = requests.get(url, headers=self.get_news_header(), timeout=3)
            page.encoding = 'gbk'
        except Exception as exc:
            log_line('访问出错')
            print(exc)
            return 'timeout'

        tree = etree.HTML(page.text)
        log('当前访问的URL', url)

        title, date, content = self.parse_item(tree)
        return News(title=title, date=date, content=content, url=url)
Exemplo n.º 9
0
    def get_html(self, url):
        '''
        Download a listing page and pass each of its list entries to
        ``parser_item``.

        Request failures and non-200 statuses are not handled here (that
        handling was previously commented out), so any exception raised by
        ``requests.get`` propagates to the caller.

        :param url: address of the listing page
        :return: None
        '''
        t_sleep()

        page = requests.get(url, headers=self.get_news_header(), timeout=3)
        page.encoding = 'utf-8'

        tree = etree.HTML(page.text)

        # One <li> under ul.list per entry.
        for entry in tree.xpath('//ul[@class="list"]/li'):
            self.parser_item(entry)