Python CustonTools示例，joao_scrap.tools.cleanHTML.CustonTools Python示例

示例#1

0

显示文件

    def extract_html(self, response):
        """Tag a news item with matching keywords and store it when any match.

        Reads the item from ``response.meta["item"]``, replaces ``item['dia']``
        with the result of ``format_data_folhasp`` and updates
        ``self.limit_time`` from ``compare_dates``. Every keyword returned by
        ``get_key_word_list()`` is checked against the article body and
        ``item['titulo']``; matching keywords are joined with commas into
        ``item['tags']``. When at least one keyword matched, the body is
        cleaned into ``item['noticia']`` and the item is sent to
        ``self.databaseController`` with the ``'inserir'`` request.

        Args:
            response: Scrapy response whose ``meta["item"]`` carries the item.

        Yields:
            The item, whether or not it was stored.
        """
        tools = CustonTools()
        item = response.meta["item"]

        formatted_date = tools.format_data_folhasp(item['dia'])
        self.limit_time = tools.compare_dates(formatted_date)
        item['dia'] = formatted_date

        word_list = tools.get_key_word_list()

        # ``.get()`` returns None instead of raising when nothing matches, so
        # the alternative container has to be selected explicitly.
        html = response.xpath('//div[@class="c-news__body"]').get()
        if html is None:
            html = response.xpath('//div[@class="c-news__content"]').get()

        tags = []
        for word in word_list:
            in_body = tools.check_word_in_html(word)(html)
            in_title = tools.check_word_in_html(word)(item['titulo'])
            if in_body is not None or in_title is not None:
                tags.append(word)

        # Decide on the local list: ``item['tags']`` is only written below, so
        # reading it when nothing matched could fail on an unset/None value.
        if tags:
            item['tags'] = ','.join(str(tag) for tag in tags)
            try:
                html = tools.clean_html_class_folhasp(html)
                item['noticia'] = tools.cleanHTML(html)
                print(self.name_crawl + 'NOVA ' + item['titulo'])
                if not self.databaseController.make_request('inserir', item):
                    print('Erro ao salvar no banco de dados')
            except Exception as ex:
                # Keep crawling, but report why this article could not be
                # processed instead of dropping the error silently.
                print('erro na noticia: ' + item['link'])
                print(ex)
        else:
            print(self.name_crawl + 'Noticia não possui tags ' + item['link'])

        yield item

示例#2

0

显示文件

    def extract_html(self, response):
        """Fill date, description and tags of an item and store it.

        The item comes from ``response.meta["item"]``. The date is taken from
        the ``article__date`` element, the description from
        ``article__subtitle``; the body comes from the protected content
        container, falling back to ``<main>`` when that is empty. Keywords from
        ``get_key_word_list()`` found in the body are joined into
        ``item['tags']``. When ``item['tags']`` is not None the body is cleaned
        into ``item['noticia']`` and the item is sent to
        ``self.databaseController`` with the ``'inserir'`` request.

        Yields:
            The item, whether or not it was stored.
        """
        tools = CustonTools()
        matched = []
        item = response.meta["item"]

        # Date: keep the first space-separated token, then strip line breaks.
        raw_date = response.xpath('//div[@class="article__date"]/text()').get()
        first_token = raw_date.split(' ')[0]
        dia = first_token.replace(' ', '').replace('\n', ' ').replace('\r', '')

        # Description: subtitle text flattened onto a single line.
        subtitle = response.xpath(
            '//div[@class="article__subtitle"]/text()').get()
        item['descricao'] = subtitle.replace('\n', ' ').replace('\r', '')

        parsed_date = tools.format_dia(dia)
        self.limit_time = tools.compare_dates(parsed_date)
        item['dia'] = parsed_date

        keywords = tools.get_key_word_list()

        body_selector = response.xpath(
            '//div[@class="article__content-container protected-content"]')
        html = body_selector.get()
        if not html:
            print('pegou main')
            html = response.css('main').get()

        for keyword in keywords:
            if tools.check_word_in_html(keyword)(html) is not None:
                matched.append(keyword)
                item['tags'] = ','.join(map(str, matched))

        if item['tags'] is not None:
            try:
                html = tools.clean_html_class_oglobo(html)
                item['noticia'] = tools.cleanHTML(html)

                print(self.name_crawl + 'NOVA ' + item['titulo'])
                stored = self.databaseController.make_request('inserir', item)
                if not stored:
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')
            except Exception as ex:
                print('erro na noticia: ' + item['link'])
                print(ex)

        yield item

示例#3

0

显示文件

    def extract_html(self, response):
        """Tag an article with matching keywords and store it when tagged.

        The item comes from ``response.meta["item"]``. If it has no date, one
        is read from the page's state description. The date is normalized with
        ``format_data_estadao`` and ``self.limit_time`` is updated from
        ``compare_dates``. Keywords from ``get_key_word_list()`` found in the
        body or in ``item['titulo']`` are joined into ``item['tags']``. When
        ``item['tags']`` is not None the cleaned body goes to
        ``item['noticia']`` and the item is sent to
        ``self.databaseController`` with the ``'inserir'`` request.

        Yields:
            The item, whether or not it was stored.
        """
        tools = CustonTools()
        matched = []
        item = response.meta["item"]

        if item['dia'] is None:
            # Take the text before the first '|' in the state description and
            # drop its first character.
            state_desc = response.xpath(
                '//div[@class="n--noticia__state-desc"]/p/text()').get()
            before_pipe = state_desc.split('|')[0]
            item['dia'] = before_pipe[1:]

        parsed_date = tools.format_data_estadao(item['dia'])
        self.limit_time = tools.compare_dates(parsed_date)
        item['dia'] = parsed_date

        keywords = tools.get_key_word_list()

        body_selector = response.xpath(
            '//div[@class="n--noticia__content content"]')
        html = body_selector.get()

        for keyword in keywords:
            body_hit = tools.check_word_in_html(keyword)(html)
            title_hit = tools.check_word_in_html(keyword)(item['titulo'])
            if body_hit is not None or title_hit is not None:
                matched.append(keyword)
                item['tags'] = ','.join(map(str, matched))

        if item['tags'] is None:
            print('Noticia não possui tags ' + item['link'])
        else:
            try:
                item['noticia'] = tools.cleanHTML(html)
                print(self.name_crawl + 'NOVA ' + item['titulo'])
                stored = self.databaseController.make_request('inserir', item)
                if not stored:
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')
            except Exception as err:
                print('erro na noticia: ' + item['link'])
                print(err)
        yield item

示例#4

0

显示文件

    def extract_html(self, response):
        """Tag an article with matching keywords and store it when tagged.

        The item comes from ``response.meta["item"]``. Its date is normalized
        with ``format_dia`` and ``self.limit_time`` is updated from
        ``compare_dates``. Keywords from ``get_key_word_list()`` found in the
        article body or in ``item['titulo']`` are joined into ``item['tags']``.
        When ``item['tags']`` is not None the cleaned body goes to
        ``item['noticia']`` and the item is sent to
        ``self.databaseController`` with the ``'inserir'`` request.

        Args:
            response: Scrapy response whose ``meta["item"]`` carries the item.

        Returns:
            The item, whether or not it was stored.
        """
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        formatted_data = tools.format_dia(item['dia'])
        self.limit_time = tools.compare_dates(formatted_data)
        item['dia'] = formatted_data

        word_list = tools.get_key_word_list()

        # ``.get()`` signals a missing node by returning None, not by raising,
        # so the failure is detected with a None check. This also guarantees
        # ``html`` is always bound before the loop below uses it.
        html = response.xpath(
            '//div[@class="txt-serif js-article-box article-box article-box-capitalize mt-15"]'
        ).get()
        if html is None:
            print('falhou ao obter html' + item['link'])

        for word in word_list:
            # A missing body cannot contain a keyword; skip the body check
            # rather than handing None to the helper.
            in_body = None
            if html is not None:
                in_body = tools.check_word_in_html(word)(html)
            in_title = tools.check_word_in_html(word)(item['titulo'])

            if in_body is not None or in_title is not None:
                tags.append(word)
                item['tags'] = ','.join(str(tag) for tag in tags)

        if item['tags'] is not None:
            try:
                item['noticia'] = tools.cleanHTML(html)
                if not self.databaseController.make_request('inserir', item):
                    print('Erro ao salvar no banco de dados')
            except Exception as ex:
                print('erro na noticia: ' + item['link'])
                print(ex)
        return item

示例#5

0

显示文件

文件： oglobo.py 项目： itsjhonny/webcrawller_covid19_news

    def extract_html(self, response):
        """Tag an article with the keywords found in its body and clean it.

        The item comes from ``response.meta["item"]``. Its date is normalized
        with ``format_data_oglobo``. The body is read from the protected
        content container, falling back to ``<main>`` when that is empty.
        Keywords from ``get_key_word_list()`` found in the body are joined into
        ``item['tags']``. When ``item['tags']`` is not None the cleaned body is
        stored in ``item['noticia']``. This method does not write to the
        database itself.

        Args:
            response: Scrapy response whose ``meta["item"]`` carries the item.

        Yields:
            The item, with ``noticia`` filled when cleaning succeeded.
        """
        tools = CustonTools()
        tags = []
        item = response.meta["item"]

        item['dia'] = tools.format_data_oglobo(item['dia'])

        word_list = tools.get_key_word_list()

        html = response.xpath(
            '//div[@class="article__content-container protected-content"]'
        ).get()
        if not html:
            print('pegou main')
            html = response.css('main').get()

        for word in word_list:
            # Identity test: ``== None`` can be overridden by ``__eq__``.
            if tools.check_word_in_html(word)(html) is not None:
                tags.append(word)
                item['tags'] = ','.join(str(tag) for tag in tags)

        if item['tags'] is not None:
            try:
                html = tools.clean_html_class_oglobo(html)
                item['noticia'] = tools.cleanHTML(html)
            except Exception as ex:
                # ``except Exception`` lets KeyboardInterrupt/SystemExit
                # propagate; the cause is printed so failures can be diagnosed.
                print('erro na noticia: ' + item['link'])
                print(ex)
        else:
            print('Noticia não possui tags ' + item['link'])

        yield item

示例#6

0

显示文件

文件： gauchazh.py 项目： itsjhonny/webcrawller_covid19_news

    def extract_html(self, item):
        """Load the article in the browser, tag it and store it when tagged.

        Opens ``item['link']`` with ``self.driver2``, normalizes ``item['dia']``
        with ``format_dia`` and reads the inner HTML of the article content
        element. Keywords from ``get_key_word_list()`` found in that HTML are
        joined into ``item['tags']``. When ``item['tags']`` is not None the
        cleaned body goes to ``item['noticia']`` and the item is passed to
        ``self.save_to_database_novas``.

        Args:
            item: Mapping with at least ``link``, ``dia`` and ``tags`` keys.

        Returns:
            The item, or None when the article body could not be read.
        """
        self.driver2.get(item['link'])
        tools = CustonTools()
        item['dia'] = tools.format_dia(item['dia'])

        print()
        print()
        print(item['link'])
        # Fixed pause before reading the page; presumably gives the page time
        # to render -- TODO confirm whether an explicit wait is preferable.
        time.sleep(2)
        try:
            html = self.driver2.find_element_by_xpath(
                "//div[@class='article-content sa_incontent']"
            ).get_attribute('innerHTML')
        except Exception as ex:
            # Only the lookup is guarded, and only for ordinary errors, so an
            # interrupt during the pause above is no longer swallowed.
            print('falhou')
            print(ex)
            return

        # Keyword list and tag accumulator are created once, after the body is
        # known to be available.
        word_list = tools.get_key_word_list()
        tags = []
        for word in word_list:
            if tools.check_word_in_html(word)(html) is not None:
                tags.append(word)
                item['tags'] = ','.join(str(tag) for tag in tags)

        if item['tags'] is not None:
            try:
                item['noticia'] = tools.cleanHTML(html)
                self.save_to_database_novas(item)
            except Exception as ex:
                # Keep crawling, but report the failure instead of hiding it.
                print('erro na noticia: ' + item['link'])
                print(ex)
        else:
            print('Noticia não possui tags ' + item['link'])
        return item

示例#7

0

显示文件

    def parse_page(self, response):
        """Tag a page with matching keywords and send it to the database API.

        The item comes from ``response.meta["item"]``. Nothing is done when the
        ``'check_exist_database'`` request reports the link as already stored,
        or when ``compare_dates`` sets ``self.limit_time`` to a truthy value.
        Otherwise keywords from ``get_key_word_list()`` found in the
        ``div.item-page`` HTML are joined into ``item['tags']``, the cleaned
        HTML goes to ``item['noticia']``, and the item is sent with the
        ``'inserir'`` request.

        Args:
            response: Scrapy response whose ``meta["item"]`` carries the item.

        Returns:
            None.
        """
        databaseController = ApiRequest()
        item = response.meta["item"]

        data_request = {'link': item['link']}
        if databaseController.make_request('check_exist_database', data_request):
            return

        tools = CustonTools()
        word_list = tools.get_key_word_list()
        formatted_date = tools.format_dia(item['dia'])
        item['dia'] = formatted_date

        self.limit_time = tools.compare_dates(formatted_date)
        if self.limit_time:
            print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes')
            return

        html = response.css('div.item-page').get()
        tags = [
            word for word in word_list
            if tools.check_word_in_html(word)(html) is not None
        ]
        if tags:
            # The cleaned body does not depend on the keyword, so compute it
            # once instead of once per matching keyword.
            item['noticia'] = tools.cleanHTML(html)
            item['tags'] = ','.join(str(tag) for tag in tags)

        # NOTE(review): the item is inserted even when no keyword matched, as
        # in the original flow -- confirm that this is intended.
        print(self.name_crawl + 'NOVA ' + item['titulo'])
        if not databaseController.make_request('inserir', item):
            print(self.name_crawl + 'Erro ao salvar no banco de dados')

示例#8

0

显示文件

文件： g1_spider.py 项目： itsjhonny/webcrawller_covid19_news

    def extract_html(self, item):
        """Load the article in the browser, tag it and store it when tagged.

        Opens ``item['link']`` with ``self.driver2`` and reads the text of the
        first ``<time>`` element to fill ``item['dia']`` and
        ``self.limit_time``; processing stops when ``self.limit_time`` is
        truthy. If the date cannot be read, ``item['dia']`` is set to
        ``'error_time'`` and processing continues. Keywords from
        ``get_key_word_list()`` found in the ``<article>`` inner HTML are
        joined into ``item['tags']``. When ``item['tags']`` is not None the
        cleaned body goes to ``item['noticia']`` and the item is sent to
        ``self.databaseController`` with the ``'inserir'`` request.

        Args:
            item: Mapping with at least ``link``, ``titulo`` and ``tags`` keys.

        Returns:
            The item, or None when the date limit was reached or the article
            body could not be read.
        """
        tools = CustonTools()
        self.driver2.get(item['link'])

        try:
            # Named ``published`` so the local does not shadow the stdlib
            # ``time`` module name.
            published = self.driver2.find_element_by_tag_name('time').text
            item['dia'] = tools.format_dia(published.split(' ')[0])
            self.limit_time = tools.compare_dates(item['dia'])

            if self.limit_time:
                print(self.name_crawl + 'Noticia passou da data limite: ' +
                      item['link'])
                return
        except Exception as ex:
            item['dia'] = 'error_time'
            print(ex)

        word_list = tools.get_key_word_list()
        tags = []

        try:
            html = self.driver2.find_element_by_tag_name(
                'article').get_attribute('innerHTML')
        except Exception as ex:
            print(ex)
            return

        for word in word_list:
            if tools.check_word_in_html(word)(html) is not None:
                tags.append(word)
                item['tags'] = ','.join(str(tag) for tag in tags)

        if item['tags'] is not None:
            try:
                item['noticia'] = tools.cleanHTML(html)
                print(self.name_crawl + 'NOVA ' + item['titulo'])

                if not self.databaseController.make_request('inserir', item):
                    print(self.name_crawl + 'Erro ao salvar no banco de dados')
            except Exception as ex:
                # ``except Exception`` lets KeyboardInterrupt/SystemExit
                # propagate; the cause is printed so failures can be diagnosed.
                print(self.name_crawl + 'erro na noticia: ' + item['link'])
                print(ex)
        else:
            print(self.name_crawl + 'Noticia não possui tags ' + item['link'])

        return item