def extract_html(self, response):
    tools = CustonTools()
    tags = []
    item = response.meta["item"]
    dia = response.xpath('//div[@class="article__date"]/text()').get().split(
        ' ')[0].replace(' ', '').replace('\n', ' ').replace('\r', '')
    item['descricao'] = response.xpath(
        '//div[@class="article__subtitle"]/text()').get().replace(
        '\n', ' ').replace('\r', '')
    formattedData = tools.format_dia(dia)
    self.limit_time = tools.compare_dates(formattedData)
    item['dia'] = formattedData
    wordList = tools.get_key_word_list()
    html = response.xpath(
        '//div[@class="article__content-container protected-content"]'
    ).get()
    if not html:
        # fall back to the <main> element when the article container is absent
        print('pegou main')
        html = response.css('main').get()
    for word in wordList:
        isWordInHtml = tools.check_word_in_html(word)(html)
        if isWordInHtml is not None:
            # keyword found in the article body
            tags.append(word)
    item['tags'] = ','.join(str(tag) for tag in tags)
    if tags:
        try:
            # strip the O Globo wrapper classes before cleaning the markup
            html = tools.clean_html_class_oglobo(html)
            item['noticia'] = tools.cleanHTML(html)
            print(self.name_crawl + 'NOVA ' + item['titulo'])
            if not self.databaseController.make_request('inserir', item):
                print(self.name_crawl + 'Erro ao salvar no banco de dados')
        except Exception as ex:
            print('erro na noticia: ' + item['link'])
            print(ex)
    # articles without matching keywords are still yielded, but not stored
    yield item
def extract_html(self, response):
    tools = CustonTools()
    tags = []
    item = response.meta["item"]
    formatted_date = tools.format_dia(item['dia'])
    item['dia'] = formatted_date
    self.limit_time = tools.compare_dates(formatted_date)
    wordList = tools.get_key_word_list()
    item['descricao'] = response.xpath(
        '//div[@class="noticias-single__description visible-lg"]/text()'
    ).get()
    html = response.xpath('//div[@class="noticias-single__content"]').get()
    for word in wordList:
        isWordInHtml = tools.check_word_in_html(word)(html)
        isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])
        if isWordInHtml is not None or isWordInTitulo is not None:
            # keyword found in the body or in the title
            tags.append(word)
    item['tags'] = ','.join(str(tag) for tag in tags)
    if tags:
        try:
            item['noticia'] = tools.cleanHTML(html)
            print(self.name_crawl + 'NOVA ' + item['titulo'])
            if not self.databaseController.make_request('inserir', item):
                print(self.name_crawl + 'Erro ao salvar no banco de dados')
        except Exception as err:
            # log and skip articles whose body could not be cleaned or stored
            print('erro na noticia: ' + item['link'])
            print(err)
    yield item
def extract_html(self, response):
    tools = CustonTools()
    tags = []
    item = response.meta["item"]
    formatted_data = tools.format_dia(item['dia'])
    self.limit_time = tools.compare_dates(formatted_data)
    item['dia'] = formatted_data
    word_list = tools.get_key_word_list()
    try:
        html = response.xpath(
            '//div[@class="txt-serif js-article-box article-box article-box-capitalize mt-15"]'
        ).get()
    except Exception:
        # without the article body there is nothing to tag or store
        print('falhou ao obter html ' + item['link'])
        return item
    for word in word_list:
        isWordInHtml = tools.check_word_in_html(word)(html)
        isWordInTitulo = tools.check_word_in_html(word)(item['titulo'])
        if isWordInHtml is not None or isWordInTitulo is not None:
            tags.append(word)
    item['tags'] = ','.join(str(tag) for tag in tags)
    if tags:
        try:
            item['noticia'] = tools.cleanHTML(html)
            if not self.databaseController.make_request('inserir', item):
                print('Erro ao salvar no banco de dados')
        except Exception as ex:
            print('erro na noticia: ' + item['link'])
            print(ex)
    return item
def extract_html(self, item):
    self.driver2.get(item['link'])
    tools = CustonTools()
    wordList = tools.get_key_word_list()
    tags = []
    formatedData = tools.format_dia(item['dia'])
    item['dia'] = formatedData
    try:
        print(item['link'])
        # give the page time to render before grabbing the article body
        time.sleep(2)
        html = self.driver2.find_element_by_xpath(
            "//div[@class='article-content sa_incontent']"
        ).get_attribute('innerHTML')
    except Exception:
        print('falhou')
        return
    for word in wordList:
        isWordInHtml = tools.check_word_in_html(word)(html)
        if isWordInHtml is not None:
            tags.append(word)
    item['tags'] = ','.join(str(tag) for tag in tags)
    if tags:
        try:
            item['noticia'] = tools.cleanHTML(html)
            self.save_to_database_novas(item)
        except Exception:
            # skip articles whose body could not be cleaned or stored
            pass
    else:
        print('Noticia não possui tags ' + item['link'])
    return item
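# The variant above pauses with a fixed time.sleep(2) before reading the article body.
# A minimal sketch of an explicit-wait alternative, assuming the same driver and the same
# "article-content sa_incontent" container; the helper name and the 10-second timeout are
# illustrative choices, not taken from the original code.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_article_html(driver, timeout=10):
    # Block until the article container is present in the DOM, then return its inner HTML.
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@class='article-content sa_incontent']")
        )
    )
    return element.get_attribute('innerHTML')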
def parse_page(self, response):
    databaseController = ApiRequest()
    item = response.meta["item"]
    data_request = {'link': item['link']}
    is_inDatabase = databaseController.make_request('check_exist_database', data_request)
    if is_inDatabase:
        # article is already stored; skip it
        return
    tools = CustonTools()
    wordList = tools.get_key_word_list()
    tags = []
    formattedData = tools.format_dia(item['dia'])
    item['dia'] = formattedData
    self.limit_time = tools.compare_dates(formattedData)
    if self.limit_time:
        print(self.name_crawl + 'Finalizou busca dentro do periodo de 1 mes')
        return
    html = response.css('div.item-page').get()
    for word in wordList:
        isWordInHtml = tools.check_word_in_html(word)(html)
        if isWordInHtml is not None:
            tags.append(word)
    item['noticia'] = tools.cleanHTML(html)
    item['tags'] = ','.join(str(tag) for tag in tags)
    print(self.name_crawl + 'NOVA ' + item['titulo'])
    if not databaseController.make_request('inserir', item):
        print(self.name_crawl + 'Erro ao salvar no banco de dados')
def extract_html(self, item):
    tools = CustonTools()
    self.driver2.get(item['link'])
    try:
        time = self.driver2.find_element_by_tag_name('time').text
        item['dia'] = tools.format_dia(time.split(' ')[0])
        self.limit_time = tools.compare_dates(item['dia'])
        if self.limit_time:
            print(self.name_crawl + 'Noticia passou da data limite: ' + item['link'])
            return
    except Exception as ex:
        # no parseable <time> element on the page; keep a sentinel value
        item['dia'] = 'error_time'
        print(ex)
    wordList = tools.get_key_word_list()
    tags = []
    try:
        html = self.driver2.find_element_by_tag_name(
            'article').get_attribute('innerHTML')
    except Exception as ex:
        print(ex)
        return
    for word in wordList:
        isWordInHtml = tools.check_word_in_html(word)(html)
        if isWordInHtml is not None:
            tags.append(word)
    item['tags'] = ','.join(str(tag) for tag in tags)
    if tags:
        try:
            item['noticia'] = tools.cleanHTML(html)
            print(self.name_crawl + 'NOVA ' + item['titulo'])
            if not self.databaseController.make_request('inserir', item):
                print(self.name_crawl + 'Erro ao salvar no banco de dados')
        except Exception:
            print(self.name_crawl + 'erro na noticia: ' + item['link'])
    else:
        print(self.name_crawl + 'Noticia não possui tags ' + item['link'])
    return item
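# Every extract_html/parse_page variant above repeats the same keyword-tagging and
# persistence steps. A minimal sketch of a shared helper, assuming CustonTools keeps its
# current interface (check_word_in_html(word) returns a matcher that yields None on a miss,
# cleanHTML sanitises markup) and that make_request('inserir', item) is truthy on success;
# the helper name tag_and_store is hypothetical.
def tag_and_store(tools, database_controller, item, html, word_list):
    # Collect every keyword that appears in the article body or in the title.
    tags = [
        word for word in word_list
        if tools.check_word_in_html(word)(html) is not None
        or tools.check_word_in_html(word)(item.get('titulo', '')) is not None
    ]
    item['tags'] = ','.join(tags)
    if not tags:
        # nothing matched; let the caller decide whether to yield the untagged item
        return False
    item['noticia'] = tools.cleanHTML(html)
    # Persist the article; callers handle logging when the insert fails.
    return bool(database_controller.make_request('inserir', item))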