def start(self):
    for index, news in enumerate(self.news_list):
        try:
            if news['status'] == 'pending':
                news_content = self.download_news(news)

                if news_content:
                    self.news_list[index]['status'] = 'completed'
                    self.news.append(news_content)
                    log.success('[ {nid} ] Data saved successfully!'.format(nid=news['id']))
                    print()
                    print()
                else:
                    error_message = 'Could not parse the data.'
                    log.error(error_message)
                    self.errors.append(error_message)
                    self.news_list[index].setdefault('errors', []).append(error_message)
            else:
                log.warning('Data already acquired [ {nid} ]'.format(nid=news['id']))
        except Exception as error:
            log.error('Error downloading news item [ {nid} ]'.format(nid=news['id']))
            log.error(error)
        finally:
            # Persist progress after every item so an interrupted run can resume.
            helper.create_file(filename=self.dump_file, content=self.news, format='json', mode='w')
            helper.create_file(filename=self.news_json_file, content=self.news_list, format='json', mode='w')
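# start() relies on a helper module that is not shown here. A minimal sketch
# of the file interface it appears to assume, reconstructed from the call
# sites (this implementation is hypothetical; 'a' is assumed as the default
# mode because get_content() later calls create_file() without a mode to
# append log lines):
import json
import os

def read_file(filename, format=None):
    # Read a file, decoding JSON when format='json'.
    with open(filename, encoding='utf-8') as handle:
        return json.load(handle) if format == 'json' else handle.read()

def create_file(filename, content, format=None, mode='a'):
    # Write (or append) content, serializing to JSON when format='json'.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, mode, encoding='utf-8') as handle:
        handle.write(json.dumps(content, ensure_ascii=False) if format == 'json' else content)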
import shutil

import requests

def download(file_type, filename, nid, url):
    # 'type' shadowed the builtin, so the parameter is named file_type here
    # (the call site in Images.__init__ is updated to match).
    if file_type == 'image':
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()  # treat HTTP errors as failures

            with open(filename, 'wb') as image:
                shutil.copyfileobj(response.raw, image)

            log.success('Image downloaded successfully [{url}]'.format(url=url))
            return True
        except Exception as error:
            log.error(error)
            return False
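# Hypothetical usage sketch (the URL and target path are made up for
# illustration; in this project download() is reached via helper.download):
# if download(file_type='image', filename='data/medias/example.jpg',
#             nid=1, url='https://static.weg.net/medias/example.jpg'):
#     print('image stored at data/medias/example.jpg')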
def set_image(news, index, link):
    images_file = 'data/images.json'
    images = helper.read_file(images_file, format='json') if os.path.isfile(images_file) else []

    try:
        # Compute the rewritten path once instead of calling set_image_link twice.
        new_path = set_image_link(news, index, link)
        images.append({
            'catalog': news['catalog'],
            'notice': news['id'],
            'downloaded': False,
            'original_path': link,
            'new_path': new_path,
        })
        helper.create_file(images_file, images, mode='w', format='json')
        log.success('Image added to the download list [ {image_link} ]'.format(image_link=new_path))
    except Exception as error:
        log.error(error)
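# For reference, each record appended to data/images.json looks like this
# (values are illustrative; new_path comes from set_image_link, which is
# defined elsewhere in the project):
# {
#     "catalog": "motors",
#     "notice": "0042",
#     "downloaded": false,
#     "original_path": "/medias/images/example.jpg",
#     "new_path": "https://static.weg.net/medias/images/example.jpg"
# }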
def __init__(self):
    super(Images, self).__init__()
    self.images_file = 'data/images.json'
    self.images_folder = 'data/news/'
    self.dump_file = 'data/news/dump.json'

    if not os.path.isfile(self.images_file):
        log.error('[!] Image dump does not exist')
        return

    images = helper.read_file(self.images_file, format='json')

    for index, image in enumerate(images):
        try:
            if image['downloaded']:
                log.warning('Image already downloaded [ {url} ]'.format(url=image['new_path']))
                continue

            # Map the static URL back to a local path under data/.
            path = 'data/{image_path}'.format(image_path=image['new_path'].replace('https://static.weg.net/', ''))
            folder = os.path.dirname(path)
            base_url = 'http://www.weg.net'
            download_url = image['original_path']

            os.makedirs(folder, exist_ok=True)

            # Relative paths are resolved against the site root; lstrip avoids
            # a double slash when original_path starts with '/'.
            if not download_url.startswith('http'):
                download_url = '{base_url}/{path}'.format(base_url=base_url, path=download_url.lstrip('/'))

            if helper.download(file_type='image', filename=path, nid=index, url=download_url):
                images[index]['downloaded'] = True
                log.success('Image downloaded successfully [ {path} ]'.format(path=path))
        except Exception as error:
            log.error(error)
        finally:
            # Persist download state after every image.
            helper.create_file(self.images_file, images, mode='w', format='json')
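# Worked example of the path rewriting above (all values are illustrative):
#   image['new_path']      = 'https://static.weg.net/medias/images/example.jpg'
#   path                   = 'data/medias/images/example.jpg'
#   image['original_path'] = '/medias/images/example.jpg'
#   download_url           = 'http://www.weg.net/medias/images/example.jpg'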
def __init__(self):
    super(Data, self).__init__()
    self.news_list_file = 'data/notices.list'
    self.news_json_file = 'data/notices.json'
    self.dump_file = 'data/dump.json'
    self.process = os.getpid()
    self.errors = []
    self.news_id_length = 4

    init_message = 'Starting process: {process}'.format(process=self.process)
    log.success('=' * len(init_message))
    log.success(init_message)
    log.success('=' * len(init_message))
    print()
def download_news(self, news):
    init_crawling = '= Starting crawl, target: [ {nid} ] {link}'.format(nid=news['id'], link=os.path.basename(news['link']))

    print()
    log.success('=' * len(init_crawling))
    log.success(init_crawling)
    log.success('=' * len(init_crawling))
    print()

    request = requests.get(news['link'])

    # Only parse the document when the page was actually served.
    if request.status_code == 200:
        document = BeautifulSoup(request.text, 'html.parser')
        return parser.parse_news(news, document)

    error_message = 'Error accessing the page: Status {status_code}'.format(status_code=request.status_code)
    self.errors.append(error_message)
    log.error(error_message)
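# parser.parse_news() is defined elsewhere; given how start() consumes its
# return value (appended to self.news and dumped as JSON), it is assumed to
# return a JSON-serializable dict for one news item, roughly:
# {'id': '0042', 'title': '...', 'content': '<div>...</div>'}
# This shape is an assumption, not confirmed by the source.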
if __name__ == '__main__':
    scrapper = Scrapper()

    try:
        scrapper.start()
    except Exception as error:
        print()
        error_message = 'Error starting process: {process}'.format(process=scrapper.process)
        log.error('=' * len(error_message))
        log.error(error_message)
        log.error(error)
        log.error('=' * len(error_message))
        print()
    finally:
        finished_with_errors = 'Finished with {errors} error{suffix}'.format(errors=len(scrapper.errors), suffix='s' if len(scrapper.errors) > 1 else '')
        finished_without_errors = 'Finished without errors'

        if scrapper.errors:
            print()
            log.warning('=' * len(finished_with_errors))
            log.warning(finished_with_errors)
        else:
            print()
            log.success('=' * len(finished_without_errors))
            log.success(finished_without_errors)
import os

from bs4 import BeautifulSoup

# helper, log, set_image and set_image_link are project-level modules and
# functions defined elsewhere.

def get_content(news, content):
    if not content[0]:
        return ''

    allowed_images_extension = ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tif']
    document = BeautifulSoup(content[0].encode('utf-8'), 'html.parser')
    link = news['link']
    nid = news['id']

    # Strip legacy CSS classes that should not survive the migration.
    for item in ['comparison', 'bgdark', 'bglight', 'default', 'clr', 'novaJanela']:
        for element in document.select('.{selector}'.format(selector=item)):
            element['class'].remove(item)

    for center in document.select('.center'):
        center['class'] = 'text-center'

    # Drop paragraphs that are empty or hold only non-breaking spaces.
    for paragraph in document.select('p'):
        if any(item == '\xa0' or not item for item in paragraph.contents):
            paragraph.decompose()

    tables = document.select('table')

    if tables:
        tablefilename = 'logs/weg/tables.list'
        link = link if isinstance(link, str) else link.attrs['href']
        table_log = '[ {nid} ]: {link}\n'.format(link=link, nid=nid)

        for table in tables:
            # Wrap each table for Bootstrap and drop presentational attributes.
            responsive = document.new_tag('div')
            responsive['class'] = 'table-responsive'
            table.wrap(responsive)
            table['class'] = table.get('class', []) + ['table', 'table-bordered', 'table-hover']

            for attribute in ['cellpadding', 'border', 'cellspacing', 'width', 'height']:
                del table[attribute]

            # Log each page containing tables once, for later manual review.
            if os.path.isfile(tablefilename):
                logged = helper.read_file(tablefilename)

                if link not in logged:
                    helper.create_file(tablefilename, table_log)
                else:
                    log.warning('Table already added to the list [ {url} ]'.format(url=link))
            else:
                helper.create_file(tablefilename, table_log)
                log.success('Table log created.')

    for index, anchor in enumerate(document.select('a')):
        if 'href' in anchor.attrs:
            filename, file_extension = os.path.splitext(anchor.attrs['href'])

            if anchor.attrs['href'] == 'javascript:void();':
                anchor.attrs['href'] = '#{nid}'.format(nid=news['id'])
                anchor.attrs['data-prevent-default'] = 'true'

            if file_extension in allowed_images_extension:
                set_image(news, index, anchor.attrs['href'])
                anchor.attrs['href'] = set_image_link(news, index, anchor.attrs['href'])

    for index, image in enumerate(document.select('img')):
        filename, file_extension = os.path.splitext(image.attrs['src'])

        if file_extension in allowed_images_extension:
            set_image(news, index, image.attrs['src'])
            image.attrs['src'] = set_image_link(news, index, image.attrs['src'])

        # Disabled: mark images outside .coluna6 containers as img-responsive.
        # responsive = True
        # for parent in image.parents:
        #     if 'coluna6' in parent.attrs.get('class', []):
        #         responsive = False
        # if responsive:
        #     image.attrs['class'] = image.attrs.get('class', []) + ['img-responsive']

    for column in document.select('.coluna6'):
        column['class'] = 'xtt-gallery pull-right'

    # Restyle lists and wrap each item's text in a span.
    for ul in document.select('ul'):
        ul['class'] = 'xtt-list-style'

        for li in ul.select('> li'):
            span = document.new_tag('span')
            span.string = li.contents[0]
            li.string = ''
            li.append(span)

    return str(document).strip()
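# Minimal sketch of get_content() on a toy fragment (the news dict and HTML
# are made up; set_image_link is assumed to map original URLs to their new
# static paths):
# news = {'id': '0001', 'catalog': 'motors', 'link': 'http://www.weg.net/news/example'}
# html = '<p class="center bgdark">Hello</p>'
# get_content(news, [html])
# # -> '<p class="text-center">Hello</p>'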