示例#1
0
    def start( self ):
        """Process every entry of ``self.news_list`` and persist progress after each one.

        For each entry whose ``'status'`` is ``'pending'`` the entry is handed to
        ``self.download_news``. A truthy result is appended to ``self.news`` and the
        entry is marked ``'completed'``; a falsy result is recorded as a parse error
        both in ``self.errors`` and in the entry's own ``'errors'`` list. Entries with
        any other status are only reported as already acquired.

        Any exception raised while handling an entry is logged and recorded in
        ``self.errors`` (so the caller's error count reflects it); processing then
        moves on to the next entry.

        After every entry, successful or not, ``self.news`` is written to
        ``self.dump_file`` and ``self.news_list`` to ``self.news_json_file`` through
        ``helper.create_file``.
        """
        for index, news in enumerate( self.news_list ):
            try:
                if news['status'] != 'pending':
                    log.warning('Dados já adquiridos [ {nid} ]'.format(nid=news['id']))
                    continue

                news_content = self.download_news( news )

                if news_content:
                    self.news_list[ index ]['status'] = 'completed'
                    self.news.append( news_content )

                    log.success('[ {nid} ] Dados salvos com sucesso!'.format(nid=news['id']))

                    print()
                    print()
                else:
                    error_message = 'Não foi possível fazer o parse dos dados.'
                    log.error( error_message )
                    self.errors.append( error_message )
                    # setdefault: an entry without an 'errors' list must not turn a
                    # parse failure into a KeyError reported as a download failure.
                    self.news_list[ index ].setdefault( 'errors', [] ).append( error_message )
            except Exception as error:
                error_message = 'Erro ao baixar a notícia [ {nid} ]'.format(nid=news['id'])
                log.error( error_message )
                log.error(error)
                # Record the failure so it is counted, not just logged.
                self.errors.append( error_message )
            finally:
                # Runs for every entry (including the `continue` path above) so the
                # files on disk always mirror the in-memory state.
                helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w')
                helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w')
示例#2
0
    def __init__( self ):
        """Set up the file locations and download every image not yet marked as downloaded.

        The image list is read from ``self.images_file`` with ``helper.read_file``.
        For each entry whose ``'downloaded'`` flag is falsy:

        * the local target path is ``'data/'`` plus ``image['new_path']`` with the
          ``https://static.weg.net/`` prefix removed;
        * the target directory is created if needed;
        * ``image['original_path']`` is used as the download URL, prefixed with
          ``http://www.weg.net/`` when it does not start with ``http``;
        * when ``helper.download`` returns a truthy value the entry is flagged as
          downloaded.

        Exceptions raised for a single image are logged and do not stop the loop.
        The (possibly updated) list is written back to ``self.images_file`` after
        every entry. If the list file does not exist, an error is logged and nothing
        else happens.
        """
        super( Images, self ).__init__()

        self.images_file = 'data/images.json'
        self.images_folder = 'data/news/'
        self.dump_file     = 'data/news/dump.json'

        if not os.path.isfile( self.images_file ):
            log.error('[!] Dump de imagens não existe')
            return

        base_url = 'http://www.weg.net'
        images = helper.read_file( self.images_file, format='json' )

        for index, image in enumerate( images ):
            try:
                if image['downloaded']:
                    log.warning('Imagem já baixada [ {url} ]'.format(url=image['new_path']))
                    continue

                path = 'data/{image_path}'.format(image_path=image['new_path'].replace('https://static.weg.net/', ''))
                download_url = image['original_path']

                # exist_ok=True already covers the "directory is there" case,
                # so no separate isdir() check is needed.
                os.makedirs( os.path.dirname( path ), exist_ok=True )

                if not download_url.startswith('http'):
                    download_url = '{base_url}/{path}'.format(base_url=base_url, path=download_url)

                if helper.download(type='image', filename=path, nid=index, url=download_url):
                    images[ index ]['downloaded'] = True
                    log.success('Imagem baixada com sucesso [ {path} ]'.format(path=path))
            except Exception as error:
                log.error( error )
            finally:
                # Persist after every entry so an interrupted run keeps its progress.
                helper.create_file(self.images_file, images, mode='w', format='json')
示例#3
0
                pass
            finally:
                helper.create_file( filename=self.dump_file, content=self.news, format='json', mode='w')
                helper.create_file( filename=self.news_json_file, content=self.news_list, format='json', mode='w')


if __name__ == '__main__':
    # Script entry point: run the scrapper, report a start-up failure if one
    # occurs, and always finish with a summary of the collected errors.
    scrapper = Scrapper()

    try:
        scrapper.start()
    except Exception as error:
        print()
        headline = 'Erro ao iniciar processo: {proccess}'.format(proccess=scrapper.proccess)
        divider = '=' * len( headline )

        # Framed block: divider, headline, the exception itself, divider.
        for line in ( divider, headline, error, divider ):
            log.error( line )

        print()
    finally:
        total_errors = len( scrapper.errors )
        plural = 's' if total_errors > 1 else ''

        if scrapper.errors:
            summary = 'Finalizado com {errors} erro{suffix}'.format(errors=total_errors, suffix=plural)
            report = log.warning
        else:
            summary = 'Finalizado sem erros'
            report = log.success

        print()
        report( '=' * len( summary ) )
        report( summary )
示例#4
0
def get_content( news, content ):
    """Clean up the HTML body of a news item and return it as a string.

    ``content[0]`` is parsed with BeautifulSoup's ``html.parser`` and the tree is
    rewritten in place, in this order:

    1. A fixed set of class names is stripped from every element carrying them.
    2. Elements with class ``center`` get their class replaced by ``text-center``.
    3. Paragraphs that contain a non-breaking-space or empty text node are removed.
    4. Every table is wrapped in a ``div.table-responsive``, gains the classes
       ``table table-bordered table-hover`` and loses its layout attributes. When
       at least one table exists, the news link is recorded in
       ``logs/weg/tables.list`` through ``helper`` unless it is already listed.
    5. Anchors pointing at ``javascript:void();`` are redirected to ``#<news id>``;
       anchors whose href ends in an allowed image extension are passed to
       ``set_image`` and re-pointed with ``set_image_link``.
    6. ``<img>`` sources with an allowed image extension get the same treatment.
    7. ``.coluna6`` elements get the class ``xtt-gallery pull-right``.
    8. Every ``<ul>`` gets the class ``xtt-list-style`` and the content of each of
       its direct ``<li>`` children is wrapped in a ``<span>``.

    Args:
        news: Mapping describing the news item. ``news['link']`` (a string or an
            object exposing ``.attrs['href']``) and ``news['id']`` are read here;
            the mapping itself is forwarded to ``set_image`` / ``set_image_link``.
        content: Sequence whose first element is the raw HTML string.

    Returns:
        The serialized document with surrounding whitespace stripped, or ``''``
        when ``content`` is empty or its first element is falsy.
    """
    # Guard against an empty/None sequence as well as an empty first element.
    if not content or not content[0]:
        return ''

    allowed_images_extension = ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tif']
    document = BeautifulSoup( content[0].encode('utf-8'), 'html.parser' )
    classes_to_remove = ['comparison', 'bgdark', 'bglight', 'default', 'clr', 'novaJanela']
    link = news['link']
    nid = news['id']

    for class_name in classes_to_remove:
        for element in document.select('.{selector}'.format(selector=class_name)):
            element['class'].remove( class_name )

    for center in document.select('.center'):
        center['class'] = 'text-center'

    for paragraph in document.select('p'):
        # Decide first, then decompose exactly once: destroying the paragraph
        # while still walking its children could decompose it repeatedly.
        if any( child == '\xa0' or not child for child in paragraph.contents ):
            paragraph.decompose()

    tables = document.select('table')

    if tables:
        tablefilename = 'logs/weg/tables.list'
        link = link if isinstance( link, str ) else link.attrs['href']
        table_log = '[ {nid} ]: {link}\n'.format(link=link, nid=nid)
        attributes_to_remove = ['cellpadding', 'border', 'cellspacing', 'width', 'height']

        for table in tables:
            responsive = document.new_tag('div')
            responsive['class'] = 'table-responsive'
            table.wrap( responsive )

            # A table may have no class at all, or a plain-string class assigned
            # by the '.center' step above; normalise to a list before extending.
            existing_classes = table.get('class') or []
            if isinstance( existing_classes, str ):
                existing_classes = existing_classes.split()
            table['class'] = list( existing_classes ) + ['table', 'table-bordered', 'table-hover']

            for attribute in attributes_to_remove:
                del table[ attribute ]

        if os.path.isfile( tablefilename ):
            logged_tables = helper.read_file( tablefilename )

            if link not in logged_tables:
                helper.create_file(tablefilename, table_log)
            else:
                log.warning('Tabela já adicionada para a lista [ {url} ]'.format(url=link))
        else:
            helper.create_file(tablefilename, table_log)
            log.success('Log de tabelas criado.')

    for index, anchor in enumerate( document.select('a') ):
        if 'href' not in anchor.attrs:
            continue

        # The extension is taken from the href as found in the document,
        # before the placeholder rewrite below.
        file_extension = os.path.splitext( anchor.attrs['href'] )[1]

        if anchor.attrs['href'] == 'javascript:void();':
            anchor.attrs['href'] = '#{nid}'.format(nid=nid)
            anchor.attrs['data-prevent-default'] = 'true'

        if file_extension in allowed_images_extension:
            set_image( news, index, anchor.attrs['href'] )
            anchor.attrs['href'] = set_image_link( news, index, anchor.attrs['href'] )

    for index, image in enumerate( document.select('img') ):
        source = image.attrs.get('src')

        # <img> without a src has nothing to download or rewrite.
        if not source:
            continue

        if os.path.splitext( source )[1] in allowed_images_extension:
            set_image( news, index, source )
            image.attrs['src'] = set_image_link( news, index, source )

        # NOTE: adding an 'img-responsive' class to images outside '.coluna6'
        # was sketched here previously but is intentionally not applied.

    for column in document.select('.coluna6'):
        column['class'] = 'xtt-gallery pull-right'

    for ul in document.select('ul'):
        ul['class'] = 'xtt-list-style'

        # Direct <li> children only (the same set the '> li' selector addressed).
        for li in ul.find_all( 'li', recursive=False ):
            children = list( li.contents )

            # An empty item has nothing to wrap.
            if not children:
                continue

            # Move every child into the span so nested markup is preserved
            # instead of keeping only the first node.
            span = document.new_tag('span')
            for child in children:
                span.append( child.extract() )
            li.append( span )

    return str( document ).strip()