def test_from_file(self):
    """BormeXML.from_file: local path, https URL, insecure URL, bad path."""
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    day_before = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    day_after = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    # Parse from a local file.
    path = os.path.join(EXAMPLES_PATH, 'BORME-S-20150924.xml')
    bxml = BormeXML.from_file(path)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, path)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)
    url = bxml.get_url_cve("BORME-A-2015-183-04")
    self.assertEqual(url, "https://www.boe.es/borme/dias/2015/09/24/pdfs/BORME-A-2015-183-04.pdf")

    # Parse from a remote file over https.
    bxml = BormeXML.from_file(self.url)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # Parse from a remote file over plain http (secure=False).
    bxml = BormeXML.from_file(self.url, secure=False)
    self.assertEqual(bxml.url, self.url_insecure)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # A nonexistent local path must raise IOError.
    self.assertRaises(IOError, BormeXML.from_file, 'invalidfile.xml')
def test_from_file(self):
    """BormeXML.from_file: local path, https URL, insecure URL, bad path."""
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    day_before = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    day_after = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    # Parse from a local file.
    path = os.path.join(EXAMPLES_PATH, 'BORME-S-20150924.xml')
    bxml = BormeXML.from_file(path)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, path)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # Parse from a remote file over https.
    bxml = BormeXML.from_file(self.url)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # Parse from a remote file with certificate checks disabled.
    bxml = BormeXML.from_file(self.url, secure=False)
    self.assertEqual(bxml.url, self.url_insecure)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # A nonexistent local path must raise IOError.
    self.assertRaises(IOError, BormeXML.from_file, 'invalidfile.xml')
def test_from_date(self):
    """BormeXML.from_date accepts (y, m, d) tuples and date objects."""
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    day_before = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    day_after = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    # From a (year, month, day) tuple.
    bxml = BormeXML.from_date(self.date)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # From a datetime.date instance.
    bxml = BormeXML.from_date(datetime.date(*self.date))
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # A date with no published BORME must raise.
    self.assertRaises(BormeDoesntExistException, BormeXML.from_date, (2015, 9, 26))

    expected_cves = ['BORME-A-2015-183-%s' % x
                     for x in ['01', '03', '04', '06', '07', '08', '09', '10',
                               '11', '12', '14', '15', '16', '22', '25', '26',
                               '28', '29', '30', '31', '32', '33', '34', '35',
                               '36', '38', '39', '40', '41', '43', '46', '47',
                               '49', '50', '51']]
    self.assertEqual(bxml.get_cves(SECCION.A), expected_cves)
def download_range(begin, end, directory, seccion, provincia=None):
    """Download the BORME PDFs published between *begin* and *end*.

    For each publication day the summary XML is loaded from disk (or
    downloaded if missing/provisional), then the day's PDFs are fetched
    into the per-date directory. Iteration follows ``bxml.next_borme``.

    :param begin: first date to download
    :param end: last date to download (inclusive)
    :param directory: base storage directory
    :param seccion: BORME section to download
    :param provincia: optional province filter
    """
    next_date = begin
    total_downloaded = 0

    while next_date and next_date <= end:
        path = get_borme_pdf_path(next_date, directory)
        xml_path = get_borme_xml_filepath(next_date, directory)
        logger.info('\nDownloading files from {} (sección {}) to {}\n'.format(
            next_date, seccion, path))
        try:
            bxml = BormeXML.from_file(xml_path)
            if bxml.next_borme:
                # BUG FIX: these three debug messages lacked the {filename}
                # replacement field, so .format(filename=...) never inserted
                # the file name into the log output.
                logger.debug('{filename} already exists!'.format(
                    filename=os.path.basename(xml_path)))
            else:
                # Cached XML is provisional (no next_borme yet): refresh it.
                logger.debug('Re-downloading {filename}'.format(
                    filename=os.path.basename(xml_path)))
                bxml = BormeXML.from_date(next_date)
                try:
                    os.makedirs(os.path.dirname(xml_path))
                except OSError:
                    pass
                bxml.save_to_file(xml_path)
        except IOError:
            # No cached XML at all: download and save it.
            logger.debug('Downloading {filename}'.format(
                filename=os.path.basename(xml_path)))
            bxml = BormeXML.from_date(next_date)
            try:
                os.makedirs(os.path.dirname(xml_path))
            except OSError:
                pass
            bxml.save_to_file(xml_path)

        try:
            os.makedirs(path)
        except OSError:
            pass

        _, files = bxml.download_borme(path, provincia=provincia,
                                       seccion=seccion)
        if len(files) > 0:
            logger.info('Downloaded {} files from {}'.format(
                len(files), next_date))
            total_downloaded += len(files)
        next_date = bxml.next_borme

    logger.info('\n{} total files were downloaded'.format(total_downloaded))
def check_range(begin, end, download_xml=False):
    """Verify that every PDF referenced by the BORME XMLs in [begin, end]
    exists on disk and has the expected size.

    (Docstring fixed: this function checks files, it does not download
    PDFs with threads.)

    :param begin: first date to check
    :param end: last date to check (inclusive)
    :param download_xml: if True, download missing summary XML files
    """
    next_date = begin
    seccion = bormeparser.SECCION.A
    results = {'good': 0, 'notfound': 0, 'incorrect': 0}

    while next_date and next_date <= end:
        logger.info('Checking %s\n' % next_date.isoformat())
        xml_path = get_borme_xml_filepath(next_date)
        try:
            bxml = BormeXML.from_file(xml_path)
        except FileNotFoundError:
            if download_xml:
                logger.info('Downloading %s' % os.path.basename(xml_path))
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)
            else:
                logger.info('XML not found: %s\n' % os.path.basename(xml_path))
                logger.info('If you want to continue specify download_xml=True\n')
                return

        sizes = bxml.get_sizes(seccion)
        path = get_borme_pdf_path(bxml.date)
        for cve, size in sizes.items():
            logger.debug('Checking %s... ' % cve)
            filename = '%s.pdf' % cve
            filepath = os.path.join(path, filename)
            if not os.path.exists(filepath):
                # FIX: logger.warn() is a deprecated alias of warning().
                logger.warning('%s: PDF not found\n' % filepath)
                results['notfound'] += 1
                continue
            if os.path.getsize(filepath) != size:
                results['incorrect'] += 1
                logger.warning('%s: PDF size is incorrect (is %d but should be %d)\n'
                               % (filepath, os.path.getsize(filepath), size))
                continue
            results['good'] += 1
            logger.debug('OK\n')
        next_date = bxml.next_borme

    print('\nResults:')
    print('\tGood: %d' % results['good'])
    print('\tIncorrect: %d' % results['incorrect'])
    print('\tNot found: %d' % results['notfound'])
def test_from_date(self):
    """BormeXML.from_date accepts (y, m, d) tuples and date objects."""
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    day_before = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    day_after = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    # From a (year, month, day) tuple.
    bxml = BormeXML.from_date(self.date)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # From a datetime.date instance.
    bxml = BormeXML.from_date(datetime.date(*self.date))
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # A date with no published BORME must raise.
    self.assertRaises(BormeDoesntExistException, BormeXML.from_date,
                      (2015, 9, 26))
def update_previous_xml(date):
    """Given a date, check whether the previous day's BORME XML is final
    and, if it is not, download it again.

    :param date: the reference date whose *previous* BORME XML is checked
    :returns: True if the previous XML was (re-)downloaded, False if it
        was already final.
    """
    xml_path = get_borme_xml_filepath(date)
    bxml = BormeXML.from_file(xml_path)

    prev_xml_path = get_borme_xml_filepath(bxml.prev_borme)
    try:
        prev_bxml = BormeXML.from_file(prev_xml_path)
        if prev_bxml.is_final:
            # Already definitive: nothing to do.
            return False
        # Provisional copy: remove it so it can be replaced below.
        os.unlink(prev_xml_path)
    except FileNotFoundError:
        pass

    # BUG FIX: this download used to sit in a ``finally`` block, which runs
    # even on the ``return False`` above — the final XML was re-downloaded
    # and overwritten needlessly every time.
    prev_bxml = BormeXML.from_date(bxml.prev_borme)
    prev_bxml.save_to_file(prev_xml_path)
    return True
def test_from_date(self):
    """BormeXML.from_date accepts (y, m, d) tuples and date objects."""
    want_date = datetime.date(self.date[0], self.date[1], self.date[2])
    want_prev = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    want_next = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    for candidate in (self.date, datetime.date(*self.date)):
        # Both a tuple and a date object must produce the same BORME.
        bxml = BormeXML.from_date(candidate)
        self.assertEqual(bxml.url, self.url)
        self.assertEqual(bxml.date, want_date)
        self.assertEqual(bxml.filename, None)
        self.assertEqual(bxml.nbo, self.nbo)
        self.assertEqual(bxml.prev_borme, want_prev)
        self.assertEqual(bxml.next_borme, want_next)

    # A date with no published BORME must raise.
    self.assertRaises(BormeDoesntExistException, BormeXML.from_date,
                      (2015, 9, 26))
def test_from_file(self):
    """download_xml() + BormeXML.from_file round trip (local, https, http)."""
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    day_before = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    day_after = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    # Download the summary XML into a temp file and parse it from disk.
    path = os.path.join(tempfile.gettempdir(), 'BORME-S-20150924.xml')
    downloaded = download_xml(self.date, path)
    self.assertTrue(downloaded)
    self.assertEqual(os.path.getsize(path), 20534)

    bxml = BormeXML.from_file(path)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, path)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)
    os.unlink(path)

    # Parse from a remote file over https.
    bxml = BormeXML.from_file(self.url)
    self.assertEqual(bxml.url, self.url)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # Parse from a remote file with certificate checks disabled.
    bxml = BormeXML.from_file(self.url, secure=False)
    self.assertEqual(bxml.url, self.url_insecure)
    self.assertEqual(bxml.date, expected_date)
    self.assertEqual(bxml.filename, None)
    self.assertEqual(bxml.nbo, self.nbo)
    self.assertEqual(bxml.prev_borme, day_before)
    self.assertEqual(bxml.next_borme, day_after)

    # A nonexistent local path must raise FileNotFoundError.
    self.assertRaises(FileNotFoundError, BormeXML.from_file, 'invalidfile.xml')
def check_range(begin, end, provincia, seccion, directory, download_xml):
    """Verify that every PDF referenced by the BORME XMLs in [begin, end]
    exists on disk and has the expected size.

    (Docstring fixed: this function checks files, it does not download
    PDFs with threads.)

    :param begin: first date to check
    :param end: last date to check (inclusive)
    :param provincia: province filter passed to get_sizes()
    :param seccion: BORME section to check
    :param directory: base storage directory
    :param download_xml: if True, download missing summary XML files
    """
    next_date = begin
    results = {'good': 0, 'missing': 0, 'incorrect': 0}
    summary = []

    while next_date and next_date <= end:
        logger.info('Checking files from {}'.format(next_date.isoformat()))
        xml_path = get_borme_xml_filepath(next_date, directory)
        logger.debug(xml_path)
        try:
            bxml = BormeXML.from_file(xml_path)
        except IOError:
            if download_xml:
                logger.info('Downloading {}'.format(
                    os.path.basename(xml_path)))
                bxml = BormeXML.from_date(next_date)
                try:
                    os.makedirs(os.path.dirname(xml_path))
                except OSError:
                    pass
                bxml.save_to_file(xml_path)
            else:
                logger.info('Missing XML: {}\n'.format(
                    os.path.basename(xml_path)))
                logger.info('If you want to continue use --download-xml.\n')
                return

        sizes = bxml.get_sizes(seccion, provincia)
        path = get_borme_pdf_path(bxml.date, directory)
        for cve, size in sizes.items():
            logger.debug('Checking {}...'.format(cve))
            filename = cve + '.pdf'
            filepath = os.path.join(path, filename)
            logger.debug(filepath)
            if not os.path.exists(filepath):
                logger.debug('Missing PDF: {}\n'.format(filepath))
                results['missing'] += 1
                continue
            if os.path.getsize(filepath) != size:
                results['incorrect'] += 1
                # FIX: logger.warn() is a deprecated alias of warning().
                logger.warning(
                    '{}: PDF size is incorrect (is {} but should be {})\n'.
                    format(filepath, os.path.getsize(filepath), size))
                summary.append(filepath)
                continue
            results['good'] += 1
            logger.debug('OK\n')
        next_date = bxml.next_borme

    if len(summary) > 0:
        # BUG FIX: summary collects size-mismatched files (appended only in
        # the "incorrect" branch above), so the header said the wrong thing.
        print('\nIncorrect files:')
        print('\n'.join(summary[:10]))
        if len(summary) > 10:
            print('This list is truncated. There are {} files not shown.'.format(
                len(summary) - 10))

    print('\nResults:')
    print('\tGood: {}'.format(results['good']))
    print('\tIncorrect: {}'.format(results['incorrect']))
    print('\tMissing: {}'.format(results['missing']))
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """Import the BORMEs for a range of dates, downloading files as needed.

    strict: stop as soon as a serious error occurs.
    """
    next_date = begin
    # Counters accumulated across every imported day.
    total_results = {'created_anuncios': 0, 'created_bormes': 0, 'created_companies': 0, 'created_persons': 0,
                     'total_anuncios': 0, 'total_bormes': 0, 'total_companies': 0, 'total_persons': 0,
                     'errors': 0}
    total_start_time = time.time()
    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # next_borme is None while the cached XML is provisional:
                # re-download to get the definitive version.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except FileNotFoundError:
                # No cached XML: download and save it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add per-day FileHandlers (one info log, one error log).
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)
            logger.info('============================================================')
            logger.info('Ran import_borme_download at %s' % timezone.now())
            logger.info(' Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
            logger.info('============================================================')
            print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download today's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # '-99.pdf' files are skipped deliberately —
                    # TODO(review): confirm why (index/summary PDF?).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger.error('[X] Una vez arreglado, reanuda la importación:')
                            logger.error('[X] python manage.py importbormetoday local')
                            return False, total_results
            else:
                # Local-only mode: prefer cached JSON files, fall back to PDFs.
                cves = bxml.get_cves(bormeparser.SECCION.A)
                files_json = list(map(lambda x: os.path.join(json_path, '%s.json' % x), cves))
                files_pdf = list(map(lambda x: os.path.join(pdf_path, '%s.pdf' % x), cves))
                if files_exist(files_json):
                    for filepath in files_json:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                elif files_exist(files_pdf):
                    for filepath in files_pdf:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                else:
                    # Neither JSON nor PDF present and downloading is disabled.
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    # Best effort (non-strict): load whichever JSON files exist.
                    for filepath in files_json:
                        if not os.path.exists(filepath):
                            logger.warn('[X] Missing JSON: %s' % filepath)
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))

            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _import1(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _import1:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                        logger.error('[%s] python manage.py importbormetoday local' % borme.cve)
                        return False, total_results

                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # Fold this BORME's counters into the running totals.
                total_results['created_anuncios'] += results['created_anuncios']
                total_results['created_bormes'] += results['created_bormes']
                total_results['created_companies'] += results['created_companies']
                total_results['created_persons'] += results['created_persons']
                total_results['total_companies'] += results['total_companies']
                total_results['total_persons'] += results['total_persons']
                total_results['errors'] += results['errors']
                if not all(map(lambda x: x == 0, total_results.values())):
                    print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove the per-day handlers before moving on to the next date.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme

    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)

    return True, total_results
def setUpClass(cls):
    # Parse the example BORME summary XML once, shared by every test
    # in this class.
    path = os.path.join(EXAMPLES_PATH, 'BORME-S-20150924.xml')
    cls.bxml = BormeXML.from_file(path)
def _import_borme_download_range(begin, end, seccion, local_only,
                                 strict=False, create_json=True):
    """Import the BORMEs for a date range.

    Iterates over the date range. For each day:

    * Generates the BORME file names from the BORME-XML file
    * Loads the BORME-JSON files, or the BORME-PDF ones if the JSON
      do not exist
    * Imports the BORME data into the DB

    :param begin: date to import from
    :param end: date to import to (inclusive)
    :param seccion: BORME section
    :param local_only: do not download files, only process existing ones
    :param strict: abort the process as soon as an error is found
    :param create_json: create BORME-JSON files
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool
    :rtype: (bool, dict)
    """
    next_date = begin
    # Counters accumulated across every imported day.
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()
    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # next_borme is None while the cached XML is provisional:
                # re-download to get the definitive version.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except OSError:
                # No cached XML: download and save it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add per-day FileHandlers (one info log, one error log).
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports',
                                   directory)
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)
            logger.info(
                "===================================================\n"
                "Ran import_borme_download at {now}\n"
                " Import date: {borme_date}. Section: {section}\n"
                "==================================================="
                .format(now=timezone.now(), section=seccion,
                        borme_date=bxml.date.isoformat()))
            print("\nPATH: {}"
                  "\nDATE: {}"
                  "\nSECCION: {}\n"
                  .format(pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download today's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # '-99.pdf' files are skipped deliberately —
                    # TODO(review): confirm why (index/summary PDF?).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger_resume_import()
                            return False, total_results
            else:
                # Local-only mode: prefer cached JSON files, fall back to PDFs.
                files_json, files_pdf = _generate_borme_files_list(bxml, json_path, pdf_path)
                if files_exist(files_json):
                    bormes, err = _load_and_append(files_json, strict)
                    total_results["total_bormes"] += len(files_json)
                    if err:
                        return False, total_results
                elif files_exist(files_pdf):
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)
                    if err:
                        return False, total_results
                else:
                    # Neither JSON nor PDF present and downloading is disabled.
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    # Best effort (non-strict): try the PDFs anyway.
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)

            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _from_instance(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _from_instance:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger_resume_import(cve=borme.cve)
                        return False, total_results

                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # Fold this BORME's counters into the running totals.
                for key in total_results.keys():
                    total_results[key] += results[key]
                if not all(map(lambda x: x == 0, total_results.values())):
                    _print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove the per-day handlers before moving on to the next date.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme

    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}"
                .format(**total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)

    return True, total_results
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """Import the BORMEs for a range of dates, downloading files as needed.

    strict: stop as soon as a serious error occurs.
    """
    next_date = begin
    # Counters accumulated across every imported day.
    total_results = {'created_anuncios': 0, 'created_bormes': 0, 'created_companies': 0, 'created_persons': 0,
                     'total_anuncios': 0, 'total_bormes': 0, 'total_companies': 0, 'total_persons': 0,
                     'errors': 0}
    total_start_time = time.time()
    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # next_borme is None while the cached XML is provisional:
                # re-download to get the definitive version.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except FileNotFoundError:
                # No cached XML: download and save it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add per-day FileHandlers (one info log, one error log).
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)
            logger.info('============================================================')
            logger.info('Ran import_borme_download at %s' % timezone.now())
            logger.info(' Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
            logger.info('============================================================')
            print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download today's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # '-99.pdf' files are skipped deliberately —
                    # TODO(review): confirm why (index/summary PDF?).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger.error('[X] Una vez arreglado, reanuda la importación:')
                            logger.error('[X] python manage.py importbormetoday local')
                            return False, total_results
            else:
                # Local-only mode: prefer cached JSON files, fall back to PDFs.
                cves = bxml.get_cves(bormeparser.SECCION.A)
                files_json = list(map(lambda x: os.path.join(json_path, '%s.json' % x), cves))
                files_pdf = list(map(lambda x: os.path.join(pdf_path, '%s.pdf' % x), cves))
                if files_exist(files_json):
                    for filepath in files_json:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave (I) en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                elif files_exist(files_pdf):
                    for filepath in files_pdf:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave (II) en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                else:
                    # Neither JSON nor PDF present and downloading is disabled.
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    # Best effort (non-strict): load whichever JSON files exist.
                    for filepath in files_json:
                        if not os.path.exists(filepath):
                            logger.warn('[X] Missing JSON: %s' % filepath)
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave (II) en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))

            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _import1(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _import1:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                        logger.error('[%s] python manage.py importbormetoday local' % borme.cve)
                        return False, total_results

                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # Fold this BORME's counters into the running totals.
                total_results['created_anuncios'] += results['created_anuncios']
                total_results['created_bormes'] += results['created_bormes']
                total_results['created_companies'] += results['created_companies']
                total_results['created_persons'] += results['created_persons']
                total_results['total_companies'] += results['total_companies']
                total_results['total_persons'] += results['total_persons']
                total_results['errors'] += results['errors']
                if not all(map(lambda x: x == 0, total_results.values())):
                    print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove the per-day handlers before moving on to the next date.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme

    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)

    return True, total_results
def test_from_file(self):
    """Exercise BormeXML.from_file() with a local path, a remote HTTPS
    URL and an insecure (HTTP) URL, checking the parsed attributes,
    and verify that an unreadable path raises IOError.
    """
    # Expected dates derived once from the fixture tuple (year, month, day).
    expected_date = datetime.date(self.date[0], self.date[1], self.date[2])
    expected_prev = datetime.date(self.date[0], self.date[1], self.date[2] - 1)
    expected_next = datetime.date(self.date[0], self.date[1], self.date[2] + 1)

    def check_common(bxml, expected_url, expected_filename):
        # Attributes shared by every variant, asserted in the same order
        # as the original per-case checks.
        self.assertEqual(bxml.url, expected_url)
        self.assertEqual(bxml.date, expected_date)
        self.assertEqual(bxml.filename, expected_filename)
        self.assertEqual(bxml.nbo, self.nbo)
        self.assertEqual(bxml.prev_borme, expected_prev)
        self.assertEqual(bxml.next_borme, expected_next)

    # from local file
    path = os.path.join(EXAMPLES_PATH, 'BORME-S-20150924.xml')
    bxml = BormeXML.from_file(path)
    check_common(bxml, self.url, path)
    self.assertEqual(
        bxml.get_url_cve("BORME-A-2015-183-04"),
        "https://www.boe.es/borme/dias/2015/09/24/pdfs/BORME-A-2015-183-04.pdf"
    )

    # from remote file (https)
    check_common(BormeXML.from_file(self.url), self.url, None)

    # from remote file (insecure http)
    check_common(BormeXML.from_file(self.url, secure=False), self.url_insecure, None)

    # Exceptions
    self.assertRaises(IOError, BormeXML.from_file, 'invalidfile.xml')
def check_range(begin, end, provincia, seccion, directory, download_xml):
    """Verify that the BORME PDFs for a date range exist locally and
    have the size announced in the day's BORME-XML summary.

    Walks day by day from *begin* to *end*, reads each day's BORME-XML
    to learn which PDFs should exist (and their sizes), then prints a
    per-file report and overall totals.

    :param begin: first date to check
    :param end: last date to check
    :param provincia: province the check is restricted to
    :param seccion: BORME section
    :param directory: root directory where BORME files are stored
    :param download_xml: if True, download missing BORME-XML summaries;
        otherwise abort when one is missing
    """
    next_date = begin
    results = {'good': 0, 'missing': 0, 'incorrect': 0}
    summary = []

    while next_date and next_date <= end:
        logger.info('Checking files from {}'.format(next_date.isoformat()))
        xml_path = get_borme_xml_filepath(next_date, directory)
        logger.debug(xml_path)

        try:
            bxml = BormeXML.from_file(xml_path)
        except IOError:
            # Summary XML not cached locally.
            if not download_xml:
                logger.info('Missing XML: {}\n'.format(os.path.basename(xml_path)))
                logger.info('If you want to continue use --download-xml.\n')
                return
            logger.info('Downloading {}'.format(os.path.basename(xml_path)))
            bxml = BormeXML.from_date(next_date)
            # exist_ok replaces the old try/except OSError: pass, which
            # also hid genuinely failed directory creation.
            os.makedirs(os.path.dirname(xml_path), exist_ok=True)
            bxml.save_to_file(xml_path)

        sizes = bxml.get_sizes(seccion, provincia)
        path = get_borme_pdf_path(bxml.date, directory)

        for cve, size in sizes.items():
            logger.debug('Checking {}...'.format(cve))
            filepath = os.path.join(path, cve + '.pdf')
            logger.debug(filepath)

            if not os.path.exists(filepath):
                logger.debug('Missing PDF: {}\n'.format(filepath))
                results['missing'] += 1
                continue

            actual_size = os.path.getsize(filepath)  # stat once, not twice
            if actual_size != size:
                results['incorrect'] += 1
                # logger.warn() is a deprecated alias; use warning().
                logger.warning('{}: PDF size is incorrect (is {} but should be {})\n'.format(filepath, actual_size, size))
                summary.append(filepath)
                continue

            results['good'] += 1
            logger.debug('OK\n')

        next_date = bxml.next_borme

    if len(summary) > 0:
        # BUGFIX: summary only collects size mismatches (appended in the
        # incorrect-size branch above), so label the report accordingly
        # instead of the previous misleading "Missing files:" header.
        print('\nFiles with incorrect size:')
        print('\n'.join(summary[:10]))
        if len(summary) > 10:
            print('This list is truncated. There are {} files not shown.'.format(len(summary) - 10))

    print('\nResults:')
    print('\tGood: {}'.format(results['good']))
    print('\tIncorrect: {}'.format(results['incorrect']))
    print('\tMissing: {}'.format(results['missing']))
def _import_borme_download_range(begin, end, seccion, local_only, strict=False, create_json=True):
    """Import BORMEs for a range of dates.

    Iterates over the date range. For each day:
    * Builds the BORME file names from the day's BORME-XML summary
    * Loads the BORME-JSON files, or the BORME-PDF files when the JSON
      ones do not exist
    * Imports the BORME data into the database

    :param begin: first date to import
    :param end: last date to import
    :param seccion: BORME section
    :param local_only: do not download files, only process files already
        present locally
    :param strict: abort the whole process as soon as an error is found
    :param create_json: write a BORME-JSON file for each imported BORME
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool
    :rtype: (bool, dict)
    """
    next_date = begin
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()

    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # Cached XML without a pointer to the next BORME means it
                # was saved before the next issue existed: refresh it so
                # next_date can keep advancing.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except OSError:
                # XML not present on disk: fetch and cache it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Per-day log files: <BORME_LOG_ROOT>/imports/YYYY-MM/DD_{info,error}.txt
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', directory)
            os.makedirs(logpath, exist_ok=True)

            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)

            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)

            logger.info(
                "===================================================\n"
                "Ran import_borme_download at {now}\n"
                " Import date: {borme_date}. Section: {section}\n"
                "===================================================".format(
                    now=timezone.now(), section=seccion,
                    borme_date=bxml.date.isoformat()))
            print("\nPATH: {}"
                  "\nDATE: {}"
                  "\nSECCION: {}\n".format(pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download the day's PDFs and parse them.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # *-99.pdf is the daily summary, not a BORME to import.
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger_resume_import()
                            return False, total_results
            else:
                # Offline mode: prefer cached JSON, fall back to local PDFs.
                files_json, files_pdf = _generate_borme_files_list(
                    bxml, json_path, pdf_path)

                if files_exist(files_json):
                    bormes, err = _load_and_append(files_json, strict)
                    total_results["total_bormes"] += len(files_json)
                    if err:
                        return False, total_results
                elif files_exist(files_pdf):
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)
                    if err:
                        return False, total_results
                else:
                    logger.error(
                        '[X] Faltan archivos PDF y JSON que no se desea descargar.'
                    )
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    # Best-effort: try whatever PDFs do exist.
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)

            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _from_instance(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _from_instance:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error(
                        '[%s] Prueba importar manualmente en modo detallado para ver el error:'
                        % borme.cve)
                    logger.error(
                        '[%s] python manage.py importbormepdf %s -v 3'
                        % (borme.cve, borme.filename))
                    if strict:
                        logger_resume_import(cve=borme.cve)
                        return False, total_results
                    # BUGFIX: on a non-strict failure 'results' is unbound
                    # (NameError on the first borme) or stale from the
                    # previous iteration (silent double-counting). Record
                    # the failure and skip the aggregation for this borme.
                    total_results['errors'] += 1
                    continue

                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # Fold this borme's counters into the running totals.
                for key in total_results:
                    total_results[key] += results[key]

                # Print per-borme results once anything has been counted
                # (any(ints) is equivalent to "not all zero").
                if any(total_results.values()):
                    _print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Detach the per-day log files before moving to the next day.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)

            next_date = bxml.next_borme
    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}".format(
                    **total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)

    return True, total_results