def load_files(self, fallback_city: str, max_workers: Optional[int] = None) -> Tuple[int, int]:
    """Download and analyse the actual file for the File entries in the database.

    Only files without a known filesize and with an oparl access url are
    considered. Returns a ``(successful, failed)`` tuple of file counts.
    """
    # This is partially bound by waiting on external resources, but mostly very
    # cpu intensive, so we can spawn a bunch of processes to make this a lot faster.
    # We need to build a list because mysql connections and process pools don't pair well.
    files = list(
        File.objects.filter(filesize__isnull=True, oparl_access_url__isnull=False)
        .order_by("-id")
        .values_list("id", flat=True)
    )
    logger.info("Downloading and analysing {} files".format(len(files)))
    address_pipeline = AddressPipeline(create_geoextract_data())
    # Only show a progress bar on an interactive terminal, and never in tests.
    pbar = None
    if sys.stdout.isatty() and not settings.TESTING:
        pbar = tqdm(total=len(files))
    failed = 0
    successful = 0
    if not self.force_singlethread:
        # We need to close the database connections, which will be automatically
        # reopened for each process.
        # See https://stackoverflow.com/a/10684672/3549270
        # and https://brobin.me/blog/2017/05/mutiprocessing-in-python-django-management-commands/
        db.connections.close_all()
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for succeeded in executor.map(
                self.download_and_analyze_file,
                files,
                repeat(address_pipeline),
                repeat(fallback_city),
            ):
                # Bugfix: previously only failures were counted in this branch,
                # so the returned "successful" count was always 0 when using
                # the process pool.
                if succeeded:
                    successful += 1
                else:
                    failed += 1
                if pbar:
                    pbar.update()
    else:
        for file in files:
            succeeded = self.download_and_analyze_file(
                file, address_pipeline, fallback_city
            )
            if not succeeded:
                failed += 1
            else:
                successful += 1
            if pbar:
                pbar.update()
    if pbar:
        pbar.close()
    if failed > 0:
        logger.error("{} files failed to download".format(failed))
    return successful, failed
def test_load_file_oom(caplog):
    """A file import exceeding the subprocess RAM limit is reported as failed."""
    importer = MockImporter(BaseLoader({}), force_singlethread=True)
    ram_limit = 1 * 1024 * 1024
    file_ids = list(range(64))
    with override_settings(SUBPROCESS_MAX_RAM=ram_limit):
        failed = importer.load_files_multiprocessing(
            AddressPipeline([]), "München", file_ids
        )
    assert failed == 1
    expected_message = (
        "File 1: Import failed du to excessive memory usage (Limit: 1048576)"
    )
    assert caplog.messages == [expected_message]
def handle(self, *args, **options):
    """Management command entry point: import either the given file ids or all pending files."""
    importer, body = self.get_importer(options)
    ids = options["ids"]
    if not ids:
        # No explicit ids: run the bulk import over all pending files.
        importer.load_files(
            max_workers=options["max_workers"], fallback_city=body.short_name
        )
        return
    # Explicit id list: process each requested file individually.
    address_pipeline = AddressPipeline(create_geoextract_data())
    failed = sum(
        1
        for file in ids
        if not importer.download_and_analyze_file(
            file, address_pipeline, body.short_name
        )
    )
    if failed > 0:
        logger.error("{} files failed to download".format(failed))