def _prevalidate_file(config, file_descriptor, file_path, file_type, run_id, schema, files_to_delete, logger):
    """Pre-validate the input file using the CSV validator.

    Splits the input into batch files, then validates each batch
    concurrently on a thread pool sized by the multiprocessing config.
    Every produced batch file is registered in ``files_to_delete`` for
    later cleanup. Raises whatever exception a validation worker raised.
    Returns True on success.
    """
    base_name = '{0}_import_{1}_split'.format(file_type, run_id)
    parts = list(
        split_file(file_descriptor, config.import_config.batch_size,
                   dirname(file_path), logger, base_name))
    files_to_delete.extend(parts)

    total = len(parts)
    done_count = 0
    worker_count = config.multiprocessing_config.max_local_cpus

    with futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        logger.info(
            'Pre-validating file {0} ({1} pre-validation workers)'.format(
                basename(file_path), worker_count))
        pending = [pool.submit(prevalidate_file, part, schema) for part in parts]
        for completed in futures.as_completed(pending):
            # Re-raise any exception thrown inside the worker thread.
            completed.result()
            done_count += 1
            # Only print every 50 batches at INFO so as not to spam console.
            noisy = done_count % 50 == 0 or done_count == total
            level = logging.INFO if noisy else logging.DEBUG
            logger.log(
                level,
                'Pre-validated {validated_batches} of {num_batches} batches'.
                format(validated_batches=done_count, num_batches=total))

    logger.info('Successfully pre-validated file: {0}'.format(
        basename(file_path)))
    return True
def _split_file(self, file_to_split):
    """Unzip the input file into split batch files in the work directory.

    Generator: each batch filename produced by ``split_file`` is first
    recorded in ``self._files_to_delete`` (so it is cleaned up later)
    and then yielded to the caller. Logs the final batch count once the
    input is exhausted.
    """
    base_name = '{0}_import_{1}_split'.format(
        self._import_type, self.import_id)
    batch_count = 0
    for batch_count, batch_path in enumerate(
            split_file(file_to_split, self._batch_size, self._extract_dir,
                       self._logger, base_name),
            start=1):
        self._files_to_delete.append(batch_path)
        yield batch_path
    self._logger.info(
        'Finished splitting input file into {num_batches} batches'.format(
            num_batches=batch_count))