Пример #1
0
def _prevalidate_file(config, file_descriptor, file_path, file_type, run_id,
                      schema, files_to_delete, logger):
    """Pre-validate the input file using the CSV validator.

    The input is split into batch files with ``split_file`` and every batch
    is handed to ``prevalidate_file`` on a thread pool. Progress is logged at
    DEBUG for each batch and at INFO for every 50th batch and the last one.

    Args:
        config: Object providing ``import_config.batch_size`` and
            ``multiprocessing_config.max_local_cpus`` (used as pool size).
        file_descriptor: Input handed straight to ``split_file``.
        file_path: Path of the input file; its directory is passed to
            ``split_file`` and its basename is used in log messages.
        file_type: Used to build the base name for the split files.
        run_id: Used to build the base name for the split files.
        schema: Passed to ``prevalidate_file`` together with each batch.
        files_to_delete: List that receives the name of every split file.
        logger: Logger used for progress messages and passed to
            ``split_file``.

    Returns:
        True once every batch has been pre-validated.

    Raises:
        Any exception raised by ``prevalidate_file`` in a worker thread is
        re-raised here; batches that have not started yet are cancelled.
    """
    split_file_basename = '{0}_import_{1}_split'.format(file_type, run_id)

    # Register each split file for cleanup as soon as it is produced, so a
    # failure partway through splitting does not leave earlier batch files
    # untracked. (Only helps if split_file produces results lazily, which is
    # how it is consumed elsewhere - NOTE(review): confirm.)
    split_files = []
    for split_filename in split_file(file_descriptor,
                                     config.import_config.batch_size,
                                     dirname(file_path), logger,
                                     split_file_basename):
        files_to_delete.append(split_filename)
        split_files.append(split_filename)

    num_batches = len(split_files)
    num_workers = config.multiprocessing_config.max_local_cpus
    with futures.ThreadPoolExecutor(max_workers=num_workers) as prevalidator:
        logger.info(
            'Pre-validating file {0} ({1} pre-validation workers)'.format(
                basename(file_path), num_workers))
        tasks = [
            prevalidator.submit(prevalidate_file, f, schema)
            for f in split_files
        ]
        try:
            for num_validated_batches, task in enumerate(
                    futures.as_completed(tasks), start=1):
                # Re-raises any exception thrown inside the worker thread.
                task.result()
                # Only print every 50 batches (and the final one) at INFO so
                # as not to spam the console.
                if (num_validated_batches % 50 == 0
                        or num_validated_batches == num_batches):
                    lvl = logging.INFO
                else:
                    lvl = logging.DEBUG
                logger.log(
                    lvl,
                    'Pre-validated {validated_batches} of {num_batches} batches'.
                    format(validated_batches=num_validated_batches,
                           num_batches=num_batches))
        except BaseException:
            # Leaving the ``with`` block waits for all submitted work; drop
            # everything still queued so the failure surfaces promptly.
            # Cancelling a running or finished future is a harmless no-op.
            for task in tasks:
                task.cancel()
            raise
        logger.info('Successfully pre-validated file: {0}'.format(
            basename(file_path)))
        return True
Пример #2
0
    def _split_file(self, file_to_split):
        """Yield the batch files obtained by splitting the input file.

        Delegates to ``split_file`` with this instance's batch size, extract
        directory and logger. Each batch filename is recorded in
        ``self._files_to_delete`` before it is yielded, and the total number
        of batches is logged at INFO once the splitter is exhausted.
        """
        batch_basename = '{0}_import_{1}_split'.format(
            self._import_type, self.import_id)
        batches = split_file(file_to_split, self._batch_size,
                             self._extract_dir, self._logger,
                             batch_basename)

        # Stays at 0 when the splitter produces nothing.
        batch_count = 0
        for batch_count, batch_path in enumerate(batches, start=1):
            # Track the file for later cleanup before handing it out.
            self._files_to_delete.append(batch_path)
            yield batch_path

        summary = (
            'Finished splitting input file into {num_batches} batches'.format(
                num_batches=batch_count))
        self._logger.info(summary)