import os
from tempfile import NamedTemporaryFile


def test_save_file_description():
    """save_file_description should write the text to disk and normalize line endings to CRLF."""
    with NamedTemporaryFile() as f:
        working_dir, file_name = os.path.split(f.name)
        file_path = save_file_description(working_dir, file_name,
                                          "this is a test")
        assert file_path == f.name
        assert f.read().decode("utf-8") == "this is a test"

    with NamedTemporaryFile() as f:
        working_dir, file_name = os.path.split(f.name)
        file_path = save_file_description(working_dir, file_name,
                                          "this\ris\na\r\ntest")
        assert file_path == f.name
        assert f.read().decode("utf-8") == "this\r\nis\r\na\r\ntest"
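The assertions above pin down the helper's contract: it joins working_dir and file_name, writes the description as UTF-8, normalizes bare CR and bare LF line endings to CRLF, and returns the resulting path. A minimal sketch that satisfies this test (the real implementation may differ):

import os

def save_file_description(working_dir, file_name, file_description):
    # Normalize every line ending to CRLF: collapse CRLF and CR to LF
    # first, then expand LF to CRLF so mixed inputs come out uniform.
    normalized = (file_description.replace("\r\n", "\n")
                                  .replace("\r", "\n")
                                  .replace("\n", "\r\n"))
    file_path = os.path.join(working_dir, file_name)
    with open(file_path, "wb") as f:
        f.write(normalized.encode("utf-8"))
    return file_path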
    def finalize_zip_contents(self):
        """Add the data dictionary and file description to the zip, then record the final download size."""
        self.filepaths_to_delete.append(self.working_dir_path /
                                        "Data_Dictionary_Crosswalk.xlsx")

        add_data_dictionary_to_zip(str(self.zip_file_path.parent),
                                   str(self.zip_file_path))

        file_description = build_file_description(str(self.readme_path),
                                                  dict())
        file_description_path = save_file_description(
            str(self.zip_file_path.parent), self.readme_path.name,
            file_description)
        self.filepaths_to_delete.append(Path(file_description_path))
        append_files_to_zip_file([file_description_path],
                                 str(self.zip_file_path))
        self.total_download_size = self.zip_file_path.stat().st_size
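finalize_zip_contents (a method excerpted here from its zip-building class) leans on append_files_to_zip_file, which is not shown. Assuming it simply adds each file to an existing archive under its basename, a plausible sketch using the standard-library zipfile module:

import os
import zipfile

def append_files_to_zip_file(file_paths, zip_file_path):
    # Open the archive in append mode and store each file by its
    # basename so the zip's internal paths stay flat.
    with zipfile.ZipFile(zip_file_path, "a", compression=zipfile.ZIP_DEFLATED) as zip_file:
        for file_path in file_paths:
            zip_file.write(file_path, arcname=os.path.basename(file_path))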
def generate_download(download_job: DownloadJob,
                      origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}",
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)
        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(source, download_job, working_dir, piid,
                                       assistance_id, zip_file_path,
                                       file_format)
            else:
                download_job.number_of_columns += source_column_count
                parse_source(source, columns, download_job, working_dir, piid,
                             assistance_id, zip_file_path, limit, file_format)
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(
                include_file_description["source"], sources)
            file_description = file_description.replace(
                "[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"],
                file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path,
                             os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s",
                download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
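For reference, the json_request keys read at the top of generate_download imply a payload along these lines. Every value below is illustrative, not taken from the source; only the key names come from the json_request.get(...) calls above.

import json

example_json_request = json.dumps({
    "columns": None,                # or an explicit list of column names
    "limit": None,                  # or a row cap
    "piid": None,
    "award_id": 12345,              # spliced into the description as [AWARD_ID]
    "assistance_id": None,
    "file_format": "csv",
    "include_data_dictionary": True,
    "include_file_description": {
        "source": "path/to/readme_template.txt",  # hypothetical template path
        "destination": "Download_Readme.txt",     # hypothetical file name in the zip
    },
})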
def generate_csvs(download_job):
    """Derive the relevant file location and write CSVs to it"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)
    piid = json_request.get('piid', None)

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)
        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, piid, zip_file_path, limit)
        include_file_description = json_request.get('include_file_description')
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(message='Uploading took {} seconds'.format(time.perf_counter() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return finish_download(download_job)
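All three functions hand the finished archive to multipart_upload, whose implementation is not shown. The helper name and argument order come from the calls above; everything else in this sketch is an assumption about how it plausibly works with boto3:

import boto3
from boto3.s3.transfer import TransferConfig

def multipart_upload(bucket, region, source_path, key_name):
    # boto3's upload_file switches to a multipart upload automatically
    # once the file crosses the configured size threshold.
    s3_client = boto3.client("s3", region_name=region)
    transfer_config = TransferConfig(multipart_threshold=8 * 1024 * 1024)  # assumed threshold
    s3_client.upload_file(source_path, bucket, key_name, Config=transfer_config)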
def generate_download(download_job: DownloadJob,
                      origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")
    request_type = json_request.get("request_type")

    span = tracer.current_span()
    if span and request_type:
        span.resource = request_type

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}",
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)
        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(source, download_job, working_dir, piid,
                                       assistance_id, zip_file_path,
                                       file_format)
            else:
                download_job.number_of_columns += source_column_count
                parse_source(source, columns, download_job, working_dir, piid,
                             assistance_id, zip_file_path, limit, file_format)
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(
                include_file_description["source"], sources)
            file_description = file_description.replace(
                "[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"],
                file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    # push file to S3 bucket, if not local
    if not settings.IS_LOCAL:
        with tracer.trace(
                name=f"job.{JOB_TYPE}.download.s3",
                service="bulk-download",
                resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}",
                span_type=SpanTypes.WORKER,
        ) as span, tracer.trace(
                name="s3.command",
                service="aws.s3",
                resource=".".join([
                    multipart_upload.__module__,
                    (multipart_upload.__qualname__
                     or multipart_upload.__name__)
                ]),
                span_type=SpanTypes.WEB,
        ) as s3_span:
            # NOTE: Traces still not auto-picking-up aws.s3 service upload activity
            # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach
            span.set_tag("file_name", file_name)
            try:
                bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
                region = settings.USASPENDING_AWS_REGION
                s3_span.set_tags({
                    "bucket": bucket,
                    "region": region,
                    "file": zip_file_path
                })
                start_uploading = time.perf_counter()
                multipart_upload(bucket, region, zip_file_path,
                                 os.path.basename(zip_file_path))
                write_to_log(
                    message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s",
                    download_job=download_job)
            except Exception as e:
                # Set error message; job_status_id will be set in download_sqs_worker.handle()
                exc_msg = "An exception was raised while attempting to upload the file"
                fail_download(download_job, e, exc_msg)
                if isinstance(e, InvalidParameterException):
                    raise InvalidParameterException(e)
                else:
                    raise Exception(download_job.error_message) from e
            finally:
                # Remove generated file
                if os.path.exists(zip_file_path):
                    os.remove(zip_file_path)
                _kill_spawned_processes(download_job)

    return finish_download(download_job)
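_kill_spawned_processes runs in every finally block but is not defined in this excerpt. One plausible implementation, assuming its job is to reap child processes left behind by the parse_source workers, using psutil:

import os
import psutil

def _kill_spawned_processes(download_job=None):
    # Kill any children of the current worker process so a failed or
    # finished job does not leave orphaned data-writer processes behind.
    parent = psutil.Process(os.getpid())
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass  # the child already exited on its own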