def test_http_uri(requests_mock):
    """ Tests retrieving a file object from an http/https URL """
    mock_obj = Mock()
    mock_obj.content = b'test,content'
    requests_mock.return_value = mock_obj

    with RetrieveFileFromUri('http://this.is.url/file.csv').get_file_object() as url_file:
        first_line = url_file.readline()
    assert first_line == b'test,content'

    with RetrieveFileFromUri('https://this.is.url/file.csv').get_file_object() as url_file:
        first_line = url_file.readline()
    assert first_line == b'test,content'
def test_file_uri():
    """ Tests a URI pointing to a file """
    file_path = 'tests/integration/data/file_content.csv'
    with RetrieveFileFromUri(file_path, binary_data=False).get_file_object() as fabs_file:
        first_line = fabs_file.readline()
    assert first_line == 'test,content\n'

    file_path = 'file://' + os.path.join(CONFIG_BROKER['path'], 'tests', 'integration', 'data', 'file_content.csv')
    with RetrieveFileFromUri(file_path, binary_data=False).get_file_object() as fabs_file:
        first_line = fabs_file.readline()
    assert first_line == 'test,content\n'
def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')

        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()

        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
def test_bad_scheme():
    """ Tests an invalid scheme """
    error_text = "Scheme 'ftp' isn't supported. Try one of these: ('http', 'https', 's3', 'file', '')"

    with pytest.raises(NotImplementedError) as resp_except:
        RetrieveFileFromUri('ftp://this.is.a.bad.scheme')

    assert str(resp_except.value) == error_text
def get_client(ssh_key=None):
    """ Connects to the SAM client and returns a usable object for interaction

        Arguments:
            ssh_key: private ssh key to connect to the secure API

        Returns:
            client object to interact with the SAM service
    """
    sam_config = CONFIG_BROKER.get('sam_duns')
    if not sam_config:
        return None

    if ssh_key:
        host = sam_config.get('host_ssh')
        username = sam_config.get('username_ssh')
        password = sam_config.get('password_ssh')

        ssh_key_file = RetrieveFileFromUri(ssh_key, binary_data=False).get_file_object()
        pkey = paramiko.RSAKey.from_private_key(ssh_key_file, password=sam_config.get('ssh_key_password'))
    else:
        host = sam_config.get('host')
        username = sam_config.get('username')
        password = sam_config.get('password')
        pkey = None

    if None in (host, username, password) or (ssh_key and not pkey):
        raise Exception("Missing config elements for connecting to SAM")

    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname=host, username=username, password=password, pkey=pkey)
    return client
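
# Usage sketch (not part of the original module): one way the paramiko client returned by
# get_client() could be used to pull an extract over SFTP. The remote directory and file
# name below are hypothetical placeholders, not values taken from the SAM config.
def download_sam_extract_sketch(remote_dir='/hypothetical/remote/dir', file_name='SAM_EXTRACT.ZIP',
                                local_path='/tmp/SAM_EXTRACT.ZIP'):
    """ Illustrative helper only: downloads a single file from the SAM SFTP host. """
    client = get_client()
    if client is None:
        # get_client() returns None when no 'sam_duns' config is present
        return
    try:
        sftp = client.open_sftp()
        # Check the remote listing before attempting the download
        if file_name in sftp.listdir(remote_dir):
            sftp.get('{}/{}'.format(remote_dir, file_name), local_path)
    finally:
        client.close()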
def move_updated_error_metadata(sess):
    """ Moving the last published error metadata for updated submissions.

        Args:
            sess: connection to database
    """
    logger.info('Moving updated error metadata')

    # Get a list of all jobs for updated submissions (these can't be FABS but we'll filter in case there's a bug)
    updated_job_list = sess.query(Job.job_id).join(Submission, Job.submission_id == Submission.submission_id). \
        filter(Submission.is_fabs.is_(False), Submission.publish_status_id == PUBLISH_STATUS_DICT['updated']). \
        all()

    # Delete all current updated entries to prevent duplicates
    sess.query(PublishedErrorMetadata).filter(PublishedErrorMetadata.job_id.in_(updated_job_list)). \
        delete(synchronize_session=False)

    # Create a CTE of the max publish history IDs for updated submissions (DABS only)
    max_publish_history = sess.query(func.max(PublishHistory.publish_history_id).label('max_publish_id'),
                                     PublishHistory.submission_id.label('submission_id')).\
        join(Submission, PublishHistory.submission_id == Submission.submission_id).\
        filter(Submission.publish_status_id == PUBLISH_STATUS_DICT['updated'], Submission.is_fabs.is_(False)).\
        group_by(PublishHistory.submission_id).cte('max_publish_history')

    # Get the publish history associated with all of the warning files
    publish_history_list = sess.query(PublishedFilesHistory.publish_history_id, PublishedFilesHistory.submission_id,
                                      PublishedFilesHistory.warning_filename).\
        join(max_publish_history,
             max_publish_history.c.max_publish_id == PublishedFilesHistory.publish_history_id).\
        filter(PublishedFilesHistory.warning_filename.isnot(None)).order_by(PublishedFilesHistory.submission_id).\
        distinct()

    # Creating temporary error table and truncating in case something went wrong in this script before
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS temp_error_file (
            field_name text,
            error_message text,
            row_number integer,
            value_provided text,
            rule_label text,
            source_file integer,
            target_file integer,
            job_id integer,
            severity_id integer,
            filename text,
            error_type_id integer
        );
    """
    sess.execute(create_table_sql)
    sess.execute('TRUNCATE TABLE temp_error_file')
    sess.commit()

    # Loop through each unique publish history to get relevant details
    for publish_history in publish_history_list:
        logger.info('Moving error metadata from file: {}'.format(publish_history.warning_filename))

        warning_file_path = publish_history.warning_filename
        file_name = os.path.basename(warning_file_path)

        # If it's not local, we need to add the bucket to the stored path
        if not CONFIG_BROKER['local']:
            warning_file_path = 's3://' + CONFIG_BROKER['certified_bucket'] + '/' + warning_file_path

        with RetrieveFileFromUri(warning_file_path, 'r').get_file_object() as warning_file:
            warning_df = pd.read_csv(warning_file, dtype=str)

        # Only bother processing if there's actual data in the warning file
        if not warning_df.empty:
            # Cross-file and single file validations are slightly different so we have to treat them differently
            if 'cross_warning' in warning_file_path:
                field_map = {
                    'Field names': 'field_name',
                    'Error message': 'error_message',
                    'Row number': 'row_number',
                    'Values provided': 'value_provided',
                    'Rule label': 'rule_label',
                    'Source File': 'source_file',
                    'Target File': 'target_file'
                }
                relevant_job = sess.query(Job).filter_by(submission_id=publish_history.submission_id,
                                                         job_type_id=JOB_TYPE_DICT['validation']).one()
                warning_df['filename'] = 'cross_file'
            else:
                field_map = {
                    'Field name': 'field_name',
                    'Error message': 'error_message',
                    'Row number': 'row_number',
                    'Value provided': 'value_provided',
                    'Rule label': 'rule_label'
                }
                file_type_match = re.match('submission_{}_(.+)_warning_report.csv'.
                                           format(publish_history.submission_id), file_name)
                file_type = file_type_match.groups()[0]
                warning_df['source_file'] = file_type
                warning_df['target_file'] = None
                relevant_job = sess.query(Job).filter_by(submission_id=publish_history.submission_id,
                                                         job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                                                         file_type_id=FILE_TYPE_DICT[file_type]).one()
                warning_df['filename'] = relevant_job.filename

            warning_df['job_id'] = relevant_job.job_id
            warning_df['severity_id'] = RULE_SEVERITY_DICT['warning']

            warning_df.rename(columns=field_map, inplace=True)

            warning_df['source_file'] = warning_df.apply(lambda x: convert_file_type_to_int(x, 'source_file'), axis=1)
            warning_df['target_file'] = warning_df.apply(lambda x: convert_file_type_to_int(x, 'target_file'), axis=1)
            warning_df['error_type_id'] = warning_df.apply(lambda x: derive_error_type_id(x), axis=1)

            # Replace the word "None" anywhere in the dataframe with an actual None
            warning_df = warning_df.replace('None', np.nan)

            insert_dataframe(warning_df, 'temp_error_file', sess.connection())
            sess.commit()

            # Transfer contents of file to published error metadata
            insert_sql = """
                INSERT INTO published_error_metadata (
                    created_at,
                    updated_at,
                    job_id,
                    filename,
                    field_name,
                    error_type_id,
                    occurrences,
                    first_row,
                    rule_failed,
                    file_type_id,
                    target_file_type_id,
                    original_rule_label,
                    severity_id
                )
                SELECT
                    NOW(),
                    NOW(),
                    job_id,
                    filename,
                    field_name,
                    error_type_id,
                    COUNT(1),
                    MIN(row_number),
                    error_message,
                    source_file,
                    target_file,
                    rule_label,
                    severity_id
                FROM temp_error_file
                GROUP BY job_id,
                    filename,
                    field_name,
                    error_type_id,
                    error_message,
                    source_file,
                    target_file,
                    rule_label,
                    severity_id
            """
            sess.execute(insert_sql)
            sess.execute('TRUNCATE TABLE temp_error_file')
            sess.commit()

    sess.execute('DROP TABLE temp_error_file')
    sess.commit()

    logger.info('Updated error metadata moved')