def test_http_uri(requests_mock):
    """ Tests retrieving a file object from an http/https URL """
    mock_obj = Mock()
    mock_obj.content = b'test,content'
    requests_mock.return_value = mock_obj

    with RetrieveFileFromUri('http://this.is.url/file.csv').get_file_object() as url_file:
        first_line = url_file.readline()
    assert first_line == b'test,content'

    with RetrieveFileFromUri('https://this.is.url/file.csv').get_file_object() as url_file:
        first_line = url_file.readline()
    assert first_line == b'test,content'
def test_file_uri():
    """ Tests a URI pointing to a file """
    file_path = 'tests/integration/data/file_content.csv'
    with RetrieveFileFromUri(file_path, binary_data=False).get_file_object() as fabs_file:
        first_line = fabs_file.readline()
    assert first_line == 'test,content\n'

    file_path = 'file://' + os.path.join(CONFIG_BROKER['path'], 'tests', 'integration', 'data', 'file_content.csv')
    with RetrieveFileFromUri(file_path, binary_data=False).get_file_object() as fabs_file:
        first_line = fabs_file.readline()
    assert first_line == 'test,content\n'
def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if reload should happen whether there are differences or not
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')

        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()

        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
def test_bad_scheme():
    """ Tests an invalid scheme """
    error_text = "Scheme 'ftp' isn't supported. Try one of these: ('http', 'https', 's3', 'file', '')"

    with pytest.raises(NotImplementedError) as resp_except:
        RetrieveFileFromUri('ftp://this.is.a.bad.scheme')

    assert str(resp_except.value) == error_text
def get_client(ssh_key=None):
    """ Connects to the SAM client and returns a usable object for interaction

        Arguments:
            ssh_key: private ssh key to connect to the secure API

        Returns:
            client object to interact with the SAM service
    """
    sam_config = CONFIG_BROKER.get('sam_duns')
    if not sam_config:
        return None

    if ssh_key:
        host = sam_config.get('host_ssh')
        username = sam_config.get('username_ssh')
        password = sam_config.get('password_ssh')

        ssh_key_file = RetrieveFileFromUri(ssh_key, binary_data=False).get_file_object()
        pkey = paramiko.RSAKey.from_private_key(ssh_key_file, password=sam_config.get('ssh_key_password'))
    else:
        host = sam_config.get('host')
        username = sam_config.get('username')
        password = sam_config.get('password')
        pkey = None

    if None in (host, username, password) or (ssh_key and not pkey):
        raise Exception("Missing config elements for connecting to SAM")

    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname=host, username=username, password=password, pkey=pkey)
    return client
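
# Usage sketch (not part of the original module): one way the paramiko client returned by
# get_client() could be used to pull an extract over SFTP. The remote directory and file
# name below are hypothetical placeholders, not values taken from the SAM config.
def download_sam_extract_sketch(remote_dir='/hypothetical/remote/dir', file_name='SAM_EXTRACT.ZIP',
                                local_path='/tmp/SAM_EXTRACT.ZIP'):
    """ Illustrative helper only: downloads a single file from the SAM SFTP host. """
    client = get_client()
    if client is None:
        # get_client() returns None when no 'sam_duns' config is present
        return
    try:
        sftp = client.open_sftp()
        # Check the remote listing before attempting the download
        if file_name in sftp.listdir(remote_dir):
            sftp.get('{}/{}'.format(remote_dir, file_name), local_path)
    finally:
        client.close()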
def move_updated_error_metadata(sess):
    """ Moving the last published error metadata for updated submissions.

        Args:
            sess: connection to database
    """
    logger.info('Moving updated error metadata')

    # Get a list of all jobs for updated submissions (these can't be FABS but we'll filter in case there's a bug)
    updated_job_list = sess.query(Job.job_id).join(Submission, Job.submission_id == Submission.submission_id). \
        filter(Submission.is_fabs.is_(False), Submission.publish_status_id == PUBLISH_STATUS_DICT['updated']). \
        all()

    # Delete all current updated entries to prevent duplicates
    sess.query(PublishedErrorMetadata).filter(PublishedErrorMetadata.job_id.in_(updated_job_list)). \
        delete(synchronize_session=False)

    # Create a CTE of the max publish history IDs for updated submissions (DABS only)
    max_publish_history = sess.query(func.max(PublishHistory.publish_history_id).label('max_publish_id'),
                                     PublishHistory.submission_id.label('submission_id')).\
        join(Submission, PublishHistory.submission_id == Submission.submission_id).\
        filter(Submission.publish_status_id == PUBLISH_STATUS_DICT['updated'], Submission.is_fabs.is_(False)).\
        group_by(PublishHistory.submission_id).cte('max_publish_history')

    # Get the publish history associated with all of the warning files
    publish_history_list = sess.query(PublishedFilesHistory.publish_history_id, PublishedFilesHistory.submission_id,
                                      PublishedFilesHistory.warning_filename).\
        join(max_publish_history,
             max_publish_history.c.max_publish_id == PublishedFilesHistory.publish_history_id).\
        filter(PublishedFilesHistory.warning_filename.isnot(None)).order_by(PublishedFilesHistory.submission_id).\
        distinct()

    # Creating temporary error table and truncating in case something went wrong in this script before
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS temp_error_file (
            field_name text,
            error_message text,
            row_number integer,
            value_provided text,
            rule_label text,
            source_file integer,
            target_file integer,
            job_id integer,
            severity_id integer,
            filename text,
            error_type_id integer
        );
    """
    sess.execute(create_table_sql)
    sess.execute('TRUNCATE TABLE temp_error_file')
    sess.commit()

    # Loop through each unique publish history to get relevant details
    for publish_history in publish_history_list:
        logger.info('Moving error metadata from file: {}'.format(publish_history.warning_filename))

        warning_file_path = publish_history.warning_filename
        file_name = os.path.basename(warning_file_path)

        # If it's not local, we need to add the bucket to the stored path
        if not CONFIG_BROKER['local']:
            warning_file_path = 's3://' + CONFIG_BROKER['certified_bucket'] + '/' + warning_file_path

        with RetrieveFileFromUri(warning_file_path, 'r').get_file_object() as warning_file:
            warning_df = pd.read_csv(warning_file, dtype=str)

        # Only bother processing if there's actual data in the warning file
        if not warning_df.empty:
            # Cross-file and single file validations are slightly different so we have to treat them differently
            if 'cross_warning' in warning_file_path:
                field_map = {
                    'Field names': 'field_name',
                    'Error message': 'error_message',
                    'Row number': 'row_number',
                    'Values provided': 'value_provided',
                    'Rule label': 'rule_label',
                    'Source File': 'source_file',
                    'Target File': 'target_file'
                }
                relevant_job = sess.query(Job).filter_by(submission_id=publish_history.submission_id,
                                                         job_type_id=JOB_TYPE_DICT['validation']).one()
                warning_df['filename'] = 'cross_file'
            else:
                field_map = {
                    'Field name': 'field_name',
                    'Error message': 'error_message',
                    'Row number': 'row_number',
                    'Value provided': 'value_provided',
                    'Rule label': 'rule_label'
                }
                file_type_match = re.match('submission_{}_(.+)_warning_report.csv'.
                                           format(publish_history.submission_id), file_name)
                file_type = file_type_match.groups()[0]
                warning_df['source_file'] = file_type
                warning_df['target_file'] = None
                relevant_job = sess.query(Job).filter_by(submission_id=publish_history.submission_id,
                                                         job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                                                         file_type_id=FILE_TYPE_DICT[file_type]).one()
                warning_df['filename'] = relevant_job.filename

            warning_df['job_id'] = relevant_job.job_id
            warning_df['severity_id'] = RULE_SEVERITY_DICT['warning']

            warning_df.rename(columns=field_map, inplace=True)

            warning_df['source_file'] = warning_df.apply(lambda x: convert_file_type_to_int(x, 'source_file'), axis=1)
            warning_df['target_file'] = warning_df.apply(lambda x: convert_file_type_to_int(x, 'target_file'), axis=1)
            warning_df['error_type_id'] = warning_df.apply(lambda x: derive_error_type_id(x), axis=1)

            # Replace the word "None" anywhere in the dataframe with an actual None
            warning_df = warning_df.replace('None', np.nan)

            insert_dataframe(warning_df, 'temp_error_file', sess.connection())
            sess.commit()

            # Transfer contents of file to published error metadata
            insert_sql = """
                INSERT INTO published_error_metadata (
                    created_at,
                    updated_at,
                    job_id,
                    filename,
                    field_name,
                    error_type_id,
                    occurrences,
                    first_row,
                    rule_failed,
                    file_type_id,
                    target_file_type_id,
                    original_rule_label,
                    severity_id
                )
                SELECT
                    NOW(),
                    NOW(),
                    job_id,
                    filename,
                    field_name,
                    error_type_id,
                    COUNT(1),
                    MIN(row_number),
                    error_message,
                    source_file,
                    target_file,
                    rule_label,
                    severity_id
                FROM temp_error_file
                GROUP BY job_id,
                    filename,
                    field_name,
                    error_type_id,
                    error_message,
                    source_file,
                    target_file,
                    rule_label,
                    severity_id
            """
            sess.execute(insert_sql)
            sess.execute('TRUNCATE TABLE temp_error_file')
            sess.commit()

    sess.execute('DROP TABLE temp_error_file')
    sess.commit()

    logger.info('Updated error metadata moved')