def test_read_file(self):
    """test reading files"""

    contents = util.read_file(
        resolve_test_data_path('data/20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv'))
    self.assertIsInstance(contents, str)

    contents = util.read_file(
        resolve_test_data_path('data/wmo_acronym_vertical_sm.jpg'))
    self.assertIsInstance(contents, str)

    with self.assertRaises(FileNotFoundError):
        contents = util.read_file('404file.dat')
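# The assertions above pin down the read_file contract: contents come back
# as str (even for binary input such as the JPEG), and a missing path raises
# FileNotFoundError. A minimal sketch of a compatible helper follows; this is
# an assumption about util.read_file's behaviour, not its implementation.
def _read_file_sketch(filepath):
    """Return a file's contents as str, tolerating non-UTF-8 bytes."""
    # open() in text mode raises FileNotFoundError for missing paths;
    # errors='replace' keeps binary input readable as str.
    with open(filepath, encoding='utf-8', errors='replace') as f:
        return f.read()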
def test_mixed_run_report(self):
    """
    Test that passing and failing files are written to the run report
    when a mixture of the two is processed
    """

    infile_root = resolve_test_data_path('data/general/pass_and_fail')
    agency = 'MSC'

    expected_passes = set()
    expected_fails = set()

    with report.OperatorReport() as error_bank:
        run_report = report.RunReport(SANDBOX_DIR)

        for infile in os.listdir(infile_root):
            fullpath = os.path.join(infile_root, infile)

            try:
                contents = util.read_file(fullpath)
                ecsv = parser.ExtendedCSV(contents, error_bank)
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                expected_fails.add(fullpath)
                run_report.write_failing_file(fullpath, agency)
                continue

            try:
                ecsv.validate_metadata_tables()
                ecsv.validate_dataset_tables()

                data_record = models.DataRecord(ecsv)
                data_record.filename = infile

                expected_passes.add(fullpath)
                run_report.write_passing_file(fullpath, agency)
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                expected_fails.add(fullpath)
                run_report.write_failing_file(fullpath, agency)

    self.assertEqual(len(expected_passes), 6)
    self.assertEqual(len(expected_fails), 4)

    output_path = os.path.join(SANDBOX_DIR, 'run_report')
    self.assertTrue(os.path.exists(output_path))

    with open(output_path) as output:
        lines = output.read().splitlines()

        self.assertEqual(lines[0], agency)
        self.assertEqual(len(lines),
                         len(expected_passes) + len(expected_fails) + 1)

        for line in lines[1:]:
            if line.startswith('Pass'):
                target = line[6:].strip()
                self.assertIn(target, expected_passes)
            elif line.startswith('Fail'):
                target = line[6:].strip()
                self.assertIn(target, expected_fails)
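# For reference, the run report layout the assertions above imply (inferred
# from this test, not from the report module itself): one block per agency,
# the agency name on its own line followed by one 'Pass: ' or 'Fail: ' line
# per file. Both prefixes are six characters wide, which is why the checks
# slice each line with line[6:], e.g.:
#
#   MSC
#   Pass: /path/to/good-file.csv
#   Fail: /path/to/bad-file.csv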
def test_non_extcsv_run_report(self):
    """Test that an unparseable file is written to the run report"""

    filename = 'not-an-ecsv.dat'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)

    agency = 'UNKNOWN'

    with report.OperatorReport() as error_bank:
        run_report = report.RunReport(SANDBOX_DIR)

        try:
            _ = parser.ExtendedCSV(contents, error_bank)
            raise AssertionError(
                'Parsing of {} did not fail'.format(infile))
        except (parser.MetadataValidationError,
                parser.NonStandardDataError):
            output_path = os.path.join(SANDBOX_DIR, 'run_report')
            run_report.write_failing_file(infile, agency)

            self.assertTrue(os.path.exists(output_path))

            with open(output_path) as output:
                lines = output.read().splitlines()

                self.assertEqual(len(lines), 2)
                self.assertEqual(lines[0], agency)
                self.assertEqual(lines[1], 'Fail: {}'.format(infile))
def test_failing_run_report(self):
    """Test that a failing file is written to the run report"""

    filename = 'ecsv-missing-instrument-name.csv'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)
    ecsv = None

    # Agency typically filled in with FTP username for failing files.
    agency = 'rmda'

    with report.OperatorReport() as error_bank:
        run_report = report.RunReport(SANDBOX_DIR)

        try:
            ecsv = parser.ExtendedCSV(contents, error_bank)
            ecsv.validate_metadata_tables()
            agency = ecsv.extcsv['DATA_GENERATION']['Agency']

            ecsv.validate_dataset_tables()
            raise AssertionError(
                'Parsing of {} did not fail'.format(infile))
        except (parser.MetadataValidationError,
                parser.NonStandardDataError):
            output_path = os.path.join(SANDBOX_DIR, 'run_report')
            run_report.write_failing_file(infile, agency)

            self.assertTrue(os.path.exists(output_path))

            with open(output_path) as output:
                lines = output.read().splitlines()

                self.assertEqual(len(lines), 2)
                self.assertEqual(lines[0], agency)
                self.assertEqual(lines[1], 'Fail: {}'.format(infile))
def test_passing_run_report(self):
    """Test that a passing file is written to the run report"""

    filename = '20080101.Kipp_Zonen.UV-S-E-T.000560.PMOD-WRC.csv'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)

    run_report = report.RunReport(SANDBOX_DIR)

    with report.OperatorReport() as error_bank:
        ecsv = parser.ExtendedCSV(contents, error_bank)
        ecsv.validate_metadata_tables()
        ecsv.validate_dataset_tables()

        data_record = models.DataRecord(ecsv)
        data_record.filename = filename
        agency = ecsv.extcsv['DATA_GENERATION']['Agency']

        output_path = os.path.join(SANDBOX_DIR, 'run_report')
        run_report.write_passing_file(infile, agency)

        self.assertTrue(os.path.exists(output_path))

        with open(output_path) as output:
            lines = output.read().splitlines()

            self.assertEqual(len(lines), 2)
            self.assertEqual(lines[0], agency)
            self.assertEqual(lines[1], 'Pass: {}'.format(infile))
def test_warning_operator_report(self):
    """Test that file warnings are written in the operator report"""

    filename = 'ecsv-trailing-commas.csv'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)

    with report.OperatorReport(SANDBOX_DIR) as op_report:
        ecsv = parser.ExtendedCSV(contents, op_report)

        # Some warnings are encountered during parsing.
        ecsv.validate_metadata_tables()
        ecsv.validate_dataset_tables()

        data_record = models.DataRecord(ecsv)
        data_record.filename = filename
        agency = ecsv.extcsv['DATA_GENERATION']['Agency']

        today = datetime.now().strftime('%Y-%m-%d')
        output_path = os.path.join(SANDBOX_DIR,
                                   'operator-report-{}.csv'.format(today))

        op_report.add_message(200)  # File passes validation
        op_report.write_passing_file(infile, ecsv, data_record)

        self.assertTrue(os.path.exists(output_path))

        with open(output_path) as output:
            reader = csv.reader(output)
            next(reader)

            expected_warnings = len(ecsv.warnings)
            for _ in range(expected_warnings):
                report_line = next(reader)

                self.assertEqual(report_line[0], 'P')
                self.assertEqual(report_line[1], 'Warning')
                self.assertIn(agency, report_line)
                self.assertIn(os.path.basename(infile), report_line)

            report_line = next(reader)

            self.assertEqual(report_line[0], 'P')
            self.assertEqual(report_line[1], 'Warning')
            self.assertEqual(report_line[2], '200')
            self.assertIn(agency, report_line)
            self.assertIn(os.path.basename(infile), report_line)

            with self.assertRaises(StopIteration):
                next(reader)
def test_passing_operator_report(self):
    """Test that a passing file is written in the operator report"""

    filename = '20080101.Kipp_Zonen.UV-S-E-T.000560.PMOD-WRC.csv'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)

    with report.OperatorReport(SANDBOX_DIR) as op_report:
        ecsv = parser.ExtendedCSV(contents, op_report)
        ecsv.validate_metadata_tables()
        ecsv.validate_dataset_tables()

        data_record = models.DataRecord(ecsv)
        data_record.filename = filename
        agency = ecsv.extcsv['DATA_GENERATION']['Agency']

        today = datetime.now().strftime('%Y-%m-%d')
        output_path = os.path.join(SANDBOX_DIR,
                                   'operator-report-{}.csv'.format(today))

        op_report.add_message(200)  # File passes validation
        op_report.write_passing_file(infile, ecsv, data_record)

        self.assertTrue(os.path.exists(output_path))

        with open(output_path) as output:
            reader = csv.reader(output)
            next(reader)

            report_line = next(reader)

            self.assertEqual(report_line[0], 'P')
            self.assertEqual(report_line[2], '200')
            self.assertIn(agency, report_line)
            self.assertIn(os.path.basename(infile), report_line)

            with self.assertRaises(StopIteration):
                next(reader)
def test_run_report_multiple_agencies(self):
    """Test that files in the run report are grouped by agency"""

    infile_root = resolve_test_data_path('data/general/agencies')

    expected_passes = {}
    expected_fails = {}

    agency_aliases = {
        'msc': 'MSC',
        'casiap': 'CAS-IAP',
        'mlcd-lu': 'MLCD-LU',
        'dwd-mohp': 'DWD-MOHp'
    }

    with report.OperatorReport() as error_bank:
        run_report = report.RunReport(SANDBOX_DIR)

        for dirpath, dirnames, filenames in os.walk(infile_root):
            for infile in filenames:
                fullpath = os.path.join(dirpath, infile)
                # Agency inferred from directory name.
                agency = dirpath.split('/')[-1]

                try:
                    contents = util.read_file(fullpath)
                    ecsv = parser.ExtendedCSV(contents, error_bank)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError):
                    if agency not in expected_passes:
                        expected_passes[agency] = set()
                    if agency not in expected_fails:
                        expected_fails[agency] = set()

                    expected_fails[agency].add(fullpath)
                    run_report.write_failing_file(fullpath, agency)
                    continue

                try:
                    ecsv.validate_metadata_tables()
                    agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                    if agency not in expected_passes:
                        expected_passes[agency] = set()
                    if agency not in expected_fails:
                        expected_fails[agency] = set()

                    ecsv.validate_dataset_tables()

                    data_record = models.DataRecord(ecsv)
                    data_record.filename = infile

                    expected_passes[agency].add(fullpath)
                    run_report.write_passing_file(fullpath, agency)
                except (parser.MetadataValidationError,
                        parser.NonStandardDataError):
                    agency = agency_aliases[agency]

                    if agency not in expected_passes:
                        expected_passes[agency] = set()
                    if agency not in expected_fails:
                        expected_fails[agency] = set()

                    expected_fails[agency].add(fullpath)
                    run_report.write_failing_file(fullpath, agency)

    self.assertEqual(len(expected_passes['CAS-IAP']), 1)
    self.assertEqual(len(expected_passes['DWD-MOHp']), 2)
    self.assertEqual(len(expected_passes['MLCD-LU']), 3)
    self.assertEqual(len(expected_passes['MSC']), 4)

    self.assertEqual(len(expected_fails['CAS-IAP']), 0)
    self.assertEqual(len(expected_fails['DWD-MOHp']), 1)
    self.assertEqual(len(expected_fails['MLCD-LU']), 0)
    self.assertEqual(len(expected_fails['MSC']), 1)

    output_path = os.path.join(SANDBOX_DIR, 'run_report')
    self.assertTrue(os.path.exists(output_path))

    with open(output_path) as output:
        lines = output.read().splitlines()
        curr_agency = None

        for line in lines:
            if line.startswith('Pass'):
                target = line[6:]
                self.assertIn(target, expected_passes[curr_agency])
            elif line.startswith('Fail'):
                target = line[6:]
                self.assertIn(target, expected_fails[curr_agency])
            elif line.strip() != '':
                curr_agency = line.strip()
                self.assertIn(line, agency_aliases.values())
def test_mixed_operator_report(self):
    """
    Test that passing and failing files are written to the operator
    report when a mixture of the two is processed
    """

    infile_root = resolve_test_data_path('data/general/pass_and_fail')

    warnings = {}
    errors = {}
    expected_warnings = {}
    expected_errors = {}

    agency = 'UNKNOWN'

    with report.OperatorReport(SANDBOX_DIR) as op_report:
        for infile in os.listdir(infile_root):
            fullpath = os.path.join(infile_root, infile)
            warnings[fullpath] = 0
            errors[fullpath] = 0

            try:
                contents = util.read_file(fullpath)
                ecsv = parser.ExtendedCSV(contents, op_report)
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError) as err:
                expected_errors[fullpath] = len(err.errors)

                op_report.add_message(209)
                op_report.write_failing_file(fullpath, agency)
                continue

            try:
                ecsv.validate_metadata_tables()
                agency = ecsv.extcsv['DATA_GENERATION']['Agency']

                ecsv.validate_dataset_tables()

                data_record = models.DataRecord(ecsv)
                data_record.filename = infile

                expected_warnings[fullpath] = len(ecsv.warnings)
                expected_errors[fullpath] = 0

                op_report.write_passing_file(fullpath, ecsv, data_record)
            except (parser.MetadataValidationError,
                    parser.NonStandardDataError):
                expected_warnings[fullpath] = len(ecsv.warnings)
                expected_errors[fullpath] = len(ecsv.errors)

                op_report.add_message(209)
                op_report.write_failing_file(fullpath, agency, ecsv)

    today = datetime.now().strftime('%Y-%m-%d')
    output_path = os.path.join(SANDBOX_DIR,
                               'operator-report-{}.csv'.format(today))
    self.assertTrue(os.path.exists(output_path))

    with open(output_path) as output:
        reader = csv.reader(output)
        next(reader)

        for line in reader:
            if expected_errors[line[12]] == 0:
                self.assertEqual(line[0], 'P')
                self.assertEqual(line[1], 'Warning')
            else:
                self.assertEqual(line[0], 'F')

            if line[2] == '200':
                self.assertEqual(expected_errors[line[12]], 0)
            elif line[2] == '209':
                self.assertGreater(expected_errors[line[12]], 0)
            elif line[1] == 'Warning':
                warnings[line[12]] += 1
            elif line[1] == 'Error':
                errors[line[12]] += 1

    self.assertEqual(warnings, expected_warnings)
    self.assertEqual(errors, expected_errors)
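# Reading the checks above together gives the operator report row layout
# these tests rely on (column indices inferred from the assertions, not
# from report.OperatorReport itself): column 0 is the overall pass/fail
# flag ('P' or 'F'), column 1 the severity ('Warning' or 'Error'), column 2
# the message code ('200' for a clean pass, '209' for a failing file), and
# column 12 the full input filepath; the agency name and the file's
# basename appear elsewhere in the row.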
def test_failing_operator_report(self):
    """Test that a failing file is written in the operator report"""

    filename = 'ecsv-missing-instrument-name.csv'
    infile = resolve_test_data_path('data/general/{}'.format(filename))
    contents = util.read_file(infile)
    ecsv = None

    agency = 'UNKNOWN'

    with report.OperatorReport(SANDBOX_DIR) as op_report:
        try:
            ecsv = parser.ExtendedCSV(contents, op_report)
            ecsv.validate_metadata_tables()
            agency = ecsv.extcsv['DATA_GENERATION']['Agency']

            ecsv.validate_dataset_tables()
            raise AssertionError(
                'Parsing of {} did not fail'.format(infile))
        except (parser.MetadataValidationError,
                parser.NonStandardDataError):
            op_report.add_message(209)
            op_report.write_failing_file(infile, agency, ecsv)

    today = datetime.now().strftime('%Y-%m-%d')
    output_path = os.path.join(SANDBOX_DIR,
                               'operator-report-{}.csv'.format(today))
    self.assertTrue(os.path.exists(output_path))

    with open(output_path) as output:
        reader = csv.reader(output)
        next(reader)

        warnings = 0
        errors = 0
        expected_warnings = len(ecsv.warnings)
        expected_errors = len(ecsv.errors)

        for _ in range(expected_warnings + expected_errors):
            report_line = next(reader)
            self.assertEqual(report_line[0], 'F')

            if report_line[1] == 'Warning':
                warnings += 1
            elif report_line[1] == 'Error':
                errors += 1

        self.assertEqual(warnings, expected_warnings)
        self.assertEqual(errors, expected_errors)

        report_line = next(reader)

        self.assertEqual(report_line[0], 'F')
        self.assertEqual(report_line[1], 'Error')
        self.assertEqual(report_line[2], '209')
        self.assertIn(agency, report_line)
        self.assertIn(os.path.basename(infile), report_line)

        with self.assertRaises(StopIteration):
            next(reader)
def process_data(self, infile, verify_only=False, bypass=False):
    """
    process incoming data record

    :param infile: incoming filepath
    :param verify_only: perform verification only (no ingest)
    :param bypass: skip permission prompts
    :returns: `bool` of processing result
    """

    # detect incoming data file
    data = None
    self.data_record = None
    self.search_engine = search.SearchIndex()

    LOGGER.info('Processing file {}'.format(infile))
    LOGGER.info('Detecting file')
    if not is_text_file(infile):
        self.status = 'failed'
        self.code = 'NonStandardDataError'
        self.message = 'binary file detected'
        LOGGER.error('Unknown file: {}'.format(self.message))
        return False

    try:
        data = read_file(infile)
    except UnicodeDecodeError as err:
        self.status = 'failed'
        self.code = 'NonStandardDataError'
        self.message = err
        LOGGER.error('Unknown file: {}'.format(err))
        return False

    LOGGER.info('Parsing data record')
    ecsv = ExtendedCSV(data)

    try:
        LOGGER.info('Validating Extended CSV')
        ecsv.validate_metadata()
        LOGGER.info('Valid Extended CSV')
    except NonStandardDataError as err:
        self.status = 'failed'
        self.code = 'NonStandardDataError'
        self.message = err
        LOGGER.error('Invalid Extended CSV: {}'.format(err))
        return False
    except MetadataValidationError as err:
        self.status = 'failed'
        self.code = 'MetadataValidationError'
        self.message = err
        LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))
        return False

    LOGGER.info('Data is valid Extended CSV')

    self.data_record = DataRecord(ecsv)
    self.data_record.ingest_filepath = infile
    self.data_record.filename = os.path.basename(infile)
    self.data_record.url = self.data_record.get_waf_path(
        config.WDR_WAF_BASEURL)
    self.process_end = datetime.utcnow()

    LOGGER.debug('Verifying if URN already exists')
    results = self.registry.query_by_field(DataRecord, self.data_record,
                                           'identifier')
    if results:
        msg = 'Data exists'
        self.status = 'failed'
        self.code = 'ProcessingError'
        self.message = msg
        LOGGER.error(msg)
        return False

    # domains_to_check = [
    #     'content_category',
    #     'data_generation_agency',
    #     'platform_type',
    #     'platform_id',
    #     'platform_name',
    #     'platform_country',
    #     'instrument_name',
    #     'instrument_model'
    # ]
    #
    # for domain_to_check in domains_to_check:
    #     value = getattr(self.data_record, domain_to_check)
    #     domain = getattr(DataRecord, domain_to_check)
    #
    #     if value not in self.registry.query_distinct(domain):
    #         msg = 'value {} not in domain {}'.format(value,
    #                                                  domain_to_check)
    #         LOGGER.error(msg)
    #         # raise ProcessingError(msg)

    LOGGER.info('Verifying data record against core metadata fields')

    LOGGER.debug('Validating project')
    self.projects = self.registry.query_distinct(Project.identifier)
    if self.data_record.content_class not in self.projects:
        msg = 'Project {} not found in registry'.format(
            self.data_record.content_class)
        LOGGER.error(msg)
        raise ProcessingError(msg)
    else:
        LOGGER.debug('Matched with project: {}'.format(
            self.data_record.content_class))

    LOGGER.debug('Validating dataset')
    self.datasets = self.registry.query_distinct(Dataset.identifier)
    if self.data_record.content_category not in self.datasets:
        msg = 'Dataset {} not found in registry'.format(
            self.data_record.content_category)
        LOGGER.error(msg)
        raise ProcessingError(msg)
    else:
        LOGGER.debug('Matched with dataset: {}'.format(
            self.data_record.content_category))

    LOGGER.debug('Validating contributor')
    self.contributors = self.registry.query_distinct(
        Contributor.identifier)
    file_contributor = '{}:{}'.format(
        self.data_record.data_generation_agency,
        self.data_record.content_class)
    if file_contributor not in self.contributors:
        msg = 'Contributor {} not found in registry'.format(
            file_contributor)
        LOGGER.error(msg)
        raise ProcessingError(msg)
    else:
        LOGGER.debug(
            'Matched with contributor: {}'.format(file_contributor))

    # TODO: consider adding and checking #PLATFORM_Type
    LOGGER.debug('Validating station data')
    station = {
        'identifier': self.data_record.platform_id,
        'name': self.data_record.platform_name,
        'country_id': self.data_record.platform_country
    }

    LOGGER.debug('Validating station id...')
    results = self.registry.query_multiple_fields(Station, station,
                                                  ['identifier'])
    if results:
        LOGGER.debug('Validated with id: {}'.format(
            self.data_record.platform_id))
    else:
        msg = 'Station {} not found in registry'.format(
            self.data_record.platform_id)
        LOGGER.error(msg)
        raise ProcessingError(msg)

    LOGGER.debug('Validating station name...')
    fields = ['identifier', 'name']
    results = self.registry.query_multiple_fields(Station, station, fields)
    if results:
        LOGGER.debug('Validated with name: {} for id: {}'.format(
            self.data_record.platform_name,
            self.data_record.platform_id))
    else:
        msg = 'Station name: {} did not match data for id: {}'.format(
            self.data_record.platform_name,
            self.data_record.platform_id)
        LOGGER.error(msg)
        raise ProcessingError(msg)

    LOGGER.debug('Validating station country...')
    fields = ['identifier', 'country_id']
    results = self.registry.query_multiple_fields(Station, station, fields)
    if results:
        LOGGER.debug('Validated with country: {} for id: {}'.format(
            self.data_record.platform_country,
            self.data_record.platform_id))
    else:
        msg = 'Station country: {} did not match data for id: {}'.format(
            self.data_record.platform_country,
            self.data_record.platform_id)
        LOGGER.error(msg)
        raise ProcessingError(msg)

    LOGGER.debug('Validating instrument')
    self.instruments = self.registry.query_distinct(Instrument.identifier)
    instrument_added = False
    instrument = [
        self.data_record.instrument_name,
        self.data_record.instrument_model,
        self.data_record.instrument_number,
        self.data_record.platform_id,
        self.data_record.content_category
    ]
    instrument_id = ':'.join(instrument)
    if instrument_id not in self.instruments:
        # Retry with the serial number normalized (e.g. '007' -> '7').
        instrument[2] = str(int(instrument[2]))
        old_instrument_id = ':'.join(instrument)
        if old_instrument_id not in self.instruments:
            msg = 'Instrument {} not found in registry'.format(
                instrument_id)
            LOGGER.warning(msg)
            LOGGER.debug('Checking for new serial number...')
            instrument_added = self.new_serial(instrument_id, verify_only)
            if not instrument_added:
                if bypass:
                    LOGGER.info('Bypass mode. Skipping permission check.')
                    ins_data = self.get_instrument_data(instrument_id)
                    instrument = Instrument(ins_data)
                    self.registry.save(instrument)
                    LOGGER.info('Instrument successfully added.')
                    instrument_added = True
                else:
                    response = input('Not instrument with new serial. Add'
                                     ' new instrument? (y/n)\n')
                    if response == 'y':
                        ins_data = self.get_instrument_data(instrument_id)
                        instrument = Instrument(ins_data)
                        self.registry.save(instrument)
                        LOGGER.info('Instrument successfully added.')
                        instrument_added = True
                    else:
                        msg = 'Instrument data for id:{} does not match '\
                              'existing records.'.format(instrument_id)
                        LOGGER.error(msg)
                        raise ProcessingError(msg)
            LOGGER.debug('Updating instruments list.')
            self.instruments = self.registry.\
                query_distinct(Instrument.identifier)
        else:
            instrument_id = old_instrument_id

    if instrument_added and verify_only:
        LOGGER.debug('Skipping location check due to instrument '
                     'not being added in verification mode.')
    else:
        LOGGER.debug('Matched with instrument: {}'.format(instrument_id))
        LOGGER.debug('Checking instrument location...')
        location = {
            'identifier': instrument_id,
            'x': self.data_record.x,
            'y': self.data_record.y,
            'z': self.data_record.z
        }
        results = self.registry.query_multiple_fields(Instrument, location)
        if results:
            LOGGER.debug('Instrument location validated.')
        else:
            msg = 'Instrument location does not match database records.'
            LOGGER.error(msg)
            raise ProcessingError(msg)

    LOGGER.debug('Validating agency deployment')
    deployment_id = ':'.join([
        self.data_record.platform_id,
        self.data_record.data_generation_agency,
        self.data_record.content_class
    ])
    data = {
        'identifier': deployment_id,
        'station_id': self.data_record.platform_id,
        'contributor_id': file_contributor,
        'start_date': self.data_record.timestamp_date,
        'end_date': self.data_record.timestamp_date
    }
    deployment = self.registry.query_multiple_fields(
        Deployment, data, ['identifier'])
    if deployment:
        if deployment.start_date > self.data_record.timestamp_date:
            deployment.start_date = self.data_record.timestamp_date
            self.registry.save()
            LOGGER.debug('Deployment start date updated.')
        elif deployment.end_date < self.data_record.timestamp_date:
            deployment.end_date = self.data_record.timestamp_date
            self.registry.save()
            LOGGER.debug('Deployment end date updated.')
        LOGGER.debug('Deployment validated')
    else:
        LOGGER.warning('Deployment not found')
        if bypass:
            LOGGER.info('Bypass mode. Skipping permission check')
            deployment = Deployment(data)
            self.registry.save(deployment)
            LOGGER.warning('Deployment {} added'.format(
                deployment.identifier))
        else:
            response = input(
                'Deployment {} not found. Add? (y/n)\n'.format(
                    deployment_id))
            if response == 'y':
                deployment = Deployment(data)
                self.registry.save(deployment)
                LOGGER.warning('Deployment {} added'.format(
                    deployment.identifier))
            else:
                msg = 'Deployment {} not added. Skipping file.'.format(
                    deployment_id)
                LOGGER.error(msg)
                raise ProcessingError(msg)

    LOGGER.info('Data record is valid and verified')

    if verify_only:  # do not save or index
        LOGGER.info('Verification mode detected. NOT saving to registry')
        return True

    LOGGER.info('Saving data record CSV to registry')
    self.registry.save(self.data_record)

    LOGGER.info('Saving data record CSV to WAF')
    waf_filepath = self.data_record.get_waf_path(config.WDR_WAF_BASEDIR)
    os.makedirs(os.path.dirname(waf_filepath), exist_ok=True)
    shutil.copy2(self.data_record.ingest_filepath, waf_filepath)

    LOGGER.info('Indexing data record search engine')
    version = self.search_engine.get_record_version(self.data_record.es_id)
    if version:
        # Only reindex when the incoming record is a newer generation.
        if version < self.data_record.data_generation_version:
            self.search_engine.index_data_record(
                self.data_record.__geo_interface__)
    else:
        self.search_engine.index_data_record(
            self.data_record.__geo_interface__)
    return True
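# A hedged usage sketch of process_data. The owning class's constructor is
# not shown in this excerpt, so its arguments here are placeholders; only
# the method signature and the status/code/message attributes set above are
# taken from the code itself.
#
#   processor = Process(...)  # hypothetical construction
#   if not processor.process_data('/incoming/file.csv', verify_only=True):
#       LOGGER.error('{}: {}'.format(processor.code, processor.message))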
def test_ecsv(self):
    """test Extended CSV handling"""

    # good file
    contents = util.read_file(
        resolve_test_data_path('data/20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    self.assertEqual('20040709.ECC.2Z.2ZL1.NOAA-CMDL.csv',
                     ecsv.gen_woudc_filename())

    # good file, missing instrument number
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-missing-instrument-number.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    self.assertEqual('20111101.Brewer.MKIII.na.RMDA.csv',
                     ecsv.gen_woudc_filename())

    # good file, space in instrument name
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-space-in-instrument-name.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertEqual('20111101.Brewer-foo.MKIII.na.RMDA.csv',
                     ecsv.gen_woudc_filename())

    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    self.assertTrue(
        set(DOMAINS['metadata_tables'].keys()).issubset(
            set(ecsv.extcsv.keys())))
    ecsv.validate_metadata()

    # good file, test special characters
    contents = util.read_file(
        resolve_test_data_path('data/Brewer229_Daily_SEP2016.493'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    self.assertTrue(
        set(DOMAINS['metadata_tables'].keys()).issubset(
            set(ecsv.extcsv.keys())))
    ecsv.validate_metadata()
    self.assertEqual(ecsv.extcsv['PLATFORM']['Name'], 'Río Gallegos')

    # bad file (not an ecsv)
    contents = util.read_file(
        resolve_test_data_path('data/not-an-ecsv.dat'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    with self.assertRaises(parser.NonStandardDataError):
        ecsv.validate_metadata()

    # bad file (missing table)
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-missing-location-table.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    with self.assertRaises(parser.MetadataValidationError):
        ecsv.validate_metadata()

    # bad file (missing data - LOCATION.Height)
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-missing-location-height.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    self.assertTrue(
        set(DOMAINS['metadata_tables'].keys()).issubset(
            set(ecsv.extcsv.keys())))
    with self.assertRaises(parser.MetadataValidationError):
        ecsv.validate_metadata()

    # bad file (invalid location latitude)
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-invalid-location-latitude.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    with self.assertRaises(parser.MetadataValidationError):
        ecsv.validate_metadata()

    # bad file (invalid location longitude)
    contents = util.read_file(
        resolve_test_data_path('data/ecsv-invalid-location-longitude.csv'))
    ecsv = parser.ExtendedCSV(contents)
    self.assertIsInstance(ecsv, parser.ExtendedCSV)
    with self.assertRaises(parser.MetadataValidationError):
        ecsv.validate_metadata()
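# The WOUDC filename convention exercised by gen_woudc_filename() above, as
# inferred from this test's expected values (an observation, not a
# documented spec):
#
#   <YYYYMMDD>.<instrument name>.<instrument model>.<serial>.<agency>.csv
#
# A missing instrument number is rendered as 'na', and spaces in the
# instrument name become hyphens ('Brewer foo' -> 'Brewer-foo').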
def orchestrate(source, working_dir, metadata_only=False,
                verify_only=False, bypass=False):
    """
    Core orchestration workflow

    :param source: Path to input file or directory tree containing them.
    :param working_dir: Output directory for log and report files.
    :param metadata_only: `bool` of whether to verify only the common
                          metadata tables.
    :param verify_only: `bool` of whether to verify the file for
                        correctness without processing.
    :param bypass: `bool` of whether to skip permission prompts for adding
                   new records.
    :returns: void
    """

    files_to_process = []

    if os.path.isfile(source):
        fullpath = Path(source).parent.resolve()
        parent_dir = os.path.basename(str(fullpath))
        # Use parent dir to guess the contributor acronym during processing
        # runs, where the parent path is the contributor's FTP name.
        files_to_process = [(source, parent_dir)]
    elif os.path.isdir(source):
        for root, dirs, files in os.walk(source):
            parent_dir = os.path.basename(root)

            for f in files:
                fullpath = os.path.join(root, f)
                files_to_process.append((fullpath, parent_dir))

    files_to_process.sort()

    passed = []
    failed = []

    registry = Registry()
    search_engine = SearchIndex()

    with OperatorReport(working_dir) as op_report, \
            click.progressbar(files_to_process,
                              label='Processing files') as run_:  # noqa
        run_report = RunReport(working_dir)

        for file_to_process, contributor in run_:
            click.echo('Processing filename: {}'.format(file_to_process))
            LOGGER.info('Detecting file')

            if not is_text_file(file_to_process):
                _, is_error = op_report.add_message(1)
                if is_error:
                    op_report.write_failing_file(file_to_process,
                                                 contributor)
                    run_report.write_failing_file(file_to_process,
                                                  contributor)
                    failed.append(file_to_process)
                    continue

            try:
                contents = read_file(file_to_process)

                LOGGER.info('Parsing data record')
                extcsv = ExtendedCSV(contents, op_report)

                LOGGER.info('Validating Extended CSV')
                extcsv.validate_metadata_tables()
                contributor = extcsv.extcsv['DATA_GENERATION']['Agency']

                if not metadata_only:
                    extcsv.validate_dataset_tables()
                LOGGER.info('Valid Extended CSV')

                p = Process(registry, search_engine, op_report)
                data_record = p.validate(extcsv, bypass=bypass,
                                         metadata_only=metadata_only)

                if data_record is None:
                    click.echo('Not ingesting')
                    failed.append(file_to_process)

                    op_report.write_failing_file(file_to_process,
                                                 contributor, extcsv)
                    run_report.write_failing_file(file_to_process,
                                                  contributor)
                else:
                    data_record.ingest_filepath = file_to_process
                    data_record.filename = os.path.basename(file_to_process)
                    data_record.url = \
                        data_record.get_waf_path(config.WDR_WAF_BASEURL)
                    data_record.output_filepath = \
                        data_record.get_waf_path(config.WDR_WAF_BASEDIR)

                    if verify_only:
                        click.echo('Verified but not ingested')
                    else:
                        p.persist()
                        click.echo('Ingested successfully')

                    op_report.write_passing_file(file_to_process, extcsv,
                                                 data_record)
                    run_report.write_passing_file(file_to_process,
                                                  contributor)
                    passed.append(file_to_process)
            except UnicodeDecodeError as err:
                LOGGER.error('Unknown file format: {}'.format(err))
                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except NonStandardDataError as err:
                LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))
                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except MetadataValidationError as err:
                LOGGER.error('Invalid Extended CSV: {}'.format(err.errors))
                click.echo('Not ingested')
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)
            except Exception as err:
                click.echo('Processing failed: {}'.format(err))
                failed.append(file_to_process)

                op_report.write_failing_file(file_to_process, contributor)
                run_report.write_failing_file(file_to_process, contributor)

    registry.close_session()

    # files_to_process holds (filepath, contributor) tuples; unpack so the
    # membership checks against passed/failed (lists of filepaths) work.
    for name, _ in files_to_process:
        if name in passed:
            click.echo('Pass: {}'.format(name))
        elif name in failed:
            click.echo('Fail: {}'.format(name))

    click.echo('({}/{} files passed)'.format(len(passed),
                                             len(files_to_process)))
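# Example invocation of orchestrate (a sketch; the paths are placeholders):
#
#   orchestrate('/data/ftp/incoming/msc', '/tmp/wdr-run',
#               verify_only=True, bypass=False)
#
# With verify_only=True the full parse/validate chain runs and the run and
# operator reports are written to working_dir, but nothing is persisted to
# the registry, search index, or WAF.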