def test_manual_cleanup_zipped(self):
    """Check that opening then explicitly closing an archive leaves no stray files.

    The number of entries in the current working directory is compared
    before the reader is created and after ``close()`` has been called.
    """
    entries_before = os.listdir(".")

    reader = DwCAReader(BASIC_ARCHIVE_PATH)
    reader.close()

    entries_after = os.listdir(".")
    self.assertEqual(len(entries_before), len(entries_after))
def test_exception_invalid_archives_missing_metadata(self):
    """Opening an archive that references an absent metadata file must fail.

    The archive metafile sometimes references a metadata file that is not
    shipped in the archive (see http://dev.gbif.org/issues/browse/PF-2125).
    ``InvalidArchive`` is expected, with a specific message.
    """
    expected_message = "eml.xml is referenced in the archive descriptor but missing."

    with self.assertRaises(InvalidArchive) as raised:
        reader = DwCAReader(INVALID_LACKS_METADATA)
        # Only reached when nothing was raised; assertRaises then fails the test.
        reader.close()

    self.assertEqual(str(raised.exception), expected_message)
def dwca_metadata(dwca_file):
    """Open a Darwin Core archive and return its metadata.

    Parameters
    ----------
    dwca_file
        Archive location, handed unchanged to ``DwCAReader``.

    Returns
    -------
    The ``metadata`` attribute of the opened archive, or ``None`` when the
    reader object evaluates as false.
    """
    # The reader is used as a context manager so that it gets closed on every
    # exit path: normal return, early ``None`` return, or an exception raised
    # while reading the metadata.
    with DwCAReader(dwca_file) as dwca:
        if not dwca:
            return None

        # Pull the metadata from the archive
        return dwca.metadata
def test_pd_read_default_values(self):
    """pd_read() must expose a 'country' column holding 'Belgium' on every row."""
    archive_path = sample_data_path('dwca-test-default.zip')

    with DwCAReader(archive_path) as dwca:
        frame = dwca.pd_read('occurrence.txt')

        column_names = frame.columns.values.tolist()
        self.assertIn('country', column_names)

        countries = frame['country'].values.tolist()
        for value in countries:
            self.assertEqual(value, 'Belgium')
def whip_dwca(dwca_zip, specifications, maxentries=None):
    """Whip a Darwin Core Archive.

    Validate the core file of a `Darwin Core Archive`_ zipped data set,
    relying on the reading and iterating capabilities of
    :class:`~dwca.read.DwCAReader`.

    .. _Darwin Core Archive: https://en.wikipedia.org/wiki/Darwin_Core_Archive

    Parameters
    ----------
    dwca_zip : str
        Filename of the zipped Darwin Core Archive.
    specifications : dict
        Valid whip specifications dictionary schema.
    maxentries : int
        Upper limit on the number of records to validate from the archive;
        useful to get a quick check on the first subset of the data.

    Returns
    -------
    whip_it : Whip
        Whip validator instance, holding the errors and the reporting
        capabilities.
    """
    # Only the core file is supported: keep the short name of each term,
    # i.e. whatever follows the last '/'.
    with DwCAReader(dwca_zip) as archive:
        core_fields = archive.core_file.file_descriptor.fields
        field_names = []
        for field in core_fields:
            field_names.append(field['term'].rsplit('/', 1)[-1])

    # Run the validation on the records of the archive
    whip_it = Whip(specifications)
    records = whip_it.generate_dwca(dwca_zip)
    whip_it._whip(records, field_names, maxentries)
    return whip_it
def test_no_temporary_dir_directory(self):
    """A directory archive must not add entries to the working directory while open."""
    count_before = len(os.listdir("."))

    with DwCAReader(sample_data_path("dwca-simple-dir")):
        # Sampled while the reader is still open.
        count_during = len(os.listdir("."))

    self.assertEqual(count_before, count_during)
def test_get_corerow_by_id_other(self):
    """get_corerow_by_id() called with the integer 3 must return the 'Peliperdix' row."""
    genus_term = "http://rs.tdwg.org/dwc/terms/genus"
    archive_path = sample_data_path("dwca-ids.zip")

    with DwCAReader(archive_path) as dwca:
        # The id is given as an integer; a conversion will be attempted.
        core_row = dwca.get_corerow_by_id(3)
        genus = core_row.data[genus_term]
        self.assertEqual("Peliperdix", genus)
def test_csv_quote_dir_archive(self):
    """A field separator found inside a quoted field must not split that field."""
    archive_path = sample_data_path("dwca-csv-quote-dir")

    with DwCAReader(archive_path) as dwca:
        all_rows = [row for row in dwca]
        self.assertEqual(len(all_rows), 2)

        basis_of_record = all_rows[0].data[qn("basisOfRecord")]
        self.assertEqual(basis_of_record, "Observation, something")
def load_rows(self):
    """Open the archive at ``self.gbif_path``, print a few facts and keep its rows.

    The ``rows`` attribute of the reader is stored in ``self.gbif``.
    """
    locality_url = 'http://rs.tdwg.org/dwc/terms/locality'

    with DwCAReader(self.gbif_path) as dwca:
        # Inside this block the 'dwca' object is available for use.
        core_type_text = dwca.descriptor.core.type.__str__()
        print("Read core type: " + core_type_text + "! :)")

        # Report whether a given Darwin Core term is present in the core file
        locality_present = locality_url in dwca.descriptor.core.terms
        if not locality_present:
            print("Locality term is not present. :(")
        else:
            print("Locality term is present! :)")

        # Full qualified names for Darwin Core terms (such as
        # 'http://rs.tdwg.org/dwc/terms/country') are verbose. The qualname()
        # helper, imported here as 'qn', makes common terms easier to write:
        qn('locality')
        # => u'http://rs.tdwg.org/dwc/terms/locality'

        # Combined with the previous examples, this makes things clearer.
        # For example:
        if qn('locality') in dwca.descriptor.core.terms:
            pass

        # Or:
        if dwca.descriptor.core.type == qn('Occurrence'):
            pass

        # Keep the row data in memory
        self.gbif = dwca.rows
def test_core_file(self):
    """core_file must be a CSVDataFile whose lines_to_ignore equals 1."""
    archive_path = sample_data_path("dwca-simple-test-archive.zip")

    with DwCAReader(archive_path) as dwca:
        core = dwca.core_file
        self.assertIsInstance(core, CSVDataFile)

        # Quick look at the content, just to be sure
        self.assertEqual(core.lines_to_ignore, 1)
def test_pd_read_default_values(self):
    """The frame returned by pd_read() has a "country" column filled with "Belgium"."""
    with DwCAReader(sample_data_path("dwca-test-default.zip")) as dwca:
        occurrences = dwca.pd_read("occurrence.txt")

        available_columns = occurrences.columns.values.tolist()
        self.assertIn("country", available_columns)

        country_values = occurrences["country"].values.tolist()
        for country_value in country_values:
            self.assertEqual(country_value, "Belgium")
def test_exposes_core_terms(self):
    """descriptor.core.terms must be exactly the six terms of the core file."""
    # The Core file contains the following rows
    # <field index="1" term="http://rs.tdwg.org/dwc/terms/family"/>
    # <field index="2" term="http://rs.tdwg.org/dwc/terms/phylum"/>
    # <field index="3" term="http://rs.tdwg.org/dwc/terms/order"/>
    # <field index="4" term="http://rs.tdwg.org/dwc/terms/genus"/>
    # <field index="5" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
    # <field index="6" term="http://rs.tdwg.org/dwc/terms/class"/>
    # It also contains an id column (should not appear here)
    # There's an extension with 3 fields, should not appear here.
    archive_path = sample_data_path('dwca-star-test-archive.zip')

    with DwCAReader(archive_path) as star_dwca:
        core_descriptor = star_dwca.descriptor

        # Right number of terms
        self.assertEqual(6, len(core_descriptor.core.terms))

        # Right content (a set is expected, so order is irrelevant)
        expected_terms = {
            'http://rs.tdwg.org/dwc/terms/kingdom',
            'http://rs.tdwg.org/dwc/terms/order',
            'http://rs.tdwg.org/dwc/terms/class',
            'http://rs.tdwg.org/dwc/terms/genus',
            'http://rs.tdwg.org/dwc/terms/family',
            'http://rs.tdwg.org/dwc/terms/phylum',
        }
        self.assertEqual(expected_terms, core_descriptor.core.terms)
def test_get_corerow_by_id_string(self):
    """get_corerow_by_id() called with the string '3' must return the 'Peliperdix' row."""
    genus_term = 'http://rs.tdwg.org/dwc/terms/genus'
    archive_path = sample_data_path('dwca-ids.zip')

    with DwCAReader(archive_path) as dwca:
        # A numeric id may be given as a string.
        core_row = dwca.get_corerow_by_id('3')
        genus = core_row.data[genus_term]
        self.assertEqual('Peliperdix', genus)
def test_get_corerow_by_id_string(self):
    """Looking up the core row with id "3" (a string) must give the "Peliperdix" genus."""
    genus_url = "http://rs.tdwg.org/dwc/terms/genus"

    with DwCAReader(sample_data_path("dwca-ids.zip")) as dwca:
        # The number is deliberately passed as a string here.
        found_row = dwca.get_corerow_by_id("3")
        found_genus = found_row.data[genus_url]
        self.assertEqual("Peliperdix", found_genus)
def test_deprecated_row_by_position(self):
    """get_row_by_index() has been renamed get_corerow_by_position().

    Make sure the old name still works and that calling it records a
    DeprecationWarning carrying the expected message.
    """
    with warnings.catch_warnings(record=True) as w:
        # Make sure DeprecationWarning is always recorded, never filtered out.
        warnings.simplefilter("always", DeprecationWarning)

        # Copy-pasted code from the long term test_get_corerow_by_position()
        with DwCAReader(sample_data_path('dwca-ids.zip')) as dwca:
            # Row IDs are ordered like this in core: id 4-1-3-2
            first_row = dwca.get_row_by_index(0)
            self.assertEqual(4, int(first_row.id))

            self.assertEqual(1, len(w))  # Warning was issued
            the_warning = w[0]
            # A unittest assertion rather than a bare ``assert``: it is still
            # checked under ``python -O`` and fails like the other checks.
            self.assertTrue(issubclass(the_warning.category, DeprecationWarning))
            self.assertEqual(
                "This method has been renamed to get_corerow_by_position().",
                str(the_warning.message),
            )

            last_row = dwca.get_row_by_index(3)
            self.assertEqual(2, int(last_row.id))

            # Exception raised if bigger than archive (last index: 3)
            with self.assertRaises(RowNotFound):
                dwca.get_row_by_index(4)

            with self.assertRaises(RowNotFound):
                dwca.get_row_by_index(1000)
def test_pd_read_no_data_files(self):
    """pd_read() must raise NotADataFile for 'imaginary_file.txt' and for 'eml.xml'."""
    rejected_names = ('imaginary_file.txt', 'eml.xml')

    with DwCAReader(sample_data_path('dwca-simple-test-archive.zip')) as dwca:
        for file_name in rejected_names:
            with self.assertRaises(NotADataFile):
                dwca.pd_read(file_name)
def test_dont_enclose_unenclosed(self):
    """With fields_enclosed_by set to an empty string, quotes must stay in the values."""
    with DwCAReader(sample_data_path('dwca-simple-dir')) as dwca:
        all_rows = [row for row in dwca]

        # Double quotes are kept as part of the value...
        double_quoted = all_rows[2].data[qn('scientificName')]
        self.assertEqual('"betta" splendens', double_quoted)

        # ...and so are single quotes.
        single_quoted = all_rows[3].data[qn('scientificName')]
        self.assertEqual("'betta' splendens", single_quoted)
def test_explicit_encoding_metadata(self):
    """An encoding declared in the metadata file (<xml ...>) must be honoured."""
    element_path = ('dataset', 'creator', 'individualName', 'surName')

    with DwCAReader(sample_data_path('dwca-metadata-windows1252-encoding')) as dwca:
        # Walk down the metadata tree one child element at a time.
        node = dwca.metadata
        for tag in element_path:
            node = node.find(tag)

        # Is the accent properly interpreted?
        self.assertEqual(node.text, u'Noé')
def test_read_core_value(self):
    """Read the locality of the first two core rows of the sample archive."""
    archive_path = sample_data_path('dwca-simple-test-archive.zip')

    with DwCAReader(archive_path) as dwca:
        core_rows = [row for row in dwca]

        # Basic locality values expected from the sample file
        first_locality = core_rows[0].data[qn('locality')]
        self.assertEqual('Borneo', first_locality)

        second_locality = core_rows[1].data[qn('locality')]
        self.assertEqual('Mumbai', second_locality)
def test_read_core_value(self):
    """Read the locality of the first two core rows of the basic archive."""
    with DwCAReader(BASIC_ARCHIVE_PATH) as dwca:
        core_rows = [row for row in dwca]

        # Basic locality values expected from the sample file
        first_locality = core_rows[0].data[qn('locality')]
        self.assertEqual('Borneo', first_locality)

        second_locality = core_rows[1].data[qn('locality')]
        self.assertEqual('Mumbai', second_locality)
def test_orphaned_extension_rows(self):
    """orphaned_extension_rows() must return the expected mapping for the sample archive."""
    # Expected result, keyed by extension file name.
    expected = {
        "description.txt": {u"5": [3, 4], u"6": [5]},
        "vernacularname.txt": {u"7": [4]},
    }

    # This archive has extensions, some of whose rows are orphaned
    archive_path = sample_data_path("dwca-orphaned-rows.zip")
    with DwCAReader(archive_path) as dwca:
        orphans = dwca.orphaned_extension_rows()
        self.assertEqual(expected, orphans)
def test_use_extensions(self):
    """Check DwCAReader.use_extensions against several sample archives."""
    archives_without_extensions = (
        'dwca-simple-test-archive.zip',  # Basic archive without extensions
        'dwca-simple-csv.zip',  # Just a CSV file, so no extensions
    )
    for archive_name in archives_without_extensions:
        with DwCAReader(sample_data_path(archive_name)) as dwca:
            self.assertFalse(dwca.use_extensions)

    archives_with_extensions = (
        'dwca-star-test-archive.zip',
        'dwca-2extensions.zip',
    )
    for archive_name in archives_with_extensions:
        with DwCAReader(sample_data_path(archive_name)) as dwca:
            self.assertTrue(dwca.use_extensions)

    # The extension is ignored, so the archive appears to have none
    star_path = sample_data_path('dwca-star-test-archive.zip')
    with DwCAReader(star_path, extensions_to_ignore="vernacularname.txt") as dwca:
        self.assertFalse(dwca.use_extensions)
def test_partial_default(self):
    """The country of row 0 is "France" and the country of row 1 is "Belgium"."""
    with DwCAReader(sample_data_path("dwca-partial-default.zip")) as dwca:
        # Value comes from data file
        country_from_file = dwca.rows[0].data[qn("country")]
        self.assertEqual(country_from_file, "France")

        # Value is field default
        country_from_default = dwca.rows[1].data[qn("country")]
        self.assertEqual(country_from_default, "Belgium")
def test_row_class(self):
    """Iterated rows must be CoreRow objects and their extensions ExtensionRow objects."""
    archive_path = sample_data_path('dwca-star-test-archive.zip')

    with DwCAReader(archive_path) as star_dwca:
        for core_row in star_dwca:
            self.assertIsInstance(core_row, CoreRow)

            # Whereas whatever hangs off a core row is an extension row
            for extension_row in core_row.extensions:
                self.assertIsInstance(extension_row, ExtensionRow)
def test_open_included_file(self):
    """Ensure DwCAReader.open_included_file work as expected.

    The raw core data file is read through it and the end of its content
    is checked.
    """
    # Let's use it to read the raw core data file:
    with DwCAReader(DIRECTORY_ARCHIVE_PATH) as dwca:
        # The returned file object is closed as soon as it has been read, so
        # the test does not leak an open file handle.
        with dwca.open_included_file('occurrence.txt') as f:
            raw_occ = f.read()

        self.assertTrue(raw_occ.endswith('betta splendens\n'))
def test_orphaned_extension_rows(self):
    """orphaned_extension_rows() must return the expected mapping for the sample archive."""
    # Expected result, keyed by extension file name.
    expected = {
        'description.txt': {u'5': [3, 4], u'6': [5]},
        'vernacularname.txt': {u'7': [4]},
    }

    # This archive has extensions, some of whose rows are orphaned
    archive_path = sample_data_path('dwca-orphaned-rows.zip')
    with DwCAReader(archive_path) as dwca:
        orphans = dwca.orphaned_extension_rows()
        self.assertEqual(expected, orphans)
def test_row_class(self):
    """Iterated rows must be CoreRow objects and their extensions ExtensionRow objects."""
    with DwCAReader(EXTENSION_ARCHIVE_PATH) as star_dwca:
        for core_row in star_dwca:
            self.assertIsInstance(core_row, CoreRow)

            # Whereas whatever hangs off a core row is an extension row
            for extension_row in core_row.extensions:
                self.assertIsInstance(extension_row, ExtensionRow)
def test_pd_read_quotedir(self):
    """pd_read() must keep a quoted field containing the separator in one piece."""
    archive_path = sample_data_path("dwca-csv-quote-dir")

    with DwCAReader(archive_path) as dwca:
        frame = dwca.pd_read("occurrence.txt")

        # A field separator appears inside a quoted field; it must not split it
        self.assertEqual(frame.shape, (2, 5))

        first_basis = frame["basisOfRecord"].values.tolist()[0]
        self.assertEqual(first_basis, "Observation, something")
def test_position(self):
    """Each row's position attribute must match its rank in the iteration."""
    # Both an archive with headers and one without are checked:
    for archive_path in (BASIC_ARCHIVE_PATH, NOHEADERS1_PATH):
        with DwCAReader(archive_path) as dwca:
            expected_position = 0
            for row in dwca:
                self.assertEqual(expected_position, row.position)
                expected_position += 1
def test_open_included_file(self):
    """Ensure DwCAReader.open_included_file work as expected.

    The raw core data file is read through it and the end of its content
    is checked.
    """
    # Let's use it to read the raw core data file:
    with DwCAReader(sample_data_path('dwca-simple-dir')) as dwca:
        # The returned file object is closed as soon as it has been read, so
        # the test does not leak an open file handle.
        with dwca.open_included_file('occurrence.txt') as f:
            raw_occ = f.read()

        self.assertTrue(raw_occ.endswith("'betta' splendens\n"))
def test_simplecsv_archive(self):
    """Ensure the reader works with archives consisting of a single CSV file.

    As described in page #2 of http://www.gbif.org/resource/80639, those
    archives consist of a single core data file whose first line provides
    the names of the Darwin Core terms represented in the published data.

    That also seems to match quite well the definition of Simple Darwin Core
    expressed as text: http://rs.tdwg.org/dwc/terms/simple/index.htm.
    """
    archive_names = (
        'dwca-simple-csv.zip',
        # Same tests again, but with DOS line endings in the data file
        'dwca-simple-csv-dos.zip',
        # And with a file where fields are not enclosed in double quotes
        'dwca-simple-csv-notenclosed.zip',
    )

    for archive_name in archive_names:
        with DwCAReader(sample_data_path(archive_name)) as dwca:
            # The number of rows must be right
            self.assertEqual(len(dwca.rows), 3)

            # Arbitrary data must be reachable
            second_row = dwca.get_corerow_by_position(1)
            self.assertEqual(second_row.data['decimallatitude'], '-31.98333')

            # There is no archive descriptor...
            self.assertIsNone(dwca.descriptor)
            # ...and no (scientific) metadata either
            self.assertIsNone(dwca.metadata)
def test_ignore_extension(self):
    """Check how extensions_to_ignore changes the extensions attached to core rows."""
    two_extensions_path = sample_data_path("dwca-2extensions.zip")
    star_path = sample_data_path("dwca-star-test-archive.zip")

    # Two extensions are available, one of them is ignored on request.
    with DwCAReader(two_extensions_path, extensions_to_ignore="description.txt") as multi_dwca:
        core_rows = [row for row in multi_dwca]

        # Row 0: 3 vernacular names; row 1: 1 vernacular name;
        # row 2: no extensions for this core line
        for index, expected_count in enumerate((3, 1, 0)):
            self.assertEqual(expected_count, len(core_rows[index].extensions))

    # Now the only extension of an archive is ignored.
    with DwCAReader(star_path, extensions_to_ignore="vernacularname.txt") as star_dwca:
        core_rows = [row for row in star_dwca]

        for index in range(4):
            self.assertEqual(0, len(core_rows[index].extensions))

    # Finally, asking to ignore an extension that does not exist must be
    # silently accepted, with everything else working as usual.
    with DwCAReader(two_extensions_path, extensions_to_ignore="helloworld.txt") as multi_dwca:
        core_rows = [row for row in multi_dwca]

        # Row 0: 3 vernacular names + 2 taxon descriptions;
        # row 1: 1 vernacular name, no taxon description;
        # row 2: no extensions for this core row
        for index, expected_count in enumerate((5, 1, 0)):
            self.assertEqual(expected_count, len(core_rows[index].extensions))
def test_auto_cleanup_directory(self):
    """With a directory as source there is nothing to create, hence nothing to clean up."""
    entries_before = os.listdir('.')

    # Open and leave the context manager straight away.
    with DwCAReader(sample_data_path('dwca-simple-dir')):
        pass

    entries_after = os.listdir('.')
    self.assertEqual(len(entries_before), len(entries_after))
def test_source_data_not_destroyed_directory(self):
    """A directory used as archive must survive being opened and closed.

    Guards against the cleanup routine meant for zipped files being called
    by accident on a directory source.
    """
    # First round: open the archive and close it straight away.
    first_reader = DwCAReader(DIRECTORY_ARCHIVE_PATH)
    first_reader.close()

    # Second round: this fails if the directory was destroyed above.
    second_reader = DwCAReader(DIRECTORY_ARCHIVE_PATH)
    metadata = second_reader.metadata
    self.assertIsInstance(metadata, ET.Element)
    second_reader.close()
def test_exception_invalid_simple_archives(self):
    """Ensure an exception is raised when simple archives can't be interpreted.

    Without a metafile, an archive is expected to hold a single core data
    file and possibly some metadata in EML.xml. When the archive doesn't
    follow this structure, python-dwca-reader can't detect the data file and
    should throw an InvalidArchive exception.
    """
    uninterpretable_archives = (
        # This one holds a random file in addition to data and EML.xml, so
        # the data file can't be singled out.
        INVALID_SIMPLE_TOOMUCH,
        INVALID_SIMPLE_TWO,
    )

    for archive_path in uninterpretable_archives:
        with self.assertRaises(InvalidArchive):
            reader = DwCAReader(archive_path)
            reader.close()
fullpath = tsvoutputfile if not os.path.isfile(inputfile): return None # Make an appropriate reader based on whether the archive is standard or a GBIF # download. dwcareader = None if type=='gbif': try: dwcareader = GBIFResultsReader(inputfile) except Exception, e: logging.error('GBIF archive %s has an exception: %s ' % (inputfile, e)) pass else: dwcareader = DwCAReader(inputfile) if dwcareader is None: print 'No viable archive found at %s' % inputfile return None termnames=list(dwcareader.descriptor.core.terms) shorttermnames=short_term_names(termnames) dialect = csv.excel dialect.lineterminator='\r' dialect.delimiter='\t' with open(fullpath, 'w') as tsvfile: writer = csv.DictWriter(tsvfile, dialect=dialect, fieldnames=shorttermnames, quoting=csv.QUOTE_NONE, quotechar='') writer.writeheader() rowcount = 0
def test_classic_opening(self):
    """Ensure it also works w/o the 'with' statement.

    The reader is created and closed by hand; its ``metadata`` attribute
    must be an ``ET.Element``.
    """
    dwca = DwCAReader(BASIC_ARCHIVE_PATH)
    try:
        self.assertIsInstance(dwca.metadata, ET.Element)
    finally:
        # Close even when the assertion fails, so that a failing run does
        # not leave the reader open behind it.
        dwca.close()