def test_lines_to_ignore(self):
    """lines_to_ignore mirrors ignoreHeaderLines and is 0 when the attribute is absent."""
    # With explicit "0"
    explicit_zero = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """

    # With explicit 1
    explicit_one = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """

    # Implicit 0 (when nothing stated)
    nothing_stated = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """

    cases = (
        (explicit_zero, 0),
        (explicit_one, 1),
        (nothing_stated, 0),
    )
    # Cases run in the same order as before; the first failing one stops the test.
    for section_xml, expected_count in cases:
        section_root = ET.fromstring(section_xml)
        descriptor = DataFileDescriptor.make_from_metafile_section(section_root)
        self.assertEqual(descriptor.lines_to_ignore, expected_count)
def test_lines_to_ignore(self):
    """Descriptor exposes ignoreHeaderLines as lines_to_ignore (0 if not stated)."""

    def ignored_line_count(section_xml):
        # Parse one metafile section and read back the value under test.
        element = ET.fromstring(section_xml)
        return DataFileDescriptor.make_from_metafile_section(element).lines_to_ignore

    # With explicit "0"
    xml_with_zero = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """
    self.assertEqual(ignored_line_count(xml_with_zero), 0)

    # With explicit 1
    xml_with_one = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """
    self.assertEqual(ignored_line_count(xml_with_one), 1)

    # Implicit 0 (when nothing stated)
    xml_without_attribute = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
    </core>
    """
    self.assertEqual(ignored_line_count(xml_without_attribute), 0)
def test_fields(self):
    """The descriptor's .fields lists every <field> as a term/index/default dict."""
    metaxml_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """

    core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

    # .fields is supposed to return a list of dicts like those
    expected_fields = (
        {"term": "http://rs.tdwg.org/dwc/terms/country", "index": None, "default": "Belgium"},
        {"term": "http://rs.tdwg.org/dwc/terms/scientificName", "index": 1, "default": None},
    )

    for ef in expected_fields:
        # assertIn names the missing dict and the container on failure,
        # where assertTrue(ef in ...) would only say "False is not true".
        self.assertIn(ef, core_descriptor.fields)

    # One entry per <field> element; the <id> element is not counted.
    self.assertEqual(len(core_descriptor.fields), 5)
def test_headers_unordered(self):
    """.headers follows the index attributes, not the order of elements in the XML."""
    shuffled_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
        <files>
            <location>taxon.txt</location>
        </files>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/phylum"/>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/order"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/class"/>
        <field index="6" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
        <field index="5" term="http://rs.tdwg.org/dwc/terms/genus"/>
    </core>
    """
    section_root = ET.fromstring(shuffled_section)
    descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    # Listed by ascending index (0..6).
    headers_by_index = [
        'id',
        'http://rs.tdwg.org/dwc/terms/order',
        'http://rs.tdwg.org/dwc/terms/class',
        'http://rs.tdwg.org/dwc/terms/kingdom',
        'http://rs.tdwg.org/dwc/terms/phylum',
        'http://rs.tdwg.org/dwc/terms/genus',
        'http://rs.tdwg.org/dwc/terms/family',
    ]
    self.assertEqual(descriptor.headers, headers_by_index)
def test_headers_unordered(self):
    """Headers come back sorted by column index even if the XML lists fields out of order."""
    term_prefix_free_xml = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
        <files>
            <location>taxon.txt</location>
        </files>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/phylum"/>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/order"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/class"/>
        <field index="6" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
        <field index="5" term="http://rs.tdwg.org/dwc/terms/genus"/>
    </core>
    """
    parsed_core = ET.fromstring(term_prefix_free_xml)
    taxon_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_core)

    self.assertEqual(
        taxon_descriptor.headers,
        [
            "id",
            "http://rs.tdwg.org/dwc/terms/order",
            "http://rs.tdwg.org/dwc/terms/class",
            "http://rs.tdwg.org/dwc/terms/kingdom",
            "http://rs.tdwg.org/dwc/terms/phylum",
            "http://rs.tdwg.org/dwc/terms/genus",
            "http://rs.tdwg.org/dwc/terms/family",
        ],
    )
def test_short_headers(self):
    """.short_headers holds the last URL segment of each indexed term, plus 'id'."""
    occurrence_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """
    section_root = ET.fromstring(occurrence_section)
    descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    # The default-only "country" field has no index and is not expected here.
    wanted = ["id", "scientificName", "basisOfRecord", "family", "locality"]
    self.assertEqual(descriptor.short_headers, wanted)
def test_fields(self):
    """Each <field> of the section appears in .fields as a term/index/default dict."""
    metaxml_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """

    core_descriptor = DataFileDescriptor.make_from_metafile_section(
        ET.fromstring(metaxml_section))

    # .fields is supposed to return a list of dicts like those
    expected_fields = (
        {
            'term': 'http://rs.tdwg.org/dwc/terms/country',
            'index': None,
            'default': 'Belgium',
        },
        {
            'term': 'http://rs.tdwg.org/dwc/terms/scientificName',
            'index': 1,
            'default': None,
        },
    )

    for ef in expected_fields:
        # Unlike assertTrue(ef in ...), assertIn prints the missing dict and
        # the actual field list when the check fails.
        self.assertIn(ef, core_descriptor.fields)

    # Five <field> elements are declared above (the <id> element is separate).
    self.assertEqual(len(core_descriptor.fields), 5)
def test_headers_defaultvalue(self):
    """Fields that only carry a default value (no column in the file) are left out of .headers."""
    section_with_default = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """
    section_root = ET.fromstring(section_with_default)
    descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    # "country" is declared with a default and no index, so it is absent below.
    indexed_headers = [
        'id',
        'http://rs.tdwg.org/dwc/terms/scientificName',
        'http://rs.tdwg.org/dwc/terms/basisOfRecord',
        'http://rs.tdwg.org/dwc/terms/family',
        'http://rs.tdwg.org/dwc/terms/locality',
    ]
    self.assertEqual(descriptor.headers, indexed_headers)
def test_lines_to_ignore_attribute(self):
    """.lines_to_ignore works as documented"""
    # Raw strings: the backslash sequences reach the XML parser as written.
    metaxml_section = r"""
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
    </core>
    """

    descriptor = DataFileDescriptor.make_from_metafile_section(
        ET.fromstring(metaxml_section))
    data_file = CSVDataFile(sample_data_path('dwca-simple-dir'), descriptor)
    # Close the data file when the test ends, even if an assertion fails.
    self.addCleanup(data_file.close)
    self.assertEqual(data_file.lines_to_ignore, 1)

    metaxml_section = r"""
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="3" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
    </core>
    """

    descriptor = DataFileDescriptor.make_from_metafile_section(
        ET.fromstring(metaxml_section))
    data_file = CSVDataFile(sample_data_path('dwca-simple-dir'), descriptor)
    # The second instance gets its own cleanup; the first one is already registered.
    self.addCleanup(data_file.close)
    self.assertEqual(data_file.lines_to_ignore, 3)
def test_exposes_coreid_index_of_extensions(self):
    """An extension descriptor reports coreid_index and leaves id_index as None."""
    description_extension = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    extension_root = ET.fromstring(description_extension)
    descriptor = DataFileDescriptor.make_from_metafile_section(extension_root)

    self.assertEqual(descriptor.coreid_index, 0)

    # ... but it doesn't have .id_index (only for core!)
    self.assertIsNone(descriptor.id_index)
def test_content_raw_element_tag(self):
    """raw_element exposes the section's tag, its attributes and its <field> children."""
    description_extension = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    extension_root = ET.fromstring(description_extension)
    descriptor = DataFileDescriptor.make_from_metafile_section(extension_root)

    self.assertEqual(descriptor.raw_element.tag, 'extension')
    self.assertEqual(descriptor.raw_element.get('encoding'), 'utf-8')
    self.assertEqual(len(descriptor.raw_element.findall('field')), 3)
def test_exposes_coreid_index_of_extensions(self):
    """coreid_index is set for an <extension> section; id_index stays None."""
    extension_xml = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    parsed_extension = ET.fromstring(extension_xml)
    described = DataFileDescriptor.make_from_metafile_section(parsed_extension)

    self.assertEqual(described.coreid_index, 0)

    # ... but it doesn't have .id_index (only for core!)
    self.assertIsNone(described.id_index)
def test_content_raw_element_tag(self):
    """Check the tag, the encoding attribute and the field count of raw_element."""
    extension_xml = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    parsed_extension = ET.fromstring(extension_xml)
    described = DataFileDescriptor.make_from_metafile_section(parsed_extension)

    self.assertEqual(described.raw_element.tag, "extension")
    self.assertEqual(described.raw_element.get("encoding"), "utf-8")
    self.assertEqual(len(described.raw_element.findall("field")), 3)
def test_iterate(self):
    """Iterating over a CSVDataFile yields every row as a str."""
    # Raw string: the backslash sequences reach the XML parser as written.
    metaxml_section = r"""
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files><location>occurrence.txt</location></files>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
    </core>
    """

    descriptor = DataFileDescriptor.make_from_metafile_section(
        ET.fromstring(metaxml_section))

    data_file = CSVDataFile(sample_data_path("dwca-simple-dir"), descriptor)
    # Close the data file when the test ends, even if an assertion fails.
    self.addCleanup(data_file.close)

    for row in data_file:
        self.assertIsInstance(row, str)
def test_file_details(self):
    """file_location and file_encoding are read from the metafile section."""
    occurrence_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """
    section_root = ET.fromstring(occurrence_section)
    descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    self.assertEqual(descriptor.file_location, 'occurrence.txt')
    self.assertEqual(descriptor.file_encoding, 'utf-8')
def test_close(self):
    """After close(), reading a row from the data file raises ValueError."""
    occurrence_section = r"""
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files><location>occurrence.txt</location></files>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
    </core>
    """
    section_root = ET.fromstring(occurrence_section)
    file_descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    closed_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, file_descriptor)
    closed_file.close()

    # It's not possible anymore to access the data because file has been closed.
    with self.assertRaises(ValueError):
        closed_file.get_row_by_position(1)
def test_file_descriptor_attribute(self):
    """The instance of DataFileDescriptor which is passed to the constructor is available in .file_descriptor"""
    # Raw string: the backslash sequences reach the XML parser as written.
    metaxml_section = r"""
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
    </core>
    """

    descriptor = DataFileDescriptor.make_from_metafile_section(
        ET.fromstring(metaxml_section))

    data_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, descriptor)
    # Close the data file when the test ends, even if the assertion fails.
    self.addCleanup(data_file.close)

    self.assertEqual(data_file.file_descriptor, descriptor)
def test_exposes_id_index_of_core(self):
    """A core descriptor reports id_index and leaves coreid_index as None."""
    occurrence_section = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """
    section_root = ET.fromstring(occurrence_section)
    descriptor = DataFileDescriptor.make_from_metafile_section(section_root)

    self.assertEqual(descriptor.id_index, 0)

    # ... but it doesn't have .coreid_index (only for extensions!)
    self.assertIsNone(descriptor.coreid_index)
def test_tell_if_represents_core(self):
    """represents_corefile / represents_extension tell core and extension descriptors apart."""
    # 1. Test with core
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        from_core = archive.descriptor.core
        self.assertTrue(from_core.represents_corefile)
        self.assertFalse(from_core.represents_extension)

    # 2. And with extension
    description_extension = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    extension_root = ET.fromstring(description_extension)
    from_extension = DataFileDescriptor.make_from_metafile_section(extension_root)

    self.assertFalse(from_extension.represents_corefile)
    self.assertTrue(from_extension.represents_extension)
def test_tell_if_represents_core(self):
    """A descriptor knows whether it stands for the core file or for an extension."""
    # 1. Test with core
    with DwCAReader(BASIC_ARCHIVE_PATH) as reader:
        core_side = reader.descriptor.core
        self.assertTrue(core_side.represents_corefile)
        self.assertFalse(core_side.represents_extension)

    # 2. And with extension
    extension_xml = """
    <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
        <files><location>description.txt</location></files>
        <coreid index="0" />
        <field index="1" term="http://purl.org/dc/terms/type"/>
        <field index="2" term="http://purl.org/dc/terms/language"/>
        <field index="3" term="http://purl.org/dc/terms/description"/>
    </extension>
    """
    parsed_extension = ET.fromstring(extension_xml)
    extension_side = DataFileDescriptor.make_from_metafile_section(parsed_extension)

    self.assertFalse(extension_side.represents_corefile)
    self.assertTrue(extension_side.represents_extension)
def test_exposes_id_index_of_core(self):
    """id_index is set for a <core> section; coreid_index stays None."""
    core_xml = """
    <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
        <files>
            <location>occurrence.txt</location>
        </files>
        <id index="0" />
        <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
        <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
        <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
    </core>
    """
    parsed_core = ET.fromstring(core_xml)
    described = DataFileDescriptor.make_from_metafile_section(parsed_core)

    self.assertEqual(described.id_index, 0)

    # ... but it doesn't have .coreid_index (only for extensions!)
    self.assertIsNone(described.coreid_index)