def test_row_source_missing_metadata(self):
     """Rows without source metadata must expose ``source_metadata`` as None."""
     archive_path = sample_data_path('gbif-results-lacks-s-metadata.zip')
     with GBIFResultsReader(archive_path) as reader:
         # The archive has source metadata, but not for every dataset/row;
         # in those cases the row should report None.
         row = reader.get_corerow_by_id('607759330')
         self.assertEqual(None, row.source_metadata)
Exemplo n.º 2
0
    def test_dwcareader_features(self):
        """Check that the basic features inherited from DwCAReader still work."""
        with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
            row_count = len(reader.rows)
            self.assertEqual(158, row_count)

            core_type = reader.descriptor.core.type
            self.assertEqual(
                'http://rs.tdwg.org/dwc/terms/Occurrence', core_type)

            # Spot-check the content of the first core row.
            first = reader.rows[0]
            family = first.data[qn('family')]
            self.assertEqual('Tetraodontidae', family)
            self.assertEqual([], first.extensions)
    def test_row_human_representation(self):
        """Check the text produced by ``str()`` on core and extension rows."""
        # --- Simple archive: a core row with no extensions and no metadata.
        simple_path = sample_data_path('dwca-simple-test-archive.zip')
        with DwCAReader(simple_path) as basic_dwca:
            row_text = str(basic_dwca.rows[0])
            for fragment in (
                    "Rowtype: http://rs.tdwg.org/dwc/terms/Occurrence",
                    "Source: Core file",
                    "Row id:",
                    "Reference extension rows: No",
                    "Reference source metadata: No"):
                self.assertIn(fragment, row_text)

            # The expected text differs by interpreter: the Python 2 variant
            # carries a u'' prefix on the value.
            if sys.version_info[0] == 2:
                name_fragment = (
                    "http://rs.tdwg.org/dwc/terms/scientificName': u'tetraodon fluviatilis'")
            else:
                name_fragment = (
                    "http://rs.tdwg.org/dwc/terms/scientificName': 'tetraodon fluviatilis'")
            self.assertIn(name_fragment, row_text)

        # --- Star archive: a core row that references extension rows.
        star_path = sample_data_path('dwca-star-test-archive.zip')
        with DwCAReader(star_path) as star_dwca:
            core_row = star_dwca.rows[0]
            core_text = str(core_row)
            for fragment in (
                    "Rowtype: http://rs.tdwg.org/dwc/terms/Taxon",
                    "Source: Core file",
                    "Row id: 1",
                    "Reference extension rows: Yes",
                    "Reference source metadata: No"):
                self.assertIn(fragment, core_text)

            # The first extension row has its own representation.
            extension_text = str(core_row.extensions[0])
            for fragment in (
                    "Rowtype: http://rs.gbif.org/terms/1.0/VernacularName",
                    "Source: Extension file",
                    "Core row id: 1",
                    "ostrich",
                    "Reference extension rows: No",
                    "Reference source metadata: No"):
                self.assertIn(fragment, extension_text)

        # --- GBIF results archive: the core row references source metadata.
        gbif_path = sample_data_path('gbif-results.zip')
        with GBIFResultsReader(gbif_path) as gbif_dwca:
            gbif_text = str(gbif_dwca.rows[0])
            for fragment in (
                    "Rowtype: http://rs.tdwg.org/dwc/terms/Occurrence",
                    "Source: Core file",
                    "Reference source metadata: Yes"):
                self.assertIn(fragment, gbif_text)
Exemplo n.º 4
0
def main():
    """Command-line entry point.

    Sets logging to DEBUG and reads the options through ``_getoptions()``.
    When ``options.dwca_file`` is None the usage line is printed and the
    function returns. Otherwise, for the 'gbif' archive type, it tries to
    open the file with ``GBIFResultsReader`` and logs any exception raised.
    """
    logging.basicConfig(level=logging.DEBUG)
    options = _getoptions()
    if options.dwca_file is None:
        # The parenthesised form prints the same text on Python 2 and 3.
        print('syntax: dwca_utils.py -f dwca_file [-v vocab_path] [-t archive_type]')
        return

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    # NOTE(review): only the GBIF branch is present in this block and the
    # reader is not used afterwards -- confirm against the full function.
    dwcareader = None
    if options.archive_type == 'gbif':
        try:
            dwcareader = GBIFResultsReader(options.dwca_file)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            logging.error('GBIF archive %s has an exception: %s ' %
                          (options.dwca_file, e))
def dwca_metadata_from_file(inputfile, archivetype=None):
    ''' Return metadata from a Darwin Core Archive file.

    inputfile - path to the archive file; None or a blank string yields None
    archivetype - 'gbif' (case-insensitive) selects the GBIF results reader
        (optional; default None)

    NOTE(review): as shown here the function only tries to create the GBIF
    reader and logs a failure; the metadata extraction itself is not part of
    this block -- confirm against the full function.
    '''
    if inputfile is None or not inputfile.strip():
        return None

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            dwcareader = GBIFResultsReader(inputfile)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            # Best effort: report the problem and carry on without a reader.
            logging.error('Unable to read GBIF archive %s. %s %s',
                          inputfile, e, __version__)
Exemplo n.º 6
0
    def test_row_source_metadata(self):
        """Rows looked up by id expose their source metadata as ``ET.Element``."""
        with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
            # First row checked: the creator's given name.
            metadata = reader.get_corerow_by_id('607759330').source_metadata
            self.assertIsInstance(metadata, ET.Element)

            creator = metadata.find('dataset').find('creator')
            given_name = creator.find('individualName').find('givenName').text
            self.assertEqual(given_name, 'Stanley')

            # Second row checked: the dataset language.
            metadata = reader.get_corerow_by_id('782700656').source_metadata
            self.assertIsInstance(metadata, ET.Element)

            language = metadata.find('dataset').find('language').text
            self.assertEqual(language, 'en')
def dwca_core_to_tsv():
    """Save the core of the archive to a csv file with short DwC term names as headers.

    The input and output paths are read from the names ``dwcafile`` and
    ``tsvoutputfile``, which are not defined in this function. Returns None
    when the input path is not an existing file.

    NOTE(review): as shown here the function stops after trying to create the
    GBIF reader; the writing step is not part of this block.
    """
    inputfile = dwcafile
    fullpath = tsvoutputfile

    if not os.path.isfile(inputfile):
        return None

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    # NOTE(review): ``type`` is not assigned in this function. Unless the
    # module defines its own ``type``, this compares the builtin with a
    # string and is never true -- confirm the intended variable.
    if type == 'gbif':
        try:
            dwcareader = GBIFResultsReader(inputfile)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            # Best effort: report the problem and carry on without a reader.
            logging.error('GBIF archive %s has an exception: %s ' %
                          (inputfile, e))
Exemplo n.º 8
0
    def test_source_metadata(self):
        """Check the size, keys and content of the reader's ``source_metadata``."""
        known_uuid = 'eccf4b09-f0c8-462d-a48c-41a7ce36815a'
        with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
            # There are 23 EML files in dataset/
            self.assertEqual(23, len(reader.source_metadata))

            # A known key is present, a made-up one is not.
            self.assertTrue(known_uuid in reader.source_metadata)
            self.assertFalse('incorrect-UUID' in reader.source_metadata)

            # The entry under the known key must be the right EML file
            # (checked by content).
            eml = reader.source_metadata[known_uuid]
            self.assertIsInstance(eml, ET.Element)

            # Basic fields can be read from the EML:
            individual = eml.find('dataset').find('creator').find(
                'individualName')
            self.assertEqual(individual.find('givenName').text, 'Rob')
def dwca_core_to_tsv(options):
    ''' Save the core of the archive to a tsv file with DwC term names as headers.
    options - a dictionary of parameters
        loglevel - the level at which to log (e.g., DEBUG)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input Darwin Core archive file (required)
        outputfile - file name of the tsv output file, no path (optional)
        archivetype - archive type ('standard' or 'gbif') (optional; default 'standard')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        rowcount - the number of rows in the Darwin Core archive file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = [
        'workspace', 'outputfile', 'rowcount', 'success', 'message',
        'artifacts'
    ]

    ### Standard outputs ###
    success = False

    ### Custom outputs ###
    rowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def _option(key, default):
        # Look up one option; a missing key or a non-subscriptable options
        # object (e.g. None) falls back to the default.
        try:
            return options[key]
        except (KeyError, TypeError):
            return default

    def _failure(message):
        # Log the reason and build the response from the current values.
        returnvals = [
            workspace, outputfile, rowcount, success, message, artifacts
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Establish variables ###
    outputfile = None
    workspace = _option('workspace', './')

    ### Required inputs ###
    inputfile = _option('inputfile', None)
    if not inputfile:
        return _failure('No input file given. %s' % __version__)

    # Look to see if the input file is at the absolute path or in the workspace.
    if not os.path.isfile(inputfile):
        in_workspace = workspace + '/' + inputfile
        if not os.path.isfile(in_workspace):
            return _failure(
                'Input file %s not found. %s' % (inputfile, __version__))
        inputfile = in_workspace

    # Without a usable output name, generate a unique one; the file always
    # goes into the workspace.
    outputfile = _option('outputfile', None)
    if not outputfile:
        outputfile = 'dwca_%s.txt' % str(uuid.uuid1())
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    archivetype = _option('archivetype', 'standard')

    # Note: The DwCAReader creates a temporary directory of its own and cleans it up
    # Make a reader based on whether the archive is standard or a GBIF download.
    # NOTE(review): only the GBIF branch is present in this block; the
    # standard-archive branch and the success response are not shown here.
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            with GBIFResultsReader(inputfile) as dwcareader:
                rowcount = write_core_csv_file(dwcareader, outputfile)
        except Exception as e:  # "as" form is valid on Python 2.6+ and 3
            message = 'Error %s ' % e
            message += 'reading GBIF archive: %s. %s' % (inputfile,
                                                         __version__)
            return _failure(message)
Exemplo n.º 10
0
 def test_row_source_missing_metadata(self):
     """Rows without source metadata must expose ``source_metadata`` as None."""
     with GBIFResultsReader(MISSINGMETA_PATH) as reader:
         # The archive has source metadata, but not for every dataset/row;
         # in those cases the row should report None.
         row = reader.get_corerow_by_id('607759330')
         found = row.source_metadata
         self.assertEqual(None, found)
Exemplo n.º 11
0
 def test_rights_access(self):
     """The reader's ``rights`` must equal the expected rights.txt content."""
     expected = self.RIGHTS_CONTENT
     with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
         actual = reader.rights
         self.assertEqual(expected, actual)
Exemplo n.º 12
0
 def test_citations_access(self):
     """The reader's ``citations`` must equal the expected citations.txt content."""
     expected = self.CITATIONS_CONTENT
     with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
         actual = reader.citations
         self.assertEqual(expected, actual)
 def test_rights_access(self):
     """The reader's ``rights`` must equal the expected rights.txt content."""
     archive_path = sample_data_path('gbif-results.zip')
     with GBIFResultsReader(archive_path) as reader:
         expected = self.RIGHTS_CONTENT
         self.assertEqual(expected, reader.rights)
 def test_citations_access(self):
     """The reader's ``citations`` must equal the expected citations.txt content."""
     archive_path = sample_data_path('gbif-results.zip')
     with GBIFResultsReader(archive_path) as reader:
         expected = self.CITATIONS_CONTENT
         self.assertEqual(expected, reader.citations)