Example #1
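# NOTE: only the standard-library imports are shown here; the BCCVL-specific
# helpers used below (set_progress, download_occurrence_from_ala, build_source,
# build_destination, movelib, app, import_cleanup_job, import_ala_job,
# import_multi_species_csv_job, set_progress_job, import_cleanup, LOG) are
# assumed to be provided by the surrounding task module.
import os
import shutil

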
def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ALA', None, context)
    # 2. Download all the occurrence datasets in the params list; `results`
    #    collects the temporary directories created by the download so they
    #    can be removed in the finally block.
    results = []

    try:
        item, results = download_occurrence_from_ala(params, context)

        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress("RUNNING", u"Import dataset '{0}' from ALA".format(item['title']), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", u"Import of dataset '{0}' from ALA failed".format(item['title']), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", u"ALA import '{}' complete".format(item['title']), None, context)
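        # import_job, cleanup_job and finish_job are Celery task signatures:
        # link_error attaches failure callbacks, and chaining them with `|`
        # and calling .delay() below dispatches the pipeline asynchronously.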

        # Split multi-species dataset
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(item.get('file').get('url'),
                                                                   import_multspecies_params['results_dir'],
                                                                   import_multspecies_params['import_context'],
                                                                   context)
            import_multispecies_job.link_error(set_progress_job(
                "FAILED", u"Split multi-species dataset '{0}' from ALA failed".format(item['title']), None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download occurrence dataset from ALA: {}'.format(e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e, exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
Example #2
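# NOTE: only the standard-library imports are shown here; the BCCVL-specific
# helpers used below (set_progress, build_source, build_destination, movelib,
# app, extract_metadata, import_cleanup_job, import_ala_job, set_progress_job,
# import_cleanup, LOG) are assumed to be provided by the surrounding task
# module.
import json
import os
import shutil
import tempfile

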
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from GBIF'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    # tmpdir is initialised here so the finally block below never raises a
    # NameError if tempfile.mkdtemp() itself fails.
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
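        # movelib resolves the gbif:// source and downloads the dataset files
        # (described by gbif_dataset.json) into the local temporary directory.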
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from GBIF'.format(lsid), None, context)
        # open gbif_dataset.json
        with open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r') as f:
            gbif_ds = json.load(f)
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        with open(files['attribution']['url'], 'r') as f:
            gbif_md = json.load(f)
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            # compatibility... this should go away after we fully support 'layered'
            # occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species' ?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)), app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import GBIF data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
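        # The ALA import job is reused here to import the GBIF dataset item.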
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job("FAILED", "Import of GBIF data {0} failed".format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job("COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED', 'Download {0} from GBIF: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)