def update_metadata(url, filename, contenttype, context):
    # initialise tmpdir before the try block so the finally clause can't raise
    # a NameError if mkdtemp is never reached
    tmpdir = None
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)

        tmpdir = tempfile.mkdtemp()
        tmpfile = '{}/{}'.format(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)

        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url), None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
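
# Usage sketch (illustrative only, not part of the task wiring): update_metadata
# reads context['user']['id'] above; the URL, filename and user id below are
# hypothetical placeholders.
def _example_update_metadata_call():
    example_context = {
        'user': {'id': 'jane.doe'},  # looked up via context.get('user', {}).get('id')
    }
    update_metadata('https://example.org/occurrences.csv',
                    'occurrences.csv', 'text/csv', example_context)
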
def download_occurrence_from_ala(params, context):
    results = []
    species = []  # a list of species metadata
    ds_names = []

    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']  # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully downloaded (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)
        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(
            occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING',
                     'Extract metadata for {0} from ala'.format(dataset['query']),
                     None, context)
        # open ala_dataset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Shall not happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it up and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(
            ds_name, imported_date)
    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(
                ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }
    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
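
# combine_csv() and zip_occurrence_data() used above are helpers defined
# elsewhere in this package. Below is a minimal sketch of what
# zip_occurrence_data might look like, assuming it simply packs the named files
# from the data directory into the target zip under a 'data/' prefix; the real
# helper may differ.
def _zip_occurrence_data_sketch(zip_path, data_dir, filenames):
    import zipfile as _zipfile
    with _zipfile.ZipFile(zip_path, 'w', _zipfile.ZIP_DEFLATED) as zf:
        for name in filenames:
            fpath = os.path.join(data_dir, name)
            if os.path.exists(fpath):
                # store entries as data/<name>, matching the layout read back
                # elsewhere (e.g. 'data/ala_occurrence.csv')
                zf.write(fpath, os.path.join('data', name))
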
def createItem(fname, info, params):
    # fname: full path to file
    # info:  ... from outputmap
    name = os.path.basename(fname)
    # layermd ... metadata about raster layer
    layermd = {}
    # bccvlmd ... bccvl specific metadata
    bccvlmd = {}
    genre = info.get('genre', None)
    if genre:
        bccvlmd['genre'] = genre
        if genre in ('DataGenreSDMModel', 'DataGenreCP', 'DataGenreCP_ENVLOP',
                     'DataGenreClampingMask'):
            if genre == 'DataGenreClampingMask':
                layermd = {'files': {name: {'layer': 'clamping_mask',
                                            'data_type': 'Discrete'}}}
            elif genre in ('DataGenreCP', 'DataGenreCP_ENVLOP'):
                if params['function'] in ('circles', 'convhull', 'voronoihull'):
                    layermd = {'files': {name: {'layer': 'projection_binary',
                                                'data_type': 'Continuous'}}}
                elif params['function'] in ('maxent',):
                    layermd = {'files': {name: {'layer': 'projection_suitablity',
                                                'data_type': 'Continuous'}}}
                else:
                    layermd = {'files': {name: {'layer': 'projection_probability',
                                                'data_type': 'Continuous'}}}
            # FIXME: find a cleaner way to attach metadata
            for key in ('year', 'month', 'emsc', 'gcm'):
                if key in params:
                    bccvlmd[key] = params[key]
        elif genre == 'DataGenreSDMEval' and info.get('mimetype') == 'text/csv':
            # Only get threshold values from the output of Sama's evaluation script
            # FIXME: should not depend on file name (has already changed once
            #        and caused disappearance of threshold values in biodiverse)
            if fname.endswith('Loss function intervals table.csv'):
                thresholds = extractThresholdValues(fname)
                # FIXME: merge thresholds?
                bccvlmd['thresholds'] = thresholds
        elif genre == 'DataGenreBiodiverseOutput':
            # Add in the srs and cellsize for Biodiverse
            bccvlmd['srs'] = 'epsg:3577'
            bccvlmd['cellsize'] = params['cluster_size']
    # make sure we have a mimetype
    mimetype = info.get('mimetype', None)
    if mimetype is None:
        mimetype = guess_mimetype(fname)
    # extract file metadata
    filemd = extract_metadata(fname, mimetype)
    # FIXME: check keys to make sense
    #        -> merge layermd and filemetadata?
    #        -> merge bccvlmd and filemetadata?
    return {
        'file': {
            'url': 'file://{}'.format(fname),  # local file url
            'contenttype': mimetype,
            'filename': name
        },
        'title': name,
        'description': info.get('title', u''),
        'bccvlmetadata': bccvlmd,
        'filemetadata': filemd,
        'layermd': layermd,
        'order': info.get('order', 999999)
    }
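
# guess_mimetype() used above is imported from elsewhere in the package. A
# minimal sketch of such a helper based on the stdlib mimetypes module
# (assumption: the real implementation may also inspect file contents):
def _guess_mimetype_sketch(fname):
    import mimetypes
    mtype, _ = mimetypes.guess_type(fname)
    return mtype or 'application/octet-stream'
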
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    # initialise tmpdir before the try block so the finally clause can't raise
    # a NameError if mkdtemp is never reached
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING',
                     'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Any better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            #        compatibility... this should go away after we fully support
            #        'layered' occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species'?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = \
                        item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)),
                                app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Import of gbif data failed {0}".format(lsid),
            None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            "COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()

    except Exception as e:
        set_progress('FAILED',
                     'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e,
                  exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
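
# For reference, the structure pull_occurrences_from_gbif expects in the
# downloaded gbif_dataset.json, inferred from the keys accessed above
# ('title', 'description', 'files' entries with 'dataset_type'/'url'); the
# concrete paths and values here are hypothetical examples:
_EXAMPLE_GBIF_DATASET = {
    'title': 'Example species occurrences',
    'description': 'Occurrence records downloaded from GBIF',
    'files': [
        {'dataset_type': 'occurrence',
         'url': '/tmp/gbif_download_xyz/data/gbif_occurrence.zip'},
        {'dataset_type': 'attribution',
         'url': '/tmp/gbif_download_xyz/gbif_metadata.json'},
    ],
}
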
def import_multi_species_csv(url, results_dir, import_context, context):
    # url ........... source file
    # results_dir ... folder to place split files into
    # context ....... the context with user and orig dataset
    # initialise tmpdir before the try block so the finally clause can't raise
    # a NameError if mkdtemp is never reached
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url), None, context)
        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        # linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)

        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']

        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )

        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(
                os.path.join(results_dir, data[species]['name']),
                app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)

        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(url),
                     None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)

        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job |
         cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage

    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(
                         url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e,
                  exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
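
# Standalone illustration of the per-species split performed above, using the
# stdlib csv module instead of UnicodeCSVReader/UnicodeCSVWriter (simplified:
# no encoding handling, metadata extraction or upload; function name and
# arguments are illustrative only):
def _split_by_species_sketch(occ_csv_path, out_dir):
    import csv
    outfiles = {}
    writers = {}
    with open(occ_csv_path, 'rb') as infile:
        reader = csv.reader(infile)
        headers = reader.next()
        speciesidx = headers.index('species')
        for row in reader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in writers:
                # one output file per species, header row repeated in each
                fpath = os.path.join(
                    out_dir, '{0}.csv'.format(species.replace('/', '_')))
                outfiles[species] = open(fpath, 'wb')
                writers[species] = csv.writer(outfiles[species])
                writers[species].writerow(headers)
            writers[species].writerow(row)
    for outfile in outfiles.values():
        outfile.close()
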