def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ALA', None, context)
    # 2. download all the occurrence datasets in the params list
    results = []
    try:
        item, results = download_occurrence_from_ala(params, context)

        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            #        compatibility... this should go away after we fully
            #        support 'layered' occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species'?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)),
                                app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress('RUNNING', u"Import dataset '{0}' from ALA".format(item['title']), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            'FAILED', u"Import of dataset '{0}' from ALA failed".format(item['title']), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', u"ALA import '{}' complete".format(item['title']), None, context)

        # split multi-species dataset if requested
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(
                item.get('file').get('url'),
                import_multspecies_params['results_dir'],
                import_multspecies_params['import_context'],
                context)
            import_multispecies_job.link_error(set_progress_job(
                'FAILED', u"Split of multi-species dataset '{0}' from ALA failed".format(item['title']),
                None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED', 'Download occurrence dataset from ALA failed: {}'.format(e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e, exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
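
# Illustrative sketch only (not part of the original module): the shape of the
# (item, results) tuple that download_occurrence_from_ala is assumed to return,
# inferred from the accesses above (item['file']['url'], item['title'],
# item['filemetadata'], and the cleanup loop over results). All concrete values
# are hypothetical.
_EXAMPLE_ALA_DOWNLOAD = (
    {   # item: dataset description used to build the import job
        'title': u'Example species occurrences',                 # hypothetical
        'file': {
            'url': 'file:///tmp/ala_xyz/ala_dataset.zip',        # local zip produced by the download
            'contenttype': 'application/zip',
            'filename': 'ala_dataset.zip',
        },
        'filemetadata': {
            # per-file metadata keyed by path inside the zip, e.g.
            # 'data/ala_occurrence.csv': {'metadata': {'rows': 123, 'headers': [...], 'bounds': {...}}}
        },
    },
    ['/tmp/ala_xyz'],   # results: temporary directories removed in the finally block
)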
def update_metadata(url, filename, contenttype, context):
    tmpdir = None
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)

        tmpdir = tempfile.mkdtemp()
        tmpfile = os.path.join(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)

        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }

        # Check that there are lon and lat columns.
        # If the upload is a csv file, we validate the column names as well.
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING', 'Import metadata for {0}'.format(url), None, context)

        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Metadata update failed for {0}'.format(url), None, context))
        finish_job = set_progress_job(
            'COMPLETED', 'Metadata update for {} complete'.format(url), None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED', 'Metadata update for {} failed: {}'.format(url, e), None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
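
# Illustrative sketch only (not part of the original module): what the
# 'text/csv' validation above expects extract_metadata to return. The 'headers'
# requirement is taken from the checks; the other keys mirror those copied to
# zip level elsewhere in this module and the values are hypothetical.
_EXAMPLE_CSV_FILEMETADATA = {
    'headers': ['species', 'lon', 'lat'],   # must contain 'lat' and 'lon'
    'rows': 42,                             # hypothetical
    'bounds': {'left': 112.0, 'bottom': -44.0, 'right': 154.0, 'top': -10.0},  # hypothetical
}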
def run_script(wrapper, params, context):
    # TODO: there are many little things that can fail here, and we
    #       need to communicate that properly back to the user.
    # TODO: however, we can't really do anything in case sending
    #       messages doesn't work.
    items = []
    try:
        errmsg = 'Failed to transfer/import data'
        set_progress('RUNNING', 'Transferring data', None, context)

        # create initial folder structure
        create_workenv(params)

        # transfer input files
        transfer_inputs(params, context)

        # create script
        scriptname = create_scripts(params, context)

        # run the script
        errmsg = 'Failed to run experiment'
        set_progress('RUNNING', 'Executing job', None, context)

        scriptout = os.path.join(params['env']['outputdir'],
                                 params['worker']['script']['name'] + 'out')
        outfile = open(scriptout, 'w')
        wrapsh = os.path.join(params['env']['scriptdir'], 'wrap.sh')
        with open(wrapsh, 'w') as f:
            f.write(wrapper)

        # zip up workenv if requested
        if params['worker'].get('zipworkenv', False):
            # make sure tmp is big enough
            # TODO: add toolkit name to zip name ... workenv_bioclim.zip
            zip_folder(os.path.join(params['env']['outputdir'], 'workenv.zip'),
                       params['env']['workdir'])

        cmd = ["/bin/bash", "-l", "wrap.sh", scriptname]
        LOG.info("Executing: %s", ' '.join(cmd))
        proc = subprocess.Popen(cmd, cwd=params['env']['scriptdir'], close_fds=True,
                                stdout=outfile, stderr=subprocess.STDOUT)
        rpid, ret, rusage = os.wait4(proc.pid, 0)
        outfile.close()
        # TODO: should we write this as a json file and send it back as a result,
        #       or just send rusage with the finished message?
        usage = get_rusage(rusage)
        # TODO: check whether ret and proc.returncode are the same

        # move results back
        errmsg = 'Failed to transfer results back'
        set_progress('RUNNING', 'Transferring outputs', usage, context)
        # TODO: maybe redesign this? transfer only uploads to the destination and
        #       stores the new url somewhere, and we do metadata extraction and
        #       item creation afterwards (here)?
        items = transfer_outputs(params, context)

        # we are done here; hand over to the result importer by building a
        # chain of the remaining tasks
        start_import = set_progress_job('RUNNING', 'Import results', None, context)

        cleanup_job = import_cleanup_job(params['result']['results_dir'], context)
        import_job = import_result_job(items, params['result']['results_dir'], context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))
        import_job.link_error(cleanup_job)

        if ret != 0:
            errmsg = 'Script execution failed with exit code {0}'.format(ret)
            finish_job = set_progress_job('FAILED', errmsg, None, context)
        else:
            finish_job = set_progress_job('COMPLETED', 'Task succeeded', None, context)

        (start_import | import_job | cleanup_job | finish_job).delay()

    except Exception:
        # TODO: capture stacktrace
        #       need to start import to get import cleaned up
        # Log error message with stacktrace.
        #   :( exposes internals, ugly hash, complicated with admin-only access
        #   -> certainly need to get rid of the exception in the message.
        # test exceptions:
        #   ... upload file, replace with something else (unzip error)
        #   ... delete file and rerun experiment (download error)
        #   ... create file/folder error? (can't write log)
        #   ... how to simulate fault? (download error)

        # log error message with exception and traceback
        LOG.error(errmsg, exc_info=True)
        start_import = set_progress_job('RUNNING', 'Import results', None, context)

        import_job = import_result_job(items, params['result']['results_dir'], context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Result import failed', None, context))
        finish_job = set_progress_job('FAILED', errmsg, None, context)

        (start_import | import_job | finish_job).delay()
        raise
    finally:
        # TODO: check if dir exists
        path = params['env'].get('workdir', None)
        if path and os.path.exists(path):
            shutil.rmtree(path)
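
# Illustrative sketch only (not part of the original module): the params
# structure run_script is assumed to consume, inferred from the dictionary
# accesses above (env.workdir/scriptdir/outputdir, worker.script.name,
# worker.zipworkenv, result.results_dir). All values are hypothetical.
_EXAMPLE_RUN_SCRIPT_PARAMS = {
    'env': {
        'workdir': '/tmp/work_abc',             # removed in the finally block
        'scriptdir': '/tmp/work_abc/script',    # wrap.sh is written and executed here
        'outputdir': '/tmp/work_abc/output',    # script output and optional workenv.zip
    },
    'worker': {
        'script': {'name': 'experiment.R'},     # stdout/stderr go to 'experiment.Rout'
        'zipworkenv': False,
    },
    'result': {
        'results_dir': 'swift+https://example.org/results',   # hypothetical destination url
    },
}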
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']

        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Is there a better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            #        compatibility... this should go away after we fully
            #        support 'layered' occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species'?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)),
                                app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress('RUNNING', 'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Import of gbif data failed {0}'.format(lsid), None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED', 'Download {0} from gbif failed: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
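
# Illustrative sketch only (not part of the original module): the layout of
# gbif_dataset.json assumed above, inferred from the reads of gbif_ds['files']
# (with 'dataset_type' and 'url' per entry), gbif_ds['title'] and
# gbif_ds['description']. Paths and values are hypothetical.
_EXAMPLE_GBIF_DATASET_JSON = {
    'title': 'GBIF occurrences for <taxon>',                     # hypothetical
    'description': 'Occurrence records downloaded from GBIF',    # hypothetical
    'files': [
        # 'occurrence' points at the downloaded occurrence zip,
        # 'attribution' at a json file with the taxon metadata
        {'dataset_type': 'occurrence', 'url': '/tmp/gbif_download_x/gbif_occurrence.zip'},
        {'dataset_type': 'attribution', 'url': '/tmp/gbif_download_x/gbif_metadata.json'},
    ],
}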
def import_multi_species_csv(url, results_dir, import_context, context):
    # url ........... source file
    # results_dir ... folder to place split files into
    # context ....... the context with user and orig dataset
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)

        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)

        # get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]

        # extract the occurrence file from the downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, 'text/csv')
            }
            occfile = tmpfile
            occmd = item['filemetadata']

        # Check that there are lon and lat columns.
        # If the upload is a csv file, we validate the column names as well.
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")

        set_progress('RUNNING', 'Import metadata for {0}'.format(url), None, context)

        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            'FAILED', 'Metadata update failed for {0}'.format(url), None, context))

        # step 2: split the csv file and create sub datasets
        #         start reading the csv file and create new datasets, which will
        #         be linked up with the dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create a dict with all the data .... the species column is used as the
        # key, and the rest of the row is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create a new entry for this species
                fname = u'{0}.csv'.format(species).replace(u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                fp = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(fp)
                fwriter.writerow(headers)
                data[species] = {
                    'file': fp,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        f.close()
        # ok, we have got all the data split into separate files;
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(
                os.path.join(results_dir, data[species]['name']),
                app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING', 'Create datasets for {0}'.format(url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)

        # start import process
        start_import = set_progress_job('RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job('COMPLETED', 'Task succeeded', None, context)

        (start_import | import_md_job | import_job | cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set the multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting multi-species CSV {}: {}'.format(url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
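
# Illustrative sketch only (not part of the original module): UnicodeCSVReader
# and UnicodeCSVWriter are imported from elsewhere in the package; this is a
# hedged guess at what they do, namely thin Python 2 wrappers around the stdlib
# csv module (encode to utf-8 for csv, decode back to unicode), in the spirit of
# the UnicodeReader/UnicodeWriter recipes in the csv module documentation.
import csv


class _SketchUnicodeCSVReader(object):
    """Reads rows from a unicode text stream and yields lists of unicode cells."""

    def __init__(self, stream, **kwargs):
        # csv.reader needs byte strings in Python 2, so encode each line
        self.reader = csv.reader((line.encode('utf-8') for line in stream), **kwargs)

    def next(self):
        # decode each parsed cell back to unicode
        return [cell.decode('utf-8') for cell in self.reader.next()]

    def __iter__(self):
        return self


class _SketchUnicodeCSVWriter(object):
    """Writes rows of unicode cells to a binary stream as utf-8 encoded csv."""

    def __init__(self, stream, **kwargs):
        self.writer = csv.writer(stream, **kwargs)

    def writerow(self, row):
        self.writer.writerow([unicode(cell).encode('utf-8') for cell in row])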