def _download_occurrence(occurrence_url, dest):
    """
    Downloads Species Occurrence data from ALA (Atlas of Living Australia)

    The downloaded archive must contain 'data.csv'; 'citation.csv' is
    optional. Both are moved into '<dest>/data' (as 'ala_occurrence.csv'
    and 'ala_citation.csv') and re-zipped into '<dest>/ala_occurrence.zip'.

    @param occurrence_url: the url to download species occurrence data
    @type occurrence_url: str
    @param dest: the destination directory that the ALA files are going to
                 end up inside of on the remote machine. Used to form the
                 metadata .json file.
    @type dest: str
    @return: description of the generated archive with the keys 'url',
             'name', 'content_type' and 'lsids' (the value returned by
             _get_species_guid_from_csv for the occurrence csv)
    @rtype: dict
    @raise KeyError: if 'data.csv' is missing from the downloaded archive
    @raise Exception: any other download/extraction error is logged and
                      re-raised unchanged
    """
    # TODO: validate dest is a dir?
    temp_file = None
    lsid_list = []
    try:
        data_dest = os.path.join(dest, 'data')
        occurrence_csv = os.path.join(data_dest, 'ala_occurrence.csv')

        # Get occurrence data
        temp_file, _ = urllib.urlretrieve(occurrence_url)

        # extract data.csv file into dest
        with zipfile.ZipFile(temp_file) as z:
            os.mkdir(data_dest)

            # rename to ala_occurrence.csv
            z.extract('data.csv', dest)
            os.rename(os.path.join(dest, 'data.csv'), occurrence_csv)

            # citation file is optional: stay best-effort, but leave a trace
            # in the log instead of silently discarding the error
            try:
                z.extract('citation.csv', dest)
                os.rename(os.path.join(dest, 'citation.csv'),
                          os.path.join(data_dest, 'ala_citation.csv'))
            except KeyError:
                LOG.debug("No optional file %s in downloaded zip file",
                          'citation.csv')
            except Exception:
                LOG.warning("Could not extract optional file %s, continuing "
                            "without it", 'citation.csv', exc_info=True)

        lsid_list = _get_species_guid_from_csv(occurrence_csv)

        # Zip out files if available
        zip_occurrence_data(os.path.join(dest, 'ala_occurrence.zip'),
                            data_dest,
                            ['ala_occurrence.csv', 'ala_citation.csv'])
    except KeyError:
        LOG.error("Cannot find file %s in downloaded zip file", 'data.csv',
                  exc_info=True)
        raise
    except zipfile.BadZipfile:
        # TODO: Not a zip file error.... does it have to raise?
        LOG.error("The file downloaded from %s is not a zip file",
                  occurrence_url, exc_info=True)
        raise
    except Exception:
        # Anything else (network, filesystem, ...); do not blame the zip
        # format for errors that have nothing to do with it.
        LOG.error("Failed to download occurrence data from %s",
                  occurrence_url, exc_info=True)
        raise
    finally:
        # the temporary download is never needed once extraction finished
        if temp_file:
            os.remove(temp_file)

    return {'url': os.path.join(dest, 'ala_occurrence.zip'),
            'name': 'ala_occurrence.zip',
            'content_type': 'application/zip',
            'lsids': lsid_list}
def _ala_postprocess(csvzipfile, mdfile, occurrence_url, dest):
    """
    Clean up the occurrence csv file and generate the dataset metadata.

    An occurrence dataset can cover multiple species, i.e. user uploaded
    data. The occurrence zip archive is rebuilt after the csv file has been
    normalised and an 'ala_dataset.json' file is written into dest.

    @param csvzipfile: path of the occurrence zip archive; it is removed and
                       re-created from the 'data' directory next to it
    @type csvzipfile: str
    @param mdfile: path of the species metadata json file (a list of
                   records), or a false value if there is none
    @type mdfile: str
    @param occurrence_url: url the data was downloaded from; stored in the
                           provenance section of the dataset description
    @type occurrence_url: str
    @param dest: directory holding 'data/ala_occurrence.csv'; the
                 'ala_dataset.json' file is written here
    @type dest: str
    @return: description of the written json file with the keys 'url',
             'name' and 'content_type'
    @rtype: dict
    """
    taxon_names = {}
    common_names = []
    if mdfile:
        # Generate dataset .json
        # 1. read mdfile and find interesting bits:
        with open(mdfile) as md_fp:
            sp_metadata = json.load(md_fp)
        for md in sp_metadata:
            # TODO: is this the correct bit? (see plone dataset import )
            # NOTE(review): common names are only collected for records that
            # carry a guid, so they stay paired with taxon_names -- confirm.
            guid = md.get('guid')
            if guid:
                taxon_names[guid] = (md.get('scientificName') or
                                     md.get('name') or
                                     md.get('nameComplete'))
                common_names.append(md.get('commonNameSingle') or
                                    md.get('scientificName'))

    # 2. clean up occurrence csv file and count occurrence points
    csvfile = os.path.join(dest, 'data/ala_occurrence.csv')
    num_occurrences = _normalize_occurrence(csvfile, taxon_names)

    # Rebuild the zip archive file with updated occurrence csv file.
    os.remove(csvzipfile)
    zip_occurrence_data(csvzipfile,
                        os.path.join(os.path.dirname(csvzipfile), 'data'),
                        ['ala_occurrence.csv', 'ala_citation.csv'])

    # 3. generate ala_dataset.json
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    # Records without any usable name yield None; skip those so that the
    # joins below cannot fail with a TypeError.
    common = u', '.join(name for name in common_names if name)
    taxon = u', '.join(name for name in taxon_names.values() if name)
    if common:
        title = u"%s (%s) occurrences" % (common, taxon)
        description = u"Observed occurrences for %s (%s), imported from ALA on %s" % (common, taxon, imported_date)
    elif taxon:
        title = u"%s occurrences" % (taxon)
        description = u"Observed occurrences for %s, imported from ALA on %s" % (taxon, imported_date)
    else:
        # This would be the case where the user dataset does not match to any species in ALA
        # TODO: Use the user supplied name
        title = u"Occurrence for user defined dataset"
        description = u"User defined occurrence dataset, imported on %s" % (imported_date)

    files = [{
        'url': csvzipfile,
        'dataset_type': 'occurrence',
        'size': os.path.getsize(csvzipfile)
    }]
    if mdfile:
        files.append({
            'url': mdfile,
            'dataset_type': 'attribution',
            'size': os.path.getsize(mdfile)
        })

    ala_dataset = {
        'title': title,
        'description': description,
        'num_occurrences': num_occurrences,
        'files': files,
        'provenance': {
            'source': 'ALA',
            'url': occurrence_url,
            'source_date': imported_date
        }
    }

    # Write the dataset to a file; the context manager closes the file even
    # when serialisation fails.
    dataset_path = os.path.join(dest, 'ala_dataset.json')
    with io.open(dataset_path, mode='wb') as f:
        json.dump(ala_dataset, f, indent=2)

    return {'url': dataset_path,
            'name': 'ala_dataset.json',
            'content_type': 'application/json'}
def download_occurrence_from_ala(params, context):
    """
    Download one or more occurrence datasets from ALA and build the item
    description used to import the (possibly combined) dataset.

    Every entry of params is downloaded into its own temporary directory.
    If more than one dataset was downloaded, the occurrence and citation
    csv files are combined into one additional temporary directory and
    zipped there.

    @param params: datasets to download; each entry needs the keys 'url'
                   and 'query' and may carry a 'name'
    @type params: list of dict
    @param context: task context; the email address is read from
                    context['user']['email'] if present
    @type context: dict
    @return: tuple (item, results); item describes the dataset to import,
             results lists every temporary directory created here
             (presumably so the caller can clean them up -- verify)
    @rtype: tuple
    @raise Exception: if params did not yield any download
    """
    results = []
    species = []   # a list of species metadata
    ds_names = []
    # quality assertions handed to the ala:// source as its 'filter' value;
    # constant, so built once instead of once per dataset
    qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"

    for dataset in params:
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']    # i.e. qid:<qid> or lsid:<lsid>
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully download (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        # recorded before the move so the directory is reported even if the
        # download fails part way through
        results.append(tmpdir)

        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING', 'Extract metadata for {0} from ala'.format(dataset['query']), None, context)

        # open ala_dataset.json
        with open(os.path.join(tmpdir, 'ala_dataset.json'), 'r') as ds_fp:
            ala_ds = json.load(ds_fp)
        # collect files inside ds per datatype
        files = {f['dataset_type']: f for f in ala_ds['files']}

        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            with open(files['attribution']['url'], 'r') as md_fp:
                ala_md_list = json.load(md_fp)
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Shall not happen
    if not results:
        raise Exception("No occurrence dataset is downloaded from ALA")

    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    # user supplied dataset names take precedence over generated titles
    ds_name = ', '.join([name for name in ds_names if name])

    # unicode literals below: names may contain non-ascii characters, which a
    # byte string .format() cannot handle under Python 2
    if len(results) > 1:
        # Combine all the occurrence and citation files from each download into 1 dataset
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        # results[:-1] are the per-dataset downloads; the last entry is destdir
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)

        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])

        # Make a title & description for multispecies dataset
        if ds_name:
            title = ds_name
        else:
            # skip records without a scientific name; joining None would fail
            ds_name = ','.join([sp['scientificName'] for sp in species
                                if sp['scientificName']])
            title = u"{} occurrences".format(ds_name)
        description = u"Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
    else:
        if ds_name:
            title = ds_name
            description = u"Observed occurrences for {0}, imported from ALA on {1}".format(ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        # A single download is described by one species record. User uploaded
        # datasets may come without attribution metadata, in which case there
        # is nothing to pick from.
        species = species[0] if species else {}

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }

    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)