def test_gbif_to_file(self, mock_urlretrieve=None, mock_urlopen=None):
    # mock urllib.urlretrieve .... return zip file with data.csv and citation.csv
    mock_urlretrieve.side_effect = self._urlretrieve
    # mock urllib.urlopen ... return gbif_metadata.json
    mock_urlopen.side_effect = self._urlopen
    file_dest = {
        'url': 'file://{}'.format(self.tmpdir)
    }
    move(self.gbif_source, file_dest)
    # Check files are created
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'gbif_dataset.json')))
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'gbif_occurrence.zip')))
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'gbif_metadata.json')))
    # Check file contents
    zf = zipfile.ZipFile(os.path.join(self.tmpdir, 'gbif_occurrence.zip'))
    zf.extractall(self.tmpdir)
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'gbif_metadata.json'),
        pkg_resources.resource_filename(__name__, 'data/gbif_metadata.json')))
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'gbif_occurrence.csv'),
        pkg_resources.resource_filename(__name__, 'data/gbif_occurrence.csv')))
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'gbif_citation.txt'),
        pkg_resources.resource_filename(__name__, 'data/gbif_citation.txt')))
def test_aekos_occurrence_to_file(self, mock_download_as_file=None):
    mock_download_as_file.side_effect = self._download_as_file
    file_dest = {
        'url': 'file://{0}'.format(self.tmpdir)
    }
    move(self.occurrence_source, file_dest)
    # Check that these files are created
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'aekos_metadata.json')))
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'aekos_dataset.json')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'aekos_occurrence.zip')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'data', 'aekos_occurrence.csv')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'data', 'aekos_citation.txt')))
    # Check file content
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'aekos_metadata.json'),
        resource_filename(__name__, 'data/aekos_metadata.json')))
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'aekos_occurrence.csv'),
        resource_filename(__name__, 'data/aekos_occurrence.csv')))
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'aekos_citation.txt'),
        resource_filename(__name__, 'data/aekos_citation.txt')))
def test_http_to_file(self, mock_SessionClass=None):
    mock_session = mock_SessionClass.return_value
    # get mock response
    mock_response = mock_session.get.return_value
    mock_response.iter_content.return_value = ['test content']
    mock_headers = mock_response.headers
    mock_headers.get.return_value = 'text/csv'
    ticket = AuthTkt('ibycgtpw', 'admin')
    cookies = {
        'name': '__ac',
        'value': ticket.ticket(),
        'domain': '',
        'path': '/',
        'secure': True
    }
    http_source = {
        'url': 'http://www.bccvl.org.au/datasets/test.csv',
        'cookies': cookies
    }
    dest_file = os.path.join(self.tmpdir, 'test.csv')
    file_dest = {
        'url': 'file://{}'.format(dest_file)
    }
    move(http_source, file_dest)
    # verify destination file
    self.assertTrue(os.path.exists(dest_file))
    self.assertEqual(open(dest_file).read(), 'test content')
def test_aekos_traits_to_file_multispecies(self, mock_download_as_file=None):
    mock_download_as_file.side_effect = self._download_multispecies
    # multi-species traits source used by this test
    traits_source = {
        'url': 'aekos://traits?speciesName=Abutilon%20fraseri,Abutilon%20halophilum&traitName=height%2ClifeForm&envVarName=aspect%2CelectricalConductivity'
    }
    file_dest = {
        'url': 'file://{}'.format(self.tmpdir)
    }
    move(traits_source, file_dest)
    # Check that these files are created
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'aekos_dataset.json')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'aekos_traits_env.zip')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'data', 'aekos_traits_env.csv')))
    self.assertTrue(os.path.exists(os.path.join(
        self.tmpdir, 'data', 'aekos_citation.csv')))
    # Check file content
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'aekos_traits_env.csv'),
        resource_filename(__name__, 'data/aekos_traits_env_multispecies.csv')))
    self.assertTrue(filecmp.cmp(
        os.path.join(self.tmpdir, 'data', 'aekos_citation.csv'),
        resource_filename(__name__, 'data/aekos_citation_multispecies.csv')))
def pull_occurrences_from_ala(params, dest_url, context, import_multspecies_params):
    # 1. set progress
    set_progress('RUNNING', 'Download occurrence dataset from ala', None, context)
    # 2. Download all the occurrence datasets in the params list
    results = []
    try:
        item, results = download_occurrence_from_ala(params, context)
        # This is the zip file path of the occurrence dataset
        ala_csv = item.get('file').get('url').split('file://')[1]

        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Any better solution?
        occurrence_csv_filename = os.path.join('data', 'ala_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            #        compatibility... this should go away after we fully
            #        support 'layered' occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species'?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]

        # move data file to destination and build data_url
        src = build_source('file://{}'.format(ala_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(ala_csv)),
                                app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)

        # tell importer about new dataset (import it)
        set_progress("RUNNING",
                     u"Import dataset '{0}' from ALA".format(item['title']),
                     None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED",
            u"Import of dataset '{0}' from ALA failed".format(item['title']),
            None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            "COMPLETED",
            u"ALA import '{}' complete".format(item['title']),
            None, context)

        # Split multi-species dataset
        if import_multspecies_params:
            import_multispecies_job = import_multi_species_csv_job(
                item.get('file').get('url'),
                import_multspecies_params['results_dir'],
                import_multspecies_params['import_context'],
                context)
            import_multispecies_job.link_error(set_progress_job(
                "FAILED",
                u"Split multi-species dataset '{0}' from ALA failed".format(item['title']),
                None, context))
            import_multispecies_job.link_error(cleanup_job)
            (import_job | import_multispecies_job | cleanup_job | finish_job).delay()
        else:
            (import_job | cleanup_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Download occurrence dataset from ALA: {}'.format(e),
                     None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', params, dest_url, e,
                  exc_info=True)
    finally:
        for tmpdir in results:
            if tmpdir and os.path.exists(tmpdir):
                shutil.rmtree(tmpdir)
def test_file_to_file(self, mock_copy=None):
    file_dest = {
        'url': 'file://{}'.format(self.tmpdir)
    }
    move(self.file_source, file_dest)
    # verify destination file
    dest_file = os.path.join(self.tmpdir, 'test.csv')
    self.assertTrue(os.path.exists(dest_file))
    self.assertEqual(open(dest_file).read(),
                     pkg_resources.resource_string(__name__, 'data/test.csv'))
def get_files(urllist, userid, conf):
    """
    Download all files given in urllist to local tempfiles and
    return the temp folder location.
    """
    dest = tempfile.mkdtemp(prefix='bccvl_export')
    for url in urllist:
        src = build_source(url, userid, conf)
        dst = build_destination(
            'file://{0}/{1}'.format(dest, os.path.basename(url)), conf)
        movelib.move(src, dst)
    return dest
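A minimal usage sketch of get_files, assuming build_source, build_destination and movelib.move are importable as in the other snippets in this section; the URLs and user id below are hypothetical.

import shutil

urllist = [
    'https://example.org/results/model.RData',      # hypothetical URL
    'https://example.org/results/projection.tif',   # hypothetical URL
]
tmp = get_files(urllist, userid='some-user-id', conf={})
try:
    pass  # work with the downloaded copies inside tmp
finally:
    # get_files leaves the temp folder behind, so the caller cleans it up
    shutil.rmtree(tmp)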
def download_input(move_args):
    src, dst = move_args['args']
    try:
        # set up the source and destination
        source = build_source(src, move_args['userid'],
                              app.conf.get('bccvl', {}))
        destination = build_destination(dst)
        move(source, destination)
    except Exception as e:
        LOG.info('Download from %s to %s failed: %s', src, dst, e)
        raise
    LOG.info('Download from %s to %s succeeded.', src, dst)
def move(move_args, context):
    errmsgs = []
    for src, dest in move_args:
        try:
            source = build_source(src, context['user']['id'],
                                  app.conf.get('bccvl', {}))
            destination = build_destination(dest, app.conf.get('bccvl', {}))
            movelib.move(source, destination)
        except Exception as e:
            msg = 'Download from {0} to {1} failed: {2}'.format(src, dest, str(e))
            errmsgs.append(msg)
            LOG.warn(msg)
    if errmsgs:
        raise Exception('Move data failed', errmsgs)
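A hedged sketch of how this task might be invoked, based on how it reads its arguments above: move_args is a list of (source, destination) URL pairs and context carries the requesting user's id. The URLs and user id are made up for illustration.

move_args = [
    ('swift+https://swift.example.org/v1/AUTH_xyz/results/output.zip',  # hypothetical
     'file:///tmp/job/output.zip'),
    ('https://example.org/data/layer.tif',                              # hypothetical
     'file:///tmp/job/layer.tif'),
]
context = {'user': {'id': 'some-user-id'}}
# Failures are collected per pair; an Exception('Move data failed', [...])
# is raised at the end if any pair could not be moved.
move(move_args, context)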
def upload_outputs(args):
    src, dest, item = args
    try:
        # set up the source and destination (src is a local file)
        source = build_source(src)
        # TODO: add content_type to destination? (move_lib supports it)
        destination = build_destination(dest, app.conf.get('bccvl', {}))
        # Upload the file and then generate metadata
        move(source, destination)
        LOG.info('Upload from %s to %s succeeded.', src, dest)
        item['file']['failed'] = False
    except Exception:
        LOG.info('Upload from %s to %s failed', src, dest)
        item['file']['failed'] = True
def test_swift_to_swift(self, mock_SwiftService=None):
    mock_swiftservice = mock_SwiftService.return_value
    mock_swiftservice.upload.return_value = [{'success': True}]  # simulate successful upload
    mock_swiftservice.download.side_effect = self._swift_download
    move(self.swift_source, self.swift_dest)
    mock_SwiftService.assert_has_calls([
        # init SwiftService
        mock.call(mock.ANY),
        mock.call().download('container2', ['test/test2.txt'],
                             {'out_file': mock.ANY}),
        # init SwiftService
        mock.call(mock.ANY),
        # TODO: mock.ANY here is a SwiftUploadObject, can we verify that in
        #       more detail? like object name etc...
        mock.call().upload('container2', [mock.ANY]),
    ])
def test_swift_to_file(self, mock_SwiftService=None):
    mock_swiftservice = mock_SwiftService.return_value
    mock_swiftservice.download.side_effect = self._swift_download
    file_dest = {
        'url': 'file://{}'.format(self.tmpdir)
    }
    move(self.swift_source, file_dest)
    dest_file = os.path.join(self.tmpdir, 'test2.txt')
    mock_SwiftService.assert_has_calls([
        # init SwiftService
        mock.call(mock.ANY),
        mock.call().download('container2', ['test/test2.txt'],
                             {'out_file': dest_file}),
    ])
    # assert dest file?
    self.assertTrue(os.path.exists(dest_file))
    self.assertEqual(open(dest_file).read(), 'test content')
def update_metadata(url, filename, contenttype, context):
    tmpdir = None
    try:
        set_progress('RUNNING', 'Download {0}'.format(url), None, context)
        tmpdir = tempfile.mkdtemp()
        tmpfile = '{}/{}'.format(tmpdir, filename)
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpfile), settings)
        movelib.move(src, dst)
        item = {
            'filemetadata': extract_metadata(tmpfile, contenttype)
        }
        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if contenttype == 'text/csv':
            if ('headers' not in item['filemetadata']
                    or 'lat' not in item['filemetadata']['headers']
                    or 'lon' not in item['filemetadata']['headers']):
                raise Exception("Missing 'lat'/'lon' column")
        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url), None, context)
        import_job = import_file_metadata_job([item], url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))
        finish_job = set_progress_job(
            "COMPLETED", 'Metadata update for {} complete'.format(url),
            None, context)
        (import_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Metadata update for {} failed: {}'.format(url, e),
                     None, context)
        LOG.error('Metadata update for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
def test_ala_qid_to_file(self, mock_urlretrieve=None):
    # mock urllib.urlretrieve .... return zip file with data.csv and citation.csv
    # mock urllib.urlretrieve ... return ala_metadata.json
    mock_urlretrieve.side_effect = self._urlretrieve
    occurrence_url = "http://biocache.ala.org.au/ws/occurrences/index/download"
    query = "qid:urn:lsid:biodiversity.org.au:afd.taxon:31a9b8b8-4e8f-4343-a15f-2ed24e0bf1ae"
    qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
    email = "*****@*****.**"
    src_url = 'ala://ala?url={}&query={}&filter={}&email={}'.format(
        occurrence_url, query, qfilter, email)
    file_dest = {
        'url': 'file://{}'.format(self.tmpdir)
    }
    move({'url': src_url}, file_dest)
    # verify ala calls?
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'ala_dataset.json')))
    self.assertTrue(os.path.exists(
        os.path.join(self.tmpdir, 'ala_occurrence.zip')))
def test_ala_utf8_move(self, mock_occur, mock_md):
    def fetch_occur_data(download_url, dest):
        occur_file = os.path.join(dest, 'ala_occurrence.zip')
        shutil.copyfile(resource_filename(__name__, 'data.zip'), occur_file)
        # FIXME: ala.py exploits a side effect; the zip is being created in
        #        _download_metadata_for_lsid, but other methods in the module
        #        rely on the unpacked zip being available
        # FIXME: ala.py also re-zips inside _ala_postprocess again
        with zipfile.ZipFile(occur_file) as z:
            z.extractall(dest)
        return {
            'url': occur_file,
            'name': 'ala_occurrence.zip',
            'content_type': 'application/zip'
        }

    def fetch_meta_data(lsid_list, dest):
        metadata_file = os.path.join(dest, 'ala_metadata.json')
        shutil.copyfile(resource_filename(__name__, 'data.json'), metadata_file)
        return {
            'url': metadata_file,
            'name': 'ala_metadata.json',
            'content_type': 'application/json'
        }

    mock_occur.side_effect = fetch_occur_data
    mock_md.side_effect = fetch_meta_data
    tmpdir = tempfile.mkdtemp()
    try:
        occurrence_url = "http://biocache.ala.org.au/ws/occurrences/index/download"
        query = "lsid:urn:lsid:biodiversity.org.au:apni.taxon:262359"
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = "*****@*****.**"
        src_url = 'ala://ala?url={}&query={}&filter={}&email={}'.format(
            occurrence_url, query, qfilter, email)
        movelib.move({'url': src_url}, {'url': 'file://{}'.format(tmpdir)})
        self.assertEqual(mock_occur.call_count, 1)
        self.assertEqual(mock_md.call_count, 1)
        dl_list = os.listdir(tmpdir)
        # FIXME: data should not be there
        self.assertEqual(set(dl_list),
                         set(['ala_occurrence.zip', 'ala_dataset.json',
                              'ala_metadata.json', 'data']))
    finally:
        shutil.rmtree(tmpdir)
def worker(self, move_job):
    """
    Thread worker used to perform a move of data between endpoints.
    @param move_job: The move job to execute
    @type move_job: MoveJob
    """
    try:
        # Need to handle a list of sources
        self._logger.info("Starting move for job with id %s", move_job.id)
        move_job.update(status=MoveJob.STATUS_IN_PROGRESS,
                        start_timestamp=datetime.datetime.now())
        # source can be just 1 source or a list of sources
        if isinstance(move_job.source, str):
            sourcelist = [move_job.source]
        elif isinstance(move_job.source, list):
            sourcelist = move_job.source
        else:
            raise Exception("Invalid source {0}".format(move_job.source))

        # Validate the destination url
        dest_url = urlparse(move_job.destination)
        if dest_url.scheme in ("swift+http", "swift+https") and not self._has_credential():
            raise Exception("Credential for Nectar swift service is not configured.")

        # Download all the files from the sources to the destination
        destination = build_destination(move_job.destination, self._config)
        for s in sourcelist:
            source = build_source(s, move_job.userid, self._config)
            movelib.move(source, destination)

        move_job.update(status=MoveJob.STATUS_COMPLETE,
                        end_timestamp=datetime.datetime.now())
    except Exception as e:
        # catch any Exception here so that we can properly update the job state
        reason = "Move has failed for job with id {0}. Reason: {1}".format(
            move_job.id, str(e))
        self._logger.warning(reason)
        move_job.update(status=MoveJob.STATUS_FAILED,
                        end_timestamp=datetime.datetime.now(),
                        reason=reason)
def import_multi_species_csv(url, results_dir, import_context, context):
    # url .... source file
    # results_dir ... folder to place split files into
    # context ... the context with user and orig dataset
    tmpdir = None
    try:
        set_progress('RUNNING', 'Split {0}'.format(url), None, context)
        # step 1: update main dataset metadata
        tmpdir = tempfile.mkdtemp()
        userid = context.get('user', {}).get('id')
        settings = app.conf.get('bccvl', {})
        src = build_source(url, userid, settings)
        dst = build_destination('file://{}'.format(tmpdir), settings)
        movelib.move(src, dst)
        # Get the downloaded filename
        tmpfile = glob.glob(os.path.join(tmpdir, '*'))[0]
        # Extract occurrence file from downloaded file
        mimetype, enc = mimetypes.guess_type(tmpfile)
        if mimetype == 'application/zip':
            src_occ_data = os.path.join('data', 'ala_occurrence.csv')
            with zipfile.ZipFile(tmpfile, 'r') as zipf:
                occfile = os.path.join(tmpdir, src_occ_data)
                zipf.extract(src_occ_data, tmpdir)
            item = {
                'filemetadata': extract_metadata(tmpfile, 'application/zip')
            }
            occmd = item['filemetadata'].get(src_occ_data, {}).get('metadata', {})
        else:
            # csv file
            item = {
                'filemetadata': extract_metadata(tmpfile, "text/csv")
            }
            occfile = tmpfile
            occmd = item['filemetadata']
        # Check that there are lon and lat columns
        # if upload is of type csv, we validate column names as well
        if ('headers' not in occmd
                or 'lat' not in occmd['headers']
                or 'lon' not in occmd['headers']):
            raise Exception("Missing 'lat'/'lon' column")
        set_progress('RUNNING',
                     'Import metadata for {0}'.format(url), None, context)
        import_md_job = import_file_metadata_job([item], url, context)
        import_md_job.link_error(set_progress_job(
            "FAILED", "Metadata update failed for {0}".format(url),
            None, context))

        # step 2: split csv file and create sub datasets
        # start reading csv file and create new datasets which will be
        # linked up with dataset collection item
        # FIXME: large csv files should be streamed to separate files (not read
        #        into ram like here)
        f = io.open(occfile, 'r', encoding='utf-8', errors='ignore')
        csvreader = UnicodeCSVReader(f)
        headers = csvreader.next()
        if 'species' not in headers:
            raise Exception('missing species column')
        speciesidx = headers.index('species')
        # create dict with all data .... species column used as key, and rest
        # is just added
        data = {}
        for row in csvreader:
            if not row:
                continue
            species = row[speciesidx]
            if species not in data:
                # create new entry for species
                fname = u'{0}.csv'.format(species).replace(
                    u'/', u'_').encode('idna')
                # TODO: make sure fname contains only legal filename characters
                fpath = os.path.join(tmpdir, fname)
                file = io.open(fpath, 'wb')
                fwriter = UnicodeCSVWriter(file)
                fwriter.writerow(headers)
                data[species] = {
                    'file': file,
                    'writer': fwriter,
                    'path': fpath,
                    'name': fname
                }
            data[species]['writer'].writerow(row)
        # ok we have got all data and everything in separate files
        # close all files
        for species in data:
            data[species]['file'].close()
            del data[species]['file']
            del data[species]['writer']
        # extract metadata
        for species in data:
            data[species]['filemetadata'] = extract_metadata(
                data[species]['path'],
                'text/csv'
            )
        # send files to destination
        for species in data:
            src = build_source('file://{}'.format(data[species]['path']))
            dst = build_destination(
                os.path.join(results_dir, data[species]['name']),
                app.conf.get('bccvl', {}))
            data[species]['url'] = dst['url']
            movelib.move(src, dst)
        # all files uploaded .... send import jobs
        set_progress('RUNNING',
                     'Create datasets for {0}'.format(url), None, context)
        items = []
        for species in data:
            # build item
            item = {
                'title': u'{0} occurrences'.format(species),
                'description': '',
                'file': {
                    'url': data[species]['url'],
                    'filename': data[species]['name'],
                    'contenttype': 'text/csv',
                },
                'bccvlmetadata': {
                    'genre': 'DataGenreSpeciesOccurrence',
                    'categories': ['occurrence'],
                    'species': {
                        'scientificName': species,
                    }
                },
                'filemetadata': data[species]['filemetadata'],
                '_partof': {
                    # add back reference to orig dataset
                    # TODO: shouldn't use absolute path here
                    'path': context['context']
                }
            }
            items.append(item)
        # start import process
        start_import = set_progress_job(
            'RUNNING', 'Import results', None, context)
        # What is results_dir being used for?
        import_job = import_result_job(items, results_dir, import_context)
        cleanup_job = import_cleanup_job(results_dir, context)
        import_job.link_error(set_progress_job(
            'FAILED', 'Multi species import failed', None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            'COMPLETED', 'Task succeeded', None, context)
        (start_import | import_md_job | import_job | cleanup_job | finish_job).delay()
        # FIXME: missing stuff...
        #        need to set multi species collection to finished at some stage
    except Exception as e:
        set_progress('FAILED',
                     'Error while splitting Multi Species CSV {}: {}'.format(url, e),
                     None, context)
        LOG.error('Multi species split for %s failed: %s', url, e, exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
def pull_occurrences_from_gbif(lsid, dest_url, context):
    # 1. set progress
    set_progress('RUNNING', 'Download {0} from gbif'.format(lsid), None, context)
    # 2. do move
    src = None
    dst = None
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp(prefix='gbif_download_')
        src = build_source('gbif://gbif?lsid={}'.format(lsid))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)
        # extract metadata and do other stuff....
        set_progress('RUNNING',
                     'Extract metadata {0} from gbif'.format(lsid), None, context)
        # open gbif_dataset.json
        gbif_ds = json.load(
            open(os.path.join(tmpdir, 'gbif_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in gbif_ds['files']))
        # read gbif metadata from attribution file
        gbif_md = json.load(open(files['attribution']['url'], 'r'))
        gbif_csv = files['occurrence']['url']
        # build bccvl metadata:
        bccvlmd = {
            'genre': 'DataGenreSpeciesOccurrence',
            'categories': ['occurrence'],
            'species': {
                'scientificName': gbif_md.get('scientificName', None),
                'vernacularName': gbif_md.get('vernacularName', None),
                'taxonID': gbif_md.get('key', None),
                'rank': gbif_md.get('rank', None),
                'genus': gbif_md.get('genus', None),
                'genusGuid': gbif_md.get('genusKey', None),
                'family': gbif_md.get('family', None),
                'familyGuid': gbif_md.get('familyKey', None),
                'order': gbif_md.get('order', None),
                'orderGuid': gbif_md.get('orderKey', None),
                'clazz': gbif_md.get('class', None),
                'clazzGuid': gbif_md.get('classKey', None),
                'phylum': gbif_md.get('phylum', None),
                'phylumGuid': gbif_md.get('phylumKey', None),
                'kingdom': gbif_md.get('kingdom', None),
                'kingdomGuid': gbif_md.get('kingdomKey', None)
            },
        }
        # build item to import
        item = {
            'title': gbif_ds['title'],
            'description': gbif_ds['description'],
            'file': {
                'url': 'file://{}'.format(gbif_csv),  # local file url
                'contenttype': 'application/zip',
                'filename': os.path.basename(gbif_csv)
            },
            'bccvlmetadata': bccvlmd,
            'filemetadata': extract_metadata(gbif_csv, 'application/zip'),
        }
        # Add the number of occurrence records to the metadata
        # TODO: This is a hack. Any better solution?
        occurrence_csv_filename = os.path.join('data', 'gbif_occurrence.csv')
        if occurrence_csv_filename in item['filemetadata']:
            # FIXME: copy all occurrence metadata to zip level, for backwards
            #        compatibility... this should go away after we fully
            #        support 'layered' occurrence zips.
            for key in ('rows', 'headers', 'bounds'):  # what about 'species'?
                if key in item['filemetadata'][occurrence_csv_filename]['metadata']:
                    item['filemetadata'][key] = item['filemetadata'][occurrence_csv_filename]['metadata'][key]
        # move data file to destination and build data_url
        src = build_source('file://{}'.format(gbif_csv))
        dst = build_destination(os.path.join(dest_url, os.path.basename(gbif_csv)),
                                app.conf.get('bccvl', {}))
        item['file']['url'] = dst['url']
        movelib.move(src, dst)
        # tell importer about new dataset (import it)
        set_progress('RUNNING',
                     'Import gbif data {0}'.format(lsid), None, context)
        cleanup_job = import_cleanup_job(dest_url, context)
        import_job = import_ala_job([item], dest_url, context)
        import_job.link_error(set_progress_job(
            "FAILED", "Import of gbif data failed {0}".format(lsid),
            None, context))
        import_job.link_error(cleanup_job)
        finish_job = set_progress_job(
            "COMPLETED", 'GBIF import {} complete'.format(lsid), None, context)
        (import_job | cleanup_job | finish_job).delay()
    except Exception as e:
        set_progress('FAILED',
                     'Download {0} from gbif: {1}'.format(lsid, e), None, context)
        import_cleanup(dest_url, context)
        LOG.error('Download from %s to %s failed: %s', src, dest_url, e,
                  exc_info=True)
    finally:
        if tmpdir and os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
def export_to_ala(self):
    uuid = self.request.form.get('uuid', None)
    try:
        if uuid:
            brain = uuidToCatalogBrain(uuid)
            if brain is None:
                raise Exception("Brain not found")
            obj = brain.getObject()
        else:
            obj = self.context

        # get username
        member = ploneapi.user.get_current()
        if member.getId():
            user = {
                'id': member.getUserName(),
                'email': member.getProperty('email'),
                'fullname': member.getProperty('fullname')
            }
        else:
            raise Exception("Invalid user")

        # verify dataset
        if obj.portal_type not in (
                'org.bccvl.content.dataset',
                'org.bccvl.content.remotedataset',
                'org.bccvl.content.multispeciesdataset'):
            raise Exception("Invalid UUID (content type)")
        md = IBCCVLMetadata(obj)
        if md.get('genre') not in ('DataGenreSpeciesOccurrence',
                                   'DataGenreSpeciesCollection',
                                   'DataGenreTraits'):
            raise Exception("Invalid UUID (data type)")
        # get download url
        dlinfo = IDownloadInfo(obj)

        # download file
        from org.bccvl import movelib
        from org.bccvl.movelib.utils import build_source, build_destination
        import tempfile
        destdir = tempfile.mkdtemp(prefix='export_to_ala')
        try:
            from org.bccvl.tasks.celery import app
            settings = app.conf.get('bccvl', {})
            dest = os.path.join(destdir, os.path.basename(dlinfo['url']))
            movelib.move(build_source(dlinfo['url'], user['id'], settings),
                         build_destination('file://{}'.format(dest)))
            csvfile = None
            if dlinfo['contenttype'] == 'application/zip':
                # look at 'layers' to find the file within the zip
                arc = md['layers'].keys()[0]
                import zipfile
                zf = zipfile.ZipFile(dest, 'r')
                csvfile = zf.open(arc, 'r')
            else:
                csvfile = open(dest, 'rb')
            import requests
            # "Accept:application/json" "Origin:http://example.com"
            res = requests.post(settings['ala']['sandboxurl'],
                                files={'file': csvfile},
                                headers={
                                    'apikey': settings['ala']['apikey'],
                                    'Accept': 'application/json'
                                })
            if res.status_code != 200:
                self.record_error(res.reason, res.status_code)
                raise Exception('Upload failed')
            retval = res.json()
            # TODO: do error checking
            #       keys: sandboxUrl, fileName, message, error: Bool, fileId
            return retval
        finally:
            import shutil
            shutil.rmtree(destdir)
    except Exception as e:
        self.record_error(str(e), 500)
        raise
def download_occurrence_from_ala(params, context):
    results = []
    species = []  # a list of species metadata
    ds_names = []
    for dataset in params:
        src = None
        dst = None
        occurrence_url = dataset['url'].rstrip('/') + "/occurrences/index/download"
        query = dataset['query']  # i.e. qid:<qid> or lsid:<lsid>
        qfilter = "zeroCoordinates,badlyFormedBasisOfRecord,detectedOutlier,decimalLatLongCalculationFromEastingNorthingFailed,missingBasisOfRecord,decimalLatLongCalculationFromVerbatimFailed,coordinatesCentreOfCountry,geospatialIssue,coordinatesOutOfRange,speciesOutsideExpertRange,userVerified,processingError,decimalLatLongConverionFailed,coordinatesCentreOfStateProvince,habitatMismatch"
        email = context.get('user', {}).get('email', '')
        ds_names.append(dataset.get('name', ''))

        # download occurrence file
        # TODO: ignore file if not successfully downloaded (exception), but continue??
        tmpdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(tmpdir)
        src = build_source('ala://ala?url={}&query={}&filter={}&email={}'.format(
            occurrence_url, query, qfilter, email))
        dst = build_destination('file://{}'.format(tmpdir))
        movelib.move(src, dst)

        # extract metadata and do other stuff....
        set_progress('RUNNING',
                     'Extract metadata for {0} from ala'.format(dataset['query']),
                     None, context)
        # open ala_dataset.json
        ala_ds = json.load(open(os.path.join(tmpdir, 'ala_dataset.json'), 'r'))
        # collect files inside ds per datatype
        files = dict(((f['dataset_type'], f) for f in ala_ds['files']))
        # occurrence data file
        ala_csv = files['occurrence']['url']  # this is actually a zip file now

        # read ala metadata from attribution file.
        # May not have metadata for user uploaded dataset into sandbox
        if files.get('attribution'):
            ala_md_list = json.load(open(files['attribution']['url'], 'r'))
            for md in ala_md_list:
                species.append({
                    'scientificName': md.get('scientificName'),
                    'vernacularName': md.get('commonNameSingle') or md.get('scientificName'),
                    'taxonID': md.get('guid'),
                    'rank': md.get('rank'),
                    'genus': md.get('genus'),
                    'family': md.get('family'),
                    'order': md.get('order'),
                    'clazz': md.get('classs'),
                    'phylum': md.get('phylum'),
                    'kingdom': md.get('kingdom')
                })

    # Shall not happen
    if len(results) == 0:
        raise Exception("No occurrence dataset is downloaded from ALA")

    # Combine all the occurrence and citation files from each download into 1 dataset
    imported_date = datetime.datetime.now().strftime('%d/%m/%Y')
    if len(results) > 1:
        destdir = tempfile.mkdtemp(prefix='ala_download_')
        results.append(destdir)
        os.mkdir(os.path.join(destdir, 'data'))
        combine_csv(results[:-1], 'data/ala_occurrence.csv', destdir)
        combine_csv(results[:-1], 'data/ala_citation.csv', destdir)
        # Zip it out and point to the new zip file
        ala_csv = os.path.join(destdir, 'ala_occurrence.zip')
        zip_occurrence_data(ala_csv,
                            os.path.join(destdir, 'data'),
                            ['ala_occurrence.csv', 'ala_citation.csv'])
        # Make a title & description for multispecies dataset
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
        else:
            ds_name = ','.join([sp['scientificName'] for sp in species])
            title = "{} occurrences".format(ds_name)
        description = "Observed occurrences for {0}, imported from ALA on {1}".format(
            ds_name, imported_date)
    else:
        ds_name = ', '.join([name for name in ds_names if name])
        if ds_name:
            title = ds_name
            description = "Observed occurrences for {0}, imported from ALA on {1}".format(
                ds_name, imported_date)
        else:
            title = ala_ds['title']
            description = ala_ds['description']
        species = species[0]

    # build bccvl metadata:
    bccvlmd = {
        'genre': 'DataGenreSpeciesOccurrence',
        'categories': ['occurrence'],
        'species': species
    }
    # build item to import
    item = {
        'title': title,
        'description': description,
        'file': {
            'url': 'file://{}'.format(ala_csv),  # local file url
            'contenttype': 'application/zip',
            'filename': os.path.basename(ala_csv)
        },
        'bccvlmetadata': bccvlmd,
        'filemetadata': extract_metadata(ala_csv, 'application/zip'),
    }
    return (item, results)
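A hedged sketch of the inputs download_occurrence_from_ala expects, inferred from how the function reads them above; the biocache URL, query and user details are hypothetical.

params = [
    {
        'url': 'https://biocache.ala.org.au/ws',   # hypothetical biocache base URL
        'query': 'lsid:urn:lsid:biodiversity.org.au:apni.taxon:262359',
        'name': 'Example occurrences',             # optional dataset name
    },
]
context = {'user': {'id': 'some-user-id', 'email': 'user@example.org'}}
item, tmpdirs = download_occurrence_from_ala(params, context)
# the caller is expected to remove the returned temp directories,
# as pull_occurrences_from_ala does in its finally block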
def add(self, object):
    # FIXME: this is a workaround, which is fine for small uploaded files.
    #        large uploads should go through another process anyway
    # TODO: re implementing this method is the only way to know
    #       the full path of the object. We need the path to apply
    #       the transmogrifier chain.
    # fti = getUtility(IDexterityFTI, name=self.portal_type)
    container = aq_inner(self.context)
    try:
        # traverse to subfolder if possible
        container = container.restrictedTraverse('/'.join(self.subpath))
    except Exception as e:
        LOG.warn('Could not traverse to %s/%s',
                 '/'.join(container.getPhysicalPath()), '/'.join(self.subpath))
    new_object = addContentToContainer(container, object)
    # set data genre:
    if self.datagenre:
        IBCCVLMetadata(new_object)['genre'] = self.datagenre
    if self.categories:
        IBCCVLMetadata(new_object)['categories'] = self.categories
    new_object.subject = []
    if self.domain:
        new_object.subject = [self.domain]
    if self.timeperiod:
        new_object.subject += self.timeperiod
    # rdf commit should happen in transmogrifier step later on
    # if fti.immediate_view:
    #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
    # else:
    #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
    # start background import process (just a metadata update)
    # run transmogrify md extraction here
    context_path = '/'.join(new_object.getPhysicalPath())
    member = api.user.get_current()
    # species extract task
    if IMultiSpeciesDataset.providedBy(new_object):
        # kick off csv split import tasks
        import_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
            kwargs={
                'url': '{}/@@download/file/{}'.format(
                    new_object.absolute_url(), new_object.file.filename),
                'results_dir': get_results_dir(new_object, self.request,
                                               childSpecies=True),
                'import_context': {
                    'context': '/'.join(container.getPhysicalPath()),
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                },
                'context': {
                    'context': context_path,
                    'genre': self.datagenre,
                    'dataSource': new_object.dataSource,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        after_commit_task(import_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: import_multi_species_csv',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Multi species import pending')
    else:
        if hasattr(self, '_upload'):
            file = self._upload['file']
            new_object.format = file.contentType
            uid = IUUID(new_object)
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            import os.path
            swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=uid,
                name=os.path.basename(file.filename))
            new_object.remoteUrl = swift_url
        else:
            file = new_object.file
            new_object.format = file.contentType
        dlinfo = IDownloadInfo(new_object)
        # single species upload
        update_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.update_metadata",
            kwargs={
                'url': dlinfo['url'],
                'filename': dlinfo['filename'],
                'contenttype': dlinfo['contenttype'],
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        # create upload task in case we upload to external store
        if hasattr(self, '_upload'):
            # FIXME: we can't use ssh here.... we don't know which container
            #        we are in... and sshing here is bad as well....
            # There is an upload ... we have to make sure the uploaded data
            # ends up in external storage
            # 3. put temp file aside
            tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
            tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
            blobf = file.open()
            try:
                # try rename
                os.rename(blobf.name, tmpfile)
            except OSError:
                # try copy
                shutil.copy(blobf.name, tmpfile)
            # TODO: we push the uploaded file directly to swift here..
            #       this really should be a background process
            # best solution: ...
            #   user uploads to some temporary upload service (file never ends up here)
            #   we have a remote url here, and tell the datamover to pull it from there
            #   and move it to final destination. (or something like this)
            # other good way: ...
            #   let user upload directly to swift (what about large file uploads?)
            #   and take care of clean up if necessary
            # 4. move file to swift
            # TODO: do we have enough information to upload to swift?
            #       need a temp url?
            swiftopts = app.conf.get('bccvl', {}).get('swift', {})
            src_url = build_source('file://{}'.format(tmpfile))
            dest_url = build_destination(
                'swift+{}'.format(new_object.remoteUrl),
                settings={
                    'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_project_name': swiftopts.get('os_project_name'),
                        'os_storage_url': swiftopts.get('os_storage_url'),
                        'os_user_domain_name': swiftopts.get('os_user_domain_name'),
                        'os_project_domain_name': swiftopts.get('os_project_domain_name'),
                        'auth_version': swiftopts.get('auth_version')
                    }
                })
            try:
                movelib.move(src_url, dest_url)
            except Exception as e:
                # do error handling here
                raise
            finally:
                # clean up temp location
                path = os.path.dirname(tmpfile)
                shutil.rmtree(path)
        # queue job submission
        after_commit_task(update_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: update_metadata',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Metadata update pending')
    # We have to reindex after updating the object
    new_object.reindexObject()
def fetch_file(request, url):
    """Download the file from url and place it on the local file system.
    If the file is a zip file it will be extracted to the local file system.

    The method returns the filename of the requested file on the
    local file system.
    """
    # TODO: optimize data files for mapserver?
    #       reproject/warp source? to avoid mapserver doing warp on the fly
    # other options:
    #   convert to tiled raster (makes access to tiles faster)
    #     gdal_translate -co TILED=YES original.tif tiled.tif
    #   use Erdas Imagine (HFA) format ... always tiled and supports >4GB files
    #     gdal_translate -of HFA original.tif tiled.img
    #   add overview image to raster (after possible translate)
    #     gdaladdo [-r average] tiled.tif 2 4 8 16 32 64 128
    #   for rs point data maybe convert to shapefile?
    if not (url.startswith('http://') or url.startswith('https://')):
        # TODO: probably allow more than just http and https
        #       and use better exception
        raise Exception('unsupported url scheme: %s', url)

    # Check if a local data file already exists
    datadir = data_dir(request, url)
    url, fragment = urlparse.urldefrag(url)
    # FIXME: have to import here due to circular import
    from pyramid.settings import asbool
    with LockFile(datadir + '.lock'):
        if not os.path.exists(datadir):
            # the folder doesn't exist so we'll have to fetch the file
            # TODO: make sure there is no '..' in datadir
            os.makedirs(datadir)
            # not available yet so fetch it
            try:
                settings = request.registry.settings
                destfile = os.path.join(datadir, os.path.basename(url))
                try:
                    src = {
                        'url': url,
                        'verify': asbool(settings.get('bccvl.ssl.verify', True))
                    }
                    # do we have an __ac cookie?
                    cookie = request.cookies.get('__ac')
                    # get my tokens
                    tokens = ','.join([
                        token.strip()
                        for token in settings.get('authtkt.tokens', '').split('\n')
                        if token.strip()
                    ])
                    if cookie:
                        src['cookies'] = {
                            'name': '__ac',
                            'value': update_auth_cookie(cookie, tokens, request),
                            'secure': True,
                            'domain': request.host,
                            'path': '/'
                        }
                    dst = {'url': u'file://{0}'.format(destfile)}
                    movelib.move(src, dst)
                except Exception as e:
                    # direct download failed what now?
                    LOG.exception('Failed to download data %s: %s', url, e)
                    raise
                # if it is a zip we should unpack it
                # FIXME: do some more robust zip detection
                if 'application/zip' in mimetypes.guess_type(destfile):
                    with zipfile.ZipFile(destfile, 'r') as zipf:
                        zipf.extractall(datadir)
                    # remove zipfile
                    os.remove(destfile)
                # search all tifs and try to generate overviews
                for root, dirnames, filenames in os.walk(datadir):
                    for filename in fnmatch.filter(filenames, '*.tif'):
                        rasterfile = os.path.join(root, filename)
                        ds = gdal.Open(rasterfile)
                        if ds:
                            maxlevel = min(ds.RasterXSize, ds.RasterYSize) / 512
                            ovrclear = ['gdaladdo', '-clean', rasterfile]
                            ovradd = ['gdaladdo', '-ro',
                                      # '--config', 'COMPRESS_OVERVIEW', 'LZW',
                                      rasterfile,
                                      ]
                            level = 2
                            while level < maxlevel:
                                ovradd.append(str(level))
                                level = level * 2
                            if maxlevel > 2:
                                subprocess.check_call(ovrclear)
                                subprocess.check_call(ovradd)
            except Exception as e:
                LOG.error('Could not download %s to %s : %s', url, datadir, e)
                shutil.rmtree(datadir)
                raise e
    # we have the data now construct the filepath
    filename = fragment if fragment else os.path.basename(url)
    # FIXME: make sure path.join works correctly (trailing/leading slash?)
    filename = os.path.join(datadir, filename)
    # make sure filename is within datadir
    filename = os.path.normpath(filename)
    if not os.path.normpath(filename).startswith(datadir):
        # FIXME: should probably check if filename exists and is supported
        #        and use better exception here
        raise Exception("Data file path not valid: '%s'", filename)
    return filename
def export_to_ala(self):
    uuid = self.request.form.get("uuid", None)
    try:
        if uuid:
            brain = uuidToCatalogBrain(uuid)
            if brain is None:
                raise Exception("Brain not found")
            obj = brain.getObject()
        else:
            obj = self.context

        # get username
        member = ploneapi.user.get_current()
        if member.getId():
            user = {
                "id": member.getUserName(),
                "email": member.getProperty("email"),
                "fullname": member.getProperty("fullname"),
            }
        else:
            raise Exception("Invalid user")

        # verify dataset
        if obj.portal_type not in (
            "org.bccvl.content.dataset",
            "org.bccvl.content.remotedataset",
            "org.bccvl.content.multispeciesdataset",
        ):
            raise Exception("Invalid UUID (content type)")
        md = IBCCVLMetadata(obj)
        if md.get("genre") not in ("DataGenreSpeciesOccurrence", "DataGenreTraits"):
            raise Exception("Invalid UUID (data type)")
        # get download url
        dlinfo = IDownloadInfo(obj)

        # download file
        from org.bccvl import movelib
        from org.bccvl.movelib.utils import build_source, build_destination
        import tempfile

        destdir = tempfile.mkdtemp(prefix="export_to_ala")
        try:
            from org.bccvl.tasks.celery import app

            settings = app.conf.get("bccvl", {})
            dest = os.path.join(destdir, os.path.basename(dlinfo["url"]))
            movelib.move(
                build_source(dlinfo["url"], user["id"], settings),
                build_destination("file://{}".format(dest))
            )
            csvfile = None
            if dlinfo["contenttype"] == "application/zip":
                # look at 'layers' to find the file within the zip
                arc = md["layers"].keys()[0]
                import zipfile

                zf = zipfile.ZipFile(dest, "r")
                csvfile = zf.open(arc, "r")
            else:
                csvfile = open(dest, "rb")
            import requests

            # "Accept:application/json" "Origin:http://example.com"
            res = requests.post(
                settings["ala"]["sandboxurl"],
                files={"file": csvfile},
                headers={"apikey": settings["ala"]["apikey"],
                         "Accept": "application/json"},
            )
            if res.status_code != 200:
                self.record_error(res.reason, res.status_code)
                raise Exception("Upload failed")
            retval = res.json()
            # TODO: do error checking
            #       keys: sandboxUrl, fileName, message, error: Bool, fileId
            return retval
        finally:
            import shutil

            shutil.rmtree(destdir)
    except Exception as e:
        self.record_error(str(e), 500)
        raise
def add(self, object):
    # FIXME: this is a workaround, which is fine for small uploaded files.
    #        large uploads should go through another process anyway
    # TODO: re implementing this method is the only way to know
    #       the full path of the object. We need the path to apply
    #       the transmogrifier chain.
    # fti = getUtility(IDexterityFTI, name=self.portal_type)
    container = aq_inner(self.context)
    try:
        # traverse to subfolder if possible
        container = container.restrictedTraverse('/'.join(self.subpath))
    except Exception as e:
        LOG.warn('Could not traverse to %s/%s',
                 '/'.join(container.getPhysicalPath()), '/'.join(self.subpath))
    new_object = addContentToContainer(container, object)
    # set data genre:
    if self.datagenre:
        IBCCVLMetadata(new_object)['genre'] = self.datagenre
    if self.categories:
        IBCCVLMetadata(new_object)['categories'] = self.categories
    new_object.subject = []
    if self.domain:
        new_object.subject = [self.domain]
    if self.timeperiod:
        new_object.subject += self.timeperiod
    # rdf commit should happen in transmogrifier step later on
    # if fti.immediate_view:
    #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
    # else:
    #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
    # start background import process (just a metadata update)
    # run transmogrify md extraction here
    context_path = '/'.join(new_object.getPhysicalPath())
    member = api.user.get_current()
    # species extract task
    if IMultiSpeciesDataset.providedBy(new_object):
        # kick off csv split import tasks
        import_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
            kwargs={
                'url': '{}/@@download/file/{}'.format(
                    new_object.absolute_url(), new_object.file.filename),
                'results_dir': get_results_dir(new_object, self.request,
                                               childSpecies=True),
                'import_context': {
                    'context': '/'.join(container.getPhysicalPath()),
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                },
                'context': {
                    'context': context_path,
                    'genre': self.datagenre,
                    'dataSource': new_object.dataSource,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        after_commit_task(import_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: import_multi_species_csv',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Multi species import pending')
    else:
        if hasattr(self, '_upload'):
            file = self._upload['file']
            new_object.format = file.contentType
            uid = IUUID(new_object)
            swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
            import os.path
            swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=uid,
                name=os.path.basename(file.filename))
            new_object.remoteUrl = swift_url
        else:
            file = new_object.file
            new_object.format = file.contentType
        dlinfo = IDownloadInfo(new_object)
        # single species upload
        update_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.update_metadata",
            kwargs={
                'url': dlinfo['url'],
                'filename': dlinfo['filename'],
                'contenttype': dlinfo['contenttype'],
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        # create upload task in case we upload to external store
        if hasattr(self, '_upload'):
            # FIXME: we can't use ssh here.... we don't know which container
            #        we are in... and sshing here is bad as well....
            # There is an upload ... we have to make sure the uploaded data
            # ends up in external storage
            # 3. put temp file aside
            tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
            tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
            blobf = file.open()
            try:
                # try rename
                os.rename(blobf.name, tmpfile)
            except OSError:
                # try copy
                shutil.copy(blobf.name, tmpfile)
            # TODO: we push the uploaded file directly to swift here..
            #       this really should be a background process
            # best solution: ...
            #   user uploads to some temporary upload service (file never ends up here)
            #   we have a remote url here, and tell the datamover to pull it from there
            #   and move it to final destination. (or something like this)
            # other good way: ...
            #   let user upload directly to swift (what about large file uploads?)
            #   and take care of clean up if necessary
            # 4. move file to swift
            # TODO: do we have enough information to upload to swift?
            #       need a temp url?
            swiftopts = app.conf.get('bccvl', {}).get('swift', {})
            src_url = build_source('file://{}'.format(tmpfile))
            dest_url = build_destination(
                'swift+{}'.format(new_object.remoteUrl),
                settings={
                    'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_tenant_name': swiftopts.get('os_tenant_name'),
                        'os_storage_url': swiftopts.get('os_storage_url')
                    }
                })
            try:
                movelib.move(src_url, dest_url)
            except Exception as e:
                # do error handling here
                raise
            finally:
                # clean up temp location
                path = os.path.dirname(tmpfile)
                shutil.rmtree(path)
        # queue job submission
        after_commit_task(update_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: update_metadata',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Metadata update pending')
    # We have to reindex after updating the object
    new_object.reindexObject()