def get_results_dir(result, request, childSpecies=False):
    swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
    # Use swift only if it is a remote dataset. For blob and multi-species
    # datasets, store locally. For any other dataset type (including the
    # child species of a multi-species dataset), store at swift if possible.
    do_swift = IRemoteDataset.providedBy(result) or \
        ((childSpecies or (not IMultiSpeciesDataset.providedBy(result))) and
         not IBlobDataset.providedBy(result) and
         swiftsettings.storage_url)
    if do_swift:
        if swiftsettings.storage_url:
            results_dir = 'swift+{storage_url}/{container}/{path}/'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=IUUID(result))
        else:
            raise Exception("Remote dataset requires swift url to be set")
    else:
        # if swift is not set up we use local storage
        results_dir = 'scp://{uid}@{ip}:{port}{path}/'.format(
            uid=pwd.getpwuid(os.getuid()).pw_name,
            # FIXME: hostname from request is not good enough...
            #        need to get ip or host from the plone_worker that does
            #        the actual import; store in registry?
            #        (is ok for testing)
            # ip=get_public_ip(),
            ip=get_hostname(request),
            port=os.environ.get('SSH_PORT', 22),
            path=tempfile.mkdtemp(prefix='result_import_'))
    return results_dir
def get_results_dir(result, request):
    swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
    # Use swift only if it is a remote dataset. For blob and multi-species
    # datasets, store locally. For any other dataset type, store at swift
    # if possible.
    do_swift = IRemoteDataset.providedBy(result) or \
        (not IMultiSpeciesDataset.providedBy(result) and
         not IBlobDataset.providedBy(result) and
         swiftsettings.storage_url)
    if do_swift:
        if swiftsettings.storage_url:
            results_dir = 'swift+{storage_url}/{container}/{path}/'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=IUUID(result)
            )
        else:
            raise Exception("Remote dataset requires swift url to be set")
    else:
        # if swift is not set up we use local storage
        results_dir = 'scp://{uid}@{ip}:{port}{path}/'.format(
            uid=pwd.getpwuid(os.getuid()).pw_name,
            # FIXME: hostname from request is not good enough...
            #        need to get ip or host from the plone_worker that does
            #        the actual import; store in registry?
            #        (is ok for testing)
            # ip=get_public_ip(),
            ip=get_hostname(request),
            port=os.environ.get('SSH_PORT', 22),
            path=tempfile.mkdtemp(prefix='result_import_')
        )
    return results_dir
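
# Illustrative only: a minimal sketch of the two URL shapes get_results_dir
# can return, using hypothetical placeholder values (storage url, container,
# uuid, hostname) that are not taken from any real deployment.
def _results_dir_shapes_example():
    uuid = '0123456789abcdef0123456789abcdef'  # hypothetical dataset UUID
    # swift backend (remote dataset, or swift configured):
    swift_dir = 'swift+{storage_url}/{container}/{path}/'.format(
        storage_url='https://swift.example.com/v1/AUTH_bccvl',
        container='results',
        path=uuid)
    # local backend (blob / multi-species dataset, or no swift configured):
    scp_dir = 'scp://{uid}@{ip}:{port}{path}/'.format(
        uid='plone', ip='worker.example.com', port=22,
        path='/tmp/result_import_XXXXXX')
    # downstream datamover tasks presumably dispatch on the 'swift+' vs
    # 'scp://' scheme prefix to decide how to write result files
    return swift_dir, scp_dir
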
def build_ala_import_task(lsid, dataset, request):
    # creates task chain to import ala dataset
    """
    lsid ... species id
    context ... a dictionary with keys:
        - context: path to context object
        - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }
    results_dir = get_results_dir(dataset, request)
    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset) and dataset.dataSource in ('ala', 'gbif', 'obis'):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request, childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }
    if dataset.dataSource == 'gbif':
        return datamover.pull_occurrences_from_gbif.si(lsid, results_dir, context, import_multispecies_params)
    elif dataset.dataSource == 'aekos':
        return datamover.pull_occurrences_from_aekos.si(lsid, results_dir, context)
    elif dataset.dataSource == 'obis':
        return datamover.pull_occurrences_from_obis.si(lsid, results_dir, context, import_multispecies_params)
    else:
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'http://biocache.ala.org.au/ws'
        }]
        return datamover.pull_occurrences_from_ala.si(params, results_dir, context, import_multispecies_params)
def build_ala_import_task(lsid, dataset, request):
    # creates task chain to import ala dataset
    """
    lsid ... species id
    context ... a dictionary with keys:
        - context: path to context object
        - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }
    results_dir = get_results_dir(dataset, request)
    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset) and dataset.dataSource in (
            'ala', 'gbif', 'obis'):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request, childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }
    if dataset.dataSource == 'gbif':
        return datamover.pull_occurrences_from_gbif.si(
            lsid, results_dir, context, import_multispecies_params)
    elif dataset.dataSource == 'aekos':
        return datamover.pull_occurrences_from_aekos.si(
            lsid, results_dir, context)
    elif dataset.dataSource == 'obis':
        return datamover.pull_occurrences_from_obis.si(
            lsid, results_dir, context, import_multispecies_params)
    else:
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'https://biocache-ws.ala.org.au/ws'
        }]
        return datamover.pull_occurrences_from_ala.si(
            params, results_dir, context, import_multispecies_params)
def build_ala_import_qid_task(params, dataset, request):
    # creates task chain to import ala dataset
    """
    params ... [{name, qid, url}, ...]
    context ... a dictionary with keys:
        - context: path to context object
        - userid: zope userid
    """
    # we need site-path, context-path and lsid for this job
    dataset_path = '/'.join(dataset.getPhysicalPath())
    member = api.user.get_current()
    context = {
        'context': dataset_path,
        'dataSource': dataset.dataSource,
        'user': {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    }
    import_multispecies_params = {}
    if IMultiSpeciesDataset.providedBy(dataset):
        container = aq_parent(aq_inner(dataset))
        import_multispecies_params = {
            'results_dir': get_results_dir(dataset, request, childSpecies=True),
            'import_context': {
                'context': '/'.join(container.getPhysicalPath()),
                'user': {
                    'id': member.getUserName(),
                    'email': member.getProperty('email'),
                    'fullname': member.getProperty('fullname')
                }
            }
        }
    results_dir = get_results_dir(dataset, request)
    task = datamover.pull_occurrences_from_ala.si(params, results_dir, context,
                                                  import_multispecies_params)
    return task
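
# Illustrative only: the `params` argument of build_ala_import_qid_task, shaped
# as the docstring above describes ([{name, qid, url}, ...]). All values below
# are hypothetical placeholders; the actual qid format and endpoint are
# whatever the caller obtained from the biocache service.
def _ala_qid_params_example():
    params = [{
        'name': 'My saved ALA search',            # display name for the import
        'qid': '1234567890',                      # hypothetical biocache query id
        'url': 'https://biocache-ws.ala.org.au/ws',
    }]
    # each entry presumably corresponds to one saved biocache query that the
    # datamover resolves into occurrence records
    return params
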
def import_ala_data(self):
    if self.request.get('REQUEST_METHOD', 'GET').upper() != 'POST':
        self.record_error('Request must be POST', 400)
        raise BadRequest('Request must be POST')
    context = None
    # get import context
    if ISiteRoot.providedBy(self.context):
        # we have been called at site root... let's traverse to default
        # import location
        context = self.context.restrictedTraverse("/".join(
            (defaults.DATASETS_FOLDER_ID,
             defaults.DATASETS_SPECIES_FOLDER_ID,
             'ala')))
    else:
        # custom context.... let's use it
        context = self.context
    # do user check first
    member = ploneapi.user.get_current()
    if member.getId():
        user = {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    else:
        # We need at least a valid user
        raise Unauthorized("Invalid user")
    # check permission
    if not checkPermission('org.bccvl.AddDataset', context):
        raise Unauthorized("User not allowed in this context")
    params = self.request.form.get('data')
    if not params:
        raise BadRequest("At least one of traits or environ has to be set")
    if params is None:
        self.record_error('Bad Request', 400,
                          'Missing parameter data',
                          {'parameter': 'data'})
    if not params:
        self.record_error('Bad Request', 400,
                          'Empty parameter data',
                          {'parameter': 'data'})
    # TODO: should validate objects inside as well? (or use json schema
    #       validation?)
    # all good so far
    # pull dataset from aekos
    # TODO: get better name here
    title = params[0].get('name', 'ALA import')
    # determine dataset type
    # 1. test if it is a multi species import
    species = set()
    for query in params:
        biocache_url = '{}/occurrences/search'.format(query['url'])
        query = {
            'q': query['query'],
            'pageSize': 0,
            'limit': 2,
            'facets': 'species_guid',
            'fq': 'species_guid:*'  # skip results without species guid
        }
        res = requests.get(biocache_url, params=query)
        res = res.json()
        # FIXME: do we need to treat sandbox downloads differently?
        if res.get('facetResults'):  # do we have some results at all?
            for guid in res['facetResults'][0]['fieldResult']:
                species.add(guid['label'])
    if len(species) > 1:
        portal_type = 'org.bccvl.content.multispeciesdataset'
    else:
        portal_type = 'org.bccvl.content.dataset'
        swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
        if swiftsettings.storage_url:
            portal_type = 'org.bccvl.content.remotedataset'
    # create content
    ds = createContent(portal_type, title=title)
    ds.dataSource = 'ala'
    ds.description = u' '.join([title, u' imported from ALA'])
    ds.import_params = params
    ds = addContentToContainer(context, ds)
    md = IBCCVLMetadata(ds)
    if IMultiSpeciesDataset.providedBy(ds):
        md['genre'] = 'DataGenreSpeciesCollection'
        md['categories'] = ['multispecies']
    else:
        # species dataset
        md['genre'] = 'DataGenreSpeciesOccurrence'
        md['categories'] = ['occurrence']
        # TODO: populate this correctly as well
        md['species'] = [{'scientificName': 'qid', 'taxonID': 'qid'}]
    # FIXME: IStatusMessage should not be in API call
    from Products.statusmessages.interfaces import IStatusMessage
    IStatusMessage(self.request).add('New Dataset created', type='info')
    # start import job
    jt = IExperimentJobTracker(ds)
    status, message = jt.start_job()
    # reindex object to make sure everything is up to date
    ds.reindexObject()
    # FIXME: IStatusMessage should not be in API call
    IStatusMessage(self.request).add(message, type=status)
    # FIXME: API should not return a redirect
    #        201: new resource created ... location may point to resource
    from Products.CMFCore.utils import getToolByName
    portal = getToolByName(self.context, 'portal_url').getPortalObject()
    nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
    self.request.response.setStatus(201)
    self.request.response.setHeader('Location', nexturl)
    # FIXME: should return a nice json representation of success or error
    return {
        'status': status,
        'message': message,
        'jobid': IJobTracker(ds).get_job().id
    }
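
# Illustrative only: a standalone sketch of the species_guid facet probe used
# in import_ala_data above to decide between a single-species and a
# multi-species dataset. The query value is a hypothetical placeholder; the
# endpoint and query parameters mirror the ones already used in the code.
def _count_species_guids_example(query='lsid:urn:lsid:example',
                                 url='https://biocache-ws.ala.org.au/ws'):
    import requests
    res = requests.get(
        '{}/occurrences/search'.format(url),
        params={
            'q': query,
            'pageSize': 0,            # no occurrence records, facets only
            'limit': 2,
            'facets': 'species_guid',
            'fq': 'species_guid:*',   # skip results without species guid
        })
    res = res.json()
    species = set()
    if res.get('facetResults'):
        for guid in res['facetResults'][0]['fieldResult']:
            species.add(guid['label'])
    # more than one distinct species guid => treat as a multi-species import
    return len(species)
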
def add(self, object):
    # FIXME: this is a workaround, which is fine for small uploaded files.
    #        large uploads should go through another process anyway
    # TODO: re-implementing this method is the only way to know
    #       the full path of the object. We need the path to apply
    #       the transmogrifier chain.
    # fti = getUtility(IDexterityFTI, name=self.portal_type)
    container = aq_inner(self.context)
    try:
        # traverse to subfolder if possible
        container = container.restrictedTraverse('/'.join(self.subpath))
    except Exception as e:
        LOG.warn('Could not traverse to %s/%s',
                 '/'.join(container.getPhysicalPath()),
                 '/'.join(self.subpath))
    new_object = addContentToContainer(container, object)
    # set data genre:
    if self.datagenre:
        IBCCVLMetadata(new_object)['genre'] = self.datagenre
    if self.categories:
        IBCCVLMetadata(new_object)['categories'] = self.categories
    new_object.subject = []
    if self.domain:
        new_object.subject = [self.domain]
    if self.timeperiod:
        new_object.subject += self.timeperiod
    # rdf commit should happen in transmogrifier step later on
    # if fti.immediate_view:
    #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
    # else:
    #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
    # start background import process (just a metadata update)
    # run transmogrify md extraction here
    context_path = '/'.join(new_object.getPhysicalPath())
    member = api.user.get_current()
    # species extract task
    if IMultiSpeciesDataset.providedBy(new_object):
        # kick off csv split import tasks
        import_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
            kwargs={
                'url': '{}/@@download/file/{}'.format(
                    new_object.absolute_url(), new_object.file.filename),
                'results_dir': get_results_dir(new_object, self.request,
                                               childSpecies=True),
                'import_context': {
                    'context': '/'.join(container.getPhysicalPath()),
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                },
                'context': {
                    'context': context_path,
                    'genre': self.datagenre,
                    'dataSource': new_object.dataSource,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        after_commit_task(import_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: import_multi_species_csv',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Multi species import pending')
    else:
        if hasattr(self, '_upload'):
            file = self._upload['file']
            new_object.format = file.contentType
            uid = IUUID(new_object)
            swiftsettings = getUtility(IRegistry).forInterface(
                ISwiftSettings)
            import os.path
            swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=uid,
                name=os.path.basename(file.filename))
            new_object.remoteUrl = swift_url
        else:
            file = new_object.file
            new_object.format = file.contentType
        dlinfo = IDownloadInfo(new_object)
        # single species upload
        update_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.update_metadata",
            kwargs={
                'url': dlinfo['url'],
                'filename': dlinfo['filename'],
                'contenttype': dlinfo['contenttype'],
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        # create upload task in case we upload to external store
        if hasattr(self, '_upload'):
            # FIXME: we can't use ssh here.... we don't know which container
            #        we are in... and sshing here is bad as well....
            # There is an upload ... we have to make sure the uploaded data
            # ends up in external storage
            # 3. put temp file aside
            tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
            tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
            blobf = file.open()
            try:
                # try rename
                os.rename(blobf.name, tmpfile)
            except OSError:
                # try copy
                shutil.copy(blobf.name, tmpfile)
            # TODO: we push the uploaded file directly to swift here..
            #       this really should be a background process
            #       best solution: ...
            #           user uploads to some temporary upload service (file
            #           never ends up here); we have a remote url here, and
            #           tell the datamover to pull it from there and move it
            #           to final destination. (or something like this)
            #       other good way: ...
            #           let user upload directly to swift (what about large
            #           file uploads?) and take care of clean up if necessary
            # 4. move file to swift
            # TODO: do we have enough information to upload to swift?
            #       need a temp url?
            swiftopts = app.conf.get('bccvl', {}).get('swift', {})
            src_url = build_source('file://{}'.format(tmpfile))
            dest_url = build_destination(
                'swift+{}'.format(new_object.remoteUrl),
                settings={
                    'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_project_name': swiftopts.get('os_project_name'),
                        'os_storage_url': swiftopts.get('os_storage_url'),
                        'os_user_domain_name': swiftopts.get('os_user_domain_name'),
                        'os_project_domain_name': swiftopts.get('os_project_domain_name'),
                        'auth_version': swiftopts.get('auth_version')
                    }
                })
            try:
                movelib.move(src_url, dest_url)
            except Exception as e:
                # do error handling here
                raise
            finally:
                # clean up temp location
                path = os.path.dirname(tmpfile)
                shutil.rmtree(path)
        # queue job submission
        after_commit_task(update_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: update_metadata',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Metadata update pending')
    # We have to reindex after updating the object
    new_object.reindexObject()
def add(self, object):
    # FIXME: this is a workaround, which is fine for small uploaded files.
    #        large uploads should go through another process anyway
    # TODO: re-implementing this method is the only way to know
    #       the full path of the object. We need the path to apply
    #       the transmogrifier chain.
    # fti = getUtility(IDexterityFTI, name=self.portal_type)
    container = aq_inner(self.context)
    try:
        # traverse to subfolder if possible
        container = container.restrictedTraverse('/'.join(self.subpath))
    except Exception as e:
        LOG.warn('Could not traverse to %s/%s',
                 '/'.join(container.getPhysicalPath()),
                 '/'.join(self.subpath))
    new_object = addContentToContainer(container, object)
    # set data genre:
    if self.datagenre:
        IBCCVLMetadata(new_object)['genre'] = self.datagenre
    if self.categories:
        IBCCVLMetadata(new_object)['categories'] = self.categories
    # rdf commit should happen in transmogrifier step later on
    # if fti.immediate_view:
    #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
    # else:
    #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
    # start background import process (just a metadata update)
    # run transmogrify md extraction here
    context_path = '/'.join(new_object.getPhysicalPath())
    member = api.user.get_current()
    # species extract task
    if IMultiSpeciesDataset.providedBy(new_object):
        # kick off csv split import tasks
        import_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
            kwargs={
                'url': '{}/@@download/file/{}'.format(
                    new_object.absolute_url(), new_object.file.filename),
                'results_dir': get_results_dir(container, self.request),
                'import_context': {
                    'context': '/'.join(container.getPhysicalPath()),
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                },
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        after_commit_task(import_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        job = jt.new_job('TODO: generate id',
                         'generate taskname: import_multi_species_csv')
        job.type = new_object.portal_type
        jt.set_progress('PENDING', u'Multi species import pending')
    else:
        if hasattr(self, '_upload'):
            file = self._upload['file']
            new_object.format = file.contentType
            uid = IUUID(new_object)
            swiftsettings = getUtility(
                IRegistry).forInterface(ISwiftSettings)
            import os.path
            swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=uid,
                name=os.path.basename(file.filename))
            new_object.remoteUrl = swift_url
        else:
            file = new_object.file
            new_object.format = file.contentType
        dlinfo = IDownloadInfo(new_object)
        # single species upload
        update_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.update_metadata",
            kwargs={
                'url': dlinfo['url'],
                'filename': dlinfo['filename'],
                'contenttype': dlinfo['contenttype'],
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        # create upload task in case we upload to external store
        if hasattr(self, '_upload'):
            # There is an upload ... we have to make sure the uploaded data
            # ends up in external storage
            # 3. put temp file aside
            tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
            tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
            blobf = file.open()
            try:
                # try rename
                os.rename(blobf.name, tmpfile)
            except OSError:
                # try copy
                shutil.copy(blobf.name, tmpfile)
            # 4. update task chain
            src_url = 'scp://{uid}@{ip}:{port}{file}'.format(
                uid=pwd.getpwuid(os.getuid()).pw_name,
                ip=get_hostname(self.request),
                port=os.environ.get('SSH_PORT', 22),
                file=tmpfile)
            dest_url = 'swift+{}'.format(new_object.remoteUrl)
            move_task = app.signature(
                'org.bccvl.tasks.datamover.tasks.move',
                kwargs={
                    'move_args': [(src_url, dest_url)],
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            cleanup_task = app.signature(
                'org.bccvl.tasks.plone.import_cleanup',
                kwargs={
                    'path': os.path.dirname(tmpfile),
                    'context': {
                        'context': context_path,
                        'user': {
                            'id': member.getUserName(),
                            'email': member.getProperty('email'),
                            'fullname': member.getProperty('fullname')
                        }
                    }
                },
                immutable=True)
            update_task = move_task | update_task | cleanup_task
            # need some more workflow states here to support e.g. zip file
            # upload (multiple rasters), give the user a chance to better
            # define metadata; make sure update_metadata does not change
            # user-edited metadata -> layer, unit, projection, whatever
            # FIXME: clean up tmp upload directory as well
        # queue job submission
        after_commit_task(update_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        job = jt.new_job('TODO: generate id',
                         'generate taskname: update_metadata')
        job.type = new_object.portal_type
        jt.set_progress('PENDING', u'Metadata update pending')
    # We have to reindex after updating the object
    new_object.reindexObject()
def import_ala_data(self):
    if self.request.get("REQUEST_METHOD", "GET").upper() != "POST":
        self.record_error("Request must be POST", 400)
        raise BadRequest("Request must be POST")
    context = None
    # get import context
    if ISiteRoot.providedBy(self.context):
        # we have been called at site root... let's traverse to default
        # import location
        context = self.context.restrictedTraverse(
            "/".join((defaults.DATASETS_FOLDER_ID,
                      defaults.DATASETS_SPECIES_FOLDER_ID,
                      "ala"))
        )
    else:
        # custom context.... let's use it
        context = self.context
    # do user check first
    member = ploneapi.user.get_current()
    if member.getId():
        user = {
            "id": member.getUserName(),
            "email": member.getProperty("email"),
            "fullname": member.getProperty("fullname"),
        }
    else:
        # We need at least a valid user
        raise Unauthorized("Invalid user")
    # check permission
    if not checkPermission("org.bccvl.AddDataset", context):
        raise Unauthorized("User not allowed in this context")
    params = self.request.form.get("data")
    if not params:
        raise BadRequest("At least one of traits or environ has to be set")
    if params is None:
        self.record_error("Bad Request", 400,
                          "Missing parameter data", {"parameter": "data"})
    if not params:
        self.record_error("Bad Request", 400,
                          "Empty parameter data", {"parameter": "data"})
    # TODO: should validate objects inside as well? (or use json schema
    #       validation?)
    # all good so far
    # pull dataset from aekos
    # TODO: get better name here
    title = params[0].get("name", "ALA import")
    # determine dataset type
    # 1. test if it is a multi species import
    species = set()
    for query in params:
        biocache_url = "{}/occurrences/search".format(query["url"])
        query = {
            "q": query["query"],
            "pageSize": 0,
            "limit": 2,
            "facets": "species_guid",
            "fq": "species_guid:*",  # skip results without species guid
        }
        res = requests.get(biocache_url, params=query)
        res = res.json()
        # FIXME: do we need to treat sandbox downloads differently?
        if res["facetResults"]:  # do we have some results at all?
            for guid in res["facetResults"][0]["fieldResult"]:
                species.add(guid["label"])
    if len(species) > 1:
        portal_type = "org.bccvl.content.multispeciesdataset"
    else:
        portal_type = "org.bccvl.content.dataset"
        swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
        if swiftsettings.storage_url:
            portal_type = "org.bccvl.content.remotedataset"
    # create content
    ds = createContentInContainer(context, portal_type, title=title)
    ds.dataSource = "ala"
    ds.description = u" ".join([title, u" imported from ALA"])
    ds.import_params = params
    md = IBCCVLMetadata(ds)
    if IMultiSpeciesDataset.providedBy(ds):
        md["genre"] = "DataGenreSpeciesCollection"
    else:
        # species dataset
        md["genre"] = "DataGenreSpeciesOccurrence"
        md["categories"] = ["occurrence"]
        # TODO: populate this correctly as well
        md["species"] = [{"scientificName": "qid", "taxonID": "qid"}]
    # FIXME: IStatusMessage should not be in API call
    from Products.statusmessages.interfaces import IStatusMessage
    IStatusMessage(self.request).add("New Dataset created", type="info")
    # start import job
    jt = IExperimentJobTracker(ds)
    status, message = jt.start_job()
    # reindex object to make sure everything is up to date
    ds.reindexObject()
    # FIXME: IStatusMessage should not be in API call
    IStatusMessage(self.request).add(message, type=status)
    # FIXME: API should not return a redirect
    #        201: new resource created ... location may point to resource
    from Products.CMFCore.utils import getToolByName
    portal = getToolByName(self.context, "portal_url").getPortalObject()
    nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
    self.request.response.setStatus(201)
    self.request.response.setHeader("Location", nexturl)
    # FIXME: should return a nice json representation of success or error
    return {"status": status, "message": message,
            "jobid": IJobTracker(ds).get_job().id}
def pullOccurrenceFromALA(self, lsid, taxon, dataSrc='ala', common=None):
    # TODO: check permissions?
    # 1. create new dataset with taxon, lsid and common name set
    portal = getToolByName(self.context, 'portal_url').getPortalObject()
    if dataSrc == 'ala':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['ala']
    elif dataSrc == 'gbif':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['gbif']
    elif dataSrc == 'aekos':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['aekos']
    elif dataSrc == 'obis':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['obis']
    else:
        raise BadRequest('Invalid data source {0}'.format(dataSrc))
    title = [taxon]
    if common:
        title.append(u"({})".format(common))
    # determine dataset type
    # 1. test if it is a multi species import
    species = set()
    if dataSrc == 'ala':
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'https://biocache-ws.ala.org.au/ws'
        }]
        for query in params:
            biocache_url = '{}/occurrences/search'.format(query['url'])
            query = {
                'q': query['query'],
                'pageSize': 0,
                'limit': 2,
                'facets': 'species_guid',
                'fq': 'species_guid:*'  # skip results without species guid
            }
            res = requests.get(biocache_url, params=query)
            res = res.json()
            if res.get('facetResults'):  # do we have some results at all?
                for guid in res['facetResults'][0]['fieldResult']:
                    species.add(guid['label'])
    elif dataSrc == 'gbif':
        genusChildren_url = 'https://api.gbif.org/v1/species/{}/children?offset=0&limit=40'.format(
            lsid)
        res = requests.get(genusChildren_url)
        res = res.json()
        if res.get('results'):
            for sp in res.get('results'):
                if sp.get('speciesKey'):
                    species.add(sp['speciesKey'])
    elif dataSrc == 'obis':
        genusChildren_url = 'https://backend.iobis.org/children/{}'.format(
            lsid)
        res = requests.get(genusChildren_url)
        res = res.json()
        for sp in res:
            if sp.get('rank_name', '') != 'Species':
                continue
            if sp.get('valid_id'):
                species.add(sp['valid_id'])
    if len(species) > 1:
        portal_type = 'org.bccvl.content.multispeciesdataset'
    else:
        swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
        if swiftsettings.storage_url:
            portal_type = 'org.bccvl.content.remotedataset'
        else:
            portal_type = 'org.bccvl.content.dataset'
    # TODO: make sure we get a better content id than dataset-x
    title = u' '.join(title)
    ds = createContent(portal_type, title=title)
    ds.dataSource = dataSrc  # Either ALA or GBIF as source
    # TODO: add number of occurrences to description
    ds.description = u' '.join(
        (title, u'imported from', unicode(dataSrc.upper())))
    ds = addContentToContainer(dscontainer, ds)
    md = IBCCVLMetadata(ds)
    # TODO: provenance ... import url?
    # FIXME: verify input parameters before adding to graph
    if IMultiSpeciesDataset.providedBy(ds):
        md['genre'] = 'DataGenreSpeciesCollection'
        md['categories'] = ['multispecies']
    else:
        md['genre'] = 'DataGenreSpeciesOccurrence'
        md['categories'] = ['occurrence']
    md['species'] = {
        'scientificName': taxon,
        'taxonID': lsid,
    }
    if common:
        md['species']['vernacularName'] = common
    IStatusMessage(self.request).add('New Dataset created', type='info')
    # 2. create and push alaimport job for dataset
    # TODO: make this named adapter
    jt = IExperimentJobTracker(ds)
    status, message = jt.start_job()
    # reindex object to make sure everything is up to date
    ds.reindexObject()
    # Job submission state notifier
    IStatusMessage(self.request).add(message, type=status)
    return (status, message)
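
# Illustrative only: a standalone sketch of the genus-expansion lookups used in
# pullOccurrenceFromALA above. The taxon identifier is a hypothetical
# placeholder; the endpoints and response fields mirror the ones already used
# in the code.
def _genus_children_example(lsid, dataSrc='gbif'):
    import requests
    species = set()
    if dataSrc == 'gbif':
        # GBIF: list child taxa of the given key and collect their speciesKey
        res = requests.get(
            'https://api.gbif.org/v1/species/{}/children?offset=0&limit=40'.format(lsid))
        for sp in res.json().get('results', []):
            if sp.get('speciesKey'):
                species.add(sp['speciesKey'])
    elif dataSrc == 'obis':
        # OBIS: list children of the given taxon id, keep species-rank entries
        res = requests.get('https://backend.iobis.org/children/{}'.format(lsid))
        for sp in res.json():
            if sp.get('rank_name', '') == 'Species' and sp.get('valid_id'):
                species.add(sp['valid_id'])
    # more than one child species => the import is treated as multi-species
    return species
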
def add(self, object):
    # FIXME: this is a workaround, which is fine for small uploaded files.
    #        large uploads should go through another process anyway
    # TODO: re-implementing this method is the only way to know
    #       the full path of the object. We need the path to apply
    #       the transmogrifier chain.
    # fti = getUtility(IDexterityFTI, name=self.portal_type)
    container = aq_inner(self.context)
    try:
        # traverse to subfolder if possible
        container = container.restrictedTraverse('/'.join(self.subpath))
    except Exception as e:
        LOG.warn('Could not traverse to %s/%s',
                 '/'.join(container.getPhysicalPath()),
                 '/'.join(self.subpath))
    new_object = addContentToContainer(container, object)
    # set data genre:
    if self.datagenre:
        IBCCVLMetadata(new_object)['genre'] = self.datagenre
    if self.categories:
        IBCCVLMetadata(new_object)['categories'] = self.categories
    new_object.subject = []
    if self.domain:
        new_object.subject = [self.domain]
    if self.timeperiod:
        new_object.subject += self.timeperiod
    # rdf commit should happen in transmogrifier step later on
    # if fti.immediate_view:
    #     self.immediate_view = "%s/%s/%s" % (container.absolute_url(), new_object.id, fti.immediate_view,)
    # else:
    #     self.immediate_view = "%s/%s" % (container.absolute_url(), new_object.id)
    # start background import process (just a metadata update)
    # run transmogrify md extraction here
    context_path = '/'.join(new_object.getPhysicalPath())
    member = api.user.get_current()
    # species extract task
    if IMultiSpeciesDataset.providedBy(new_object):
        # kick off csv split import tasks
        import_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.import_multi_species_csv",
            kwargs={
                'url': '{}/@@download/file/{}'.format(
                    new_object.absolute_url(), new_object.file.filename),
                'results_dir': get_results_dir(new_object, self.request,
                                               childSpecies=True),
                'import_context': {
                    'context': '/'.join(container.getPhysicalPath()),
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                },
                'context': {
                    'context': context_path,
                    'genre': self.datagenre,
                    'dataSource': new_object.dataSource,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        after_commit_task(import_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: import_multi_species_csv',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Multi species import pending')
    else:
        if hasattr(self, '_upload'):
            file = self._upload['file']
            new_object.format = file.contentType
            uid = IUUID(new_object)
            swiftsettings = getUtility(
                IRegistry).forInterface(ISwiftSettings)
            import os.path
            swift_url = '{storage_url}/{container}/{path}/{name}'.format(
                storage_url=swiftsettings.storage_url,
                container=swiftsettings.result_container,
                path=uid,
                name=os.path.basename(file.filename))
            new_object.remoteUrl = swift_url
        else:
            file = new_object.file
            new_object.format = file.contentType
        dlinfo = IDownloadInfo(new_object)
        # single species upload
        update_task = app.signature(
            "org.bccvl.tasks.datamover.tasks.update_metadata",
            kwargs={
                'url': dlinfo['url'],
                'filename': dlinfo['filename'],
                'contenttype': dlinfo['contenttype'],
                'context': {
                    'context': context_path,
                    'user': {
                        'id': member.getUserName(),
                        'email': member.getProperty('email'),
                        'fullname': member.getProperty('fullname')
                    }
                }
            },
            immutable=True)
        # create upload task in case we upload to external store
        if hasattr(self, '_upload'):
            # FIXME: we can't use ssh here.... we don't know which container
            #        we are in... and sshing here is bad as well....
            # There is an upload ... we have to make sure the uploaded data
            # ends up in external storage
            # 3. put temp file aside
            tmpdir = tempfile.mkdtemp(prefix='bccvl_upload')
            tmpfile = os.path.join(tmpdir, os.path.basename(file.filename))
            blobf = file.open()
            try:
                # try rename
                os.rename(blobf.name, tmpfile)
            except OSError:
                # try copy
                shutil.copy(blobf.name, tmpfile)
            # TODO: we push the uploaded file directly to swift here..
            #       this really should be a background process
            #       best solution: ...
            #           user uploads to some temporary upload service (file
            #           never ends up here); we have a remote url here, and
            #           tell the datamover to pull it from there and move it
            #           to final destination. (or something like this)
            #       other good way: ...
            #           let user upload directly to swift (what about large
            #           file uploads?) and take care of clean up if necessary
            # 4. move file to swift
            # TODO: do we have enough information to upload to swift?
            #       need a temp url?
            swiftopts = app.conf.get('bccvl', {}).get('swift', {})
            src_url = build_source('file://{}'.format(tmpfile))
            dest_url = build_destination(
                'swift+{}'.format(new_object.remoteUrl),
                settings={
                    'swift': {
                        'os_auth_url': swiftopts.get('os_auth_url'),
                        'os_username': swiftopts.get('os_username'),
                        'os_password': swiftopts.get('os_password'),
                        'os_tenant_name': swiftopts.get('os_tenant_name'),
                        'os_storage_url': swiftopts.get('os_storage_url')
                    }
                })
            try:
                movelib.move(src_url, dest_url)
            except Exception as e:
                # do error handling here
                raise
            finally:
                # clean up temp location
                path = os.path.dirname(tmpfile)
                shutil.rmtree(path)
        # queue job submission
        after_commit_task(update_task)
        # create job tracking object
        jt = IJobTracker(new_object)
        jt.new_job('TODO: generate id',
                   'generate taskname: update_metadata',
                   function=new_object.dataSource,
                   type=new_object.portal_type)
        jt.set_progress('PENDING', u'Metadata update pending')
    # We have to reindex after updating the object
    new_object.reindexObject()
def pullOccurrenceFromALA(self, lsid, taxon, dataSrc='ala', common=None):
    # TODO: check permissions?
    # 1. create new dataset with taxon, lsid and common name set
    portal = getToolByName(self.context, 'portal_url').getPortalObject()
    if dataSrc == 'ala':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['ala']
    elif dataSrc == 'gbif':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['gbif']
    elif dataSrc == 'aekos':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['aekos']
    elif dataSrc == 'obis':
        dscontainer = portal[defaults.DATASETS_FOLDER_ID][
            defaults.DATASETS_SPECIES_FOLDER_ID]['obis']
    else:
        raise BadRequest('Invalid data source {0}'.format(dataSrc))
    title = [taxon]
    if common:
        title.append(u"({})".format(common))
    # determine dataset type
    # 1. test if it is a multi species import
    species = set()
    if dataSrc == 'ala':
        params = [{
            'query': 'lsid:{}'.format(lsid),
            'url': 'http://biocache.ala.org.au/ws'
        }]
        for query in params:
            biocache_url = '{}/occurrences/search'.format(query['url'])
            query = {
                'q': query['query'],
                'pageSize': 0,
                'limit': 2,
                'facets': 'species_guid',
                'fq': 'species_guid:*'  # skip results without species guid
            }
            res = requests.get(biocache_url, params=query)
            res = res.json()
            if res.get('facetResults'):  # do we have some results at all?
                for guid in res['facetResults'][0]['fieldResult']:
                    species.add(guid['label'])
    elif dataSrc == 'gbif':
        genusChildren_url = 'https://api.gbif.org/v1/species/{}/children?offset=0&limit=40'.format(lsid)
        res = requests.get(genusChildren_url)
        res = res.json()
        if res.get('results'):
            for sp in res.get('results'):
                if sp.get('speciesKey'):
                    species.add(sp['speciesKey'])
    elif dataSrc == 'obis':
        genusChildren_url = 'https://backend.iobis.org/children/{}'.format(lsid)
        res = requests.get(genusChildren_url)
        res = res.json()
        for sp in res:
            if sp.get('rank_name', '') != 'Species':
                continue
            if sp.get('valid_id'):
                species.add(sp['valid_id'])
    if len(species) > 1:
        portal_type = 'org.bccvl.content.multispeciesdataset'
    else:
        swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
        if swiftsettings.storage_url:
            portal_type = 'org.bccvl.content.remotedataset'
        else:
            portal_type = 'org.bccvl.content.dataset'
    # TODO: make sure we get a better content id than dataset-x
    title = u' '.join(title)
    ds = createContent(portal_type, title=title)
    ds.dataSource = dataSrc  # Either ALA or GBIF as source
    # TODO: add number of occurrences to description
    ds.description = u' '.join(
        (title, u'imported from', unicode(dataSrc.upper()))
    )
    ds = addContentToContainer(dscontainer, ds)
    md = IBCCVLMetadata(ds)
    # TODO: provenance ... import url?
    # FIXME: verify input parameters before adding to graph
    if IMultiSpeciesDataset.providedBy(ds):
        md['genre'] = 'DataGenreSpeciesCollection'
        md['categories'] = ['multispecies']
    else:
        md['genre'] = 'DataGenreSpeciesOccurrence'
        md['categories'] = ['occurrence']
    md['species'] = {
        'scientificName': taxon,
        'taxonID': lsid,
    }
    if common:
        md['species']['vernacularName'] = common
    IStatusMessage(self.request).add('New Dataset created', type='info')
    # 2. create and push alaimport job for dataset
    # TODO: make this named adapter
    jt = IExperimentJobTracker(ds)
    status, message = jt.start_job()
    # reindex object to make sure everything is up to date
    ds.reindexObject()
    # Job submission state notifier
    IStatusMessage(self.request).add(message, type=status)
    return (status, message)
def import_ala_data(self):
    if self.request.get('REQUEST_METHOD', 'GET').upper() != 'POST':
        self.record_error('Request must be POST', 400)
        raise BadRequest('Request must be POST')
    context = None
    # get import context
    if ISiteRoot.providedBy(self.context):
        # we have been called at site root... let's traverse to default
        # import location
        context = self.context.restrictedTraverse(
            "/".join((defaults.DATASETS_FOLDER_ID,
                      defaults.DATASETS_SPECIES_FOLDER_ID,
                      'ala')))
    else:
        # custom context.... let's use it
        context = self.context
    # do user check first
    member = ploneapi.user.get_current()
    if member.getId():
        user = {
            'id': member.getUserName(),
            'email': member.getProperty('email'),
            'fullname': member.getProperty('fullname')
        }
    else:
        # We need at least a valid user
        raise Unauthorized("Invalid user")
    # check permission
    if not checkPermission('org.bccvl.AddDataset', context):
        raise Unauthorized("User not allowed in this context")
    params = self.request.form.get('data')
    if not params:
        raise BadRequest("At least one of traits or environ has to be set")
    if params is None:
        self.record_error('Bad Request', 400,
                          'Missing parameter data',
                          {'parameter': 'data'})
    if not params:
        self.record_error('Bad Request', 400,
                          'Empty parameter data',
                          {'parameter': 'data'})
    # TODO: should validate objects inside as well? (or use json schema
    #       validation?)
    # all good so far
    # pull dataset from aekos
    # TODO: get better name here
    title = params[0].get('name', 'ALA import')
    # determine dataset type
    # 1. test if it is a multi species import
    species = set()
    for query in params:
        biocache_url = '{}/occurrences/search'.format(query['url'])
        query = {
            'q': query['query'],
            'pageSize': 0,
            'limit': 2,
            'facets': 'species_guid',
            'fq': 'species_guid:*'  # skip results without species guid
        }
        res = requests.get(biocache_url, params=query)
        res = res.json()
        # FIXME: do we need to treat sandbox downloads differently?
        if res.get('facetResults'):  # do we have some results at all?
            for guid in res['facetResults'][0]['fieldResult']:
                species.add(guid['label'])
    # Check if it is trait data
    isTrait = any([p.get('trait', 0) for p in params])
    if not isTrait and len(species) > 1:
        portal_type = 'org.bccvl.content.multispeciesdataset'
    else:
        portal_type = 'org.bccvl.content.dataset'
        swiftsettings = getUtility(IRegistry).forInterface(ISwiftSettings)
        if swiftsettings.storage_url:
            portal_type = 'org.bccvl.content.remotedataset'
    # create content
    ds = createContent(portal_type, title=title)
    ds.dataSource = 'ala'
    ds.description = u' '.join([title, u' imported from ALA'])
    ds.import_params = params
    ds = addContentToContainer(context, ds)
    md = IBCCVLMetadata(ds)
    if IMultiSpeciesDataset.providedBy(ds):
        md['genre'] = 'DataGenreSpeciesCollection'
        md['categories'] = ['multispecies']
    else:
        if isTrait:
            # Trait dataset
            md['genre'] = 'DataGenreTraits'
            md['categories'] = ['traits']
        else:
            # species dataset
            md['genre'] = 'DataGenreSpeciesOccurrence'
            md['categories'] = ['occurrence']
        # TODO: populate this correctly as well
        md['species'] = [{
            'scientificName': 'qid',
            'taxonID': 'qid'}]
    # FIXME: IStatusMessage should not be in API call
    from Products.statusmessages.interfaces import IStatusMessage
    IStatusMessage(self.request).add('New Dataset created', type='info')
    # start import job
    jt = IExperimentJobTracker(ds)
    status, message = jt.start_job()
    # reindex object to make sure everything is up to date
    ds.reindexObject()
    # FIXME: IStatusMessage should not be in API call
    IStatusMessage(self.request).add(message, type=status)
    # FIXME: API should not return a redirect
    #        201: new resource created ... location may point to resource
    from Products.CMFCore.utils import getToolByName
    portal = getToolByName(self.context, 'portal_url').getPortalObject()
    nexturl = portal[defaults.DATASETS_FOLDER_ID].absolute_url()
    self.request.response.setStatus(201)
    self.request.response.setHeader('Location', nexturl)
    # FIXME: should return a nice json representation of success or error
    return {
        'status': status,
        'message': message,
        'jobid': IJobTracker(ds).get_job().id
    }