class DerivaDownload(object):
    """Orchestrates export ("download") of catalog data, optionally packaged as a BDBag.

    Wraps an ErmrestCatalog and HatracStore pair for a single host and drives a
    configurable pipeline: query processors -> transform processors -> (optional)
    bag creation/archiving -> post processors.
    """
    def __init__(self, server, **kwargs):
        # server: dict with at least 'host'; optional 'protocol', 'catalog_id', 'session'.
        # Recognized kwargs: output_dir, envars, config, credentials, config_file,
        # credential_file, token, oauth2_token, username, password.
        self.server = server
        self.hostname = None
        self.catalog = None
        self.store = None
        self.cancelled = False
        self.output_dir = os.path.abspath(kwargs.get("output_dir", "."))
        self.envars = kwargs.get("envars", dict())
        self.config = kwargs.get("config")
        self.credentials = kwargs.get("credentials", dict())
        config_file = kwargs.get("config_file")
        credential_file = kwargs.get("credential_file")
        self.metadata = dict()
        self.sessions = dict()

        # Log an identifying banner (class, version, interpreter, platform).
        info = "%s v%s [Python %s, %s]" % (
            self.__class__.__name__, get_installed_version(VERSION),
            platform.python_version(), platform.platform(aliased=True))
        logging.info("Initializing downloader: %s" % info)

        if not self.server:
            raise DerivaDownloadConfigurationError("Server not specified!")

        # server variable initialization
        self.hostname = self.server.get('host', '')
        if not self.hostname:
            raise DerivaDownloadConfigurationError("Host not specified!")
        protocol = self.server.get('protocol', 'https')
        self.server_url = protocol + "://" + self.hostname
        catalog_id = self.server.get("catalog_id", "1")
        session_config = self.server.get('session')

        # credential initialization: an explicit credential file wins; otherwise
        # build a credential from token/oauth2_token or username+password if given.
        token = kwargs.get("token")
        oauth2_token = kwargs.get("oauth2_token")
        username = kwargs.get("username")
        password = kwargs.get("password")
        if credential_file:
            self.credentials = get_credential(self.hostname, credential_file)
        elif token or oauth2_token or (username and password):
            self.credentials = format_credential(token=token,
                                                 oauth2_token=oauth2_token,
                                                 username=username,
                                                 password=password)

        # catalog and file store initialization (drop any stale instances first)
        if self.catalog:
            del self.catalog
        self.catalog = ErmrestCatalog(protocol, self.hostname, catalog_id, self.credentials,
                                      session_config=session_config)
        if self.store:
            del self.store
        self.store = HatracStore(protocol, self.hostname, self.credentials,
                                 session_config=session_config)

        # init dcctx cid to a default
        self.set_dcctx_cid(self.__class__.__name__)

        # process config file; wrap any parse/read failure in a configuration error
        if config_file:
            try:
                self.config = read_config(config_file)
            except Exception as e:
                raise DerivaDownloadConfigurationError(e)

    def set_dcctx_cid(self, cid):
        """Set the deriva client-context id on both the catalog and store bindings."""
        assert cid, "A dcctx cid is required"
        if self.catalog:
            self.catalog.dcctx['cid'] = cid
        if self.store:
            self.store.dcctx['cid'] = cid

    def set_config(self, config):
        """Replace the active download configuration dict."""
        self.config = config

    def set_credentials(self, credentials):
        """Install new credentials on the catalog and store bindings and cache them."""
        self.catalog.set_credentials(credentials, self.hostname)
        self.store.set_credentials(credentials, self.hostname)
        self.credentials = credentials

    def download(self, **kwargs):
        """Execute the configured download pipeline.

        Recognized kwargs: identity (pre-validated client identity dict),
        wallet (credential wallet passed through to processors).
        Returns the `outputs` dict produced by the final processor stage.
        Raises DerivaDownloadConfigurationError / DerivaDownloadAuthenticationError.
        """
        if not self.config:
            raise DerivaDownloadConfigurationError("No configuration specified!")
        if self.config.get("catalog") is None:
            raise DerivaDownloadConfigurationError("Catalog configuration error!")

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None

        # Scratch manifest used by processors to record remote file references;
        # uuid suffix avoids collisions between concurrent downloads.
        remote_file_manifest = os.path.abspath(
            ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'),
                     str(uuid.uuid4()), ".json"]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))
        self.envars.update({"hostname": self.hostname})

        # 1. If we don't have a client identity, we need to authenticate
        identity = kwargs.get("identity")
        if not identity:
            try:
                if not self.credentials:
                    self.set_credentials(get_credential(self.hostname))
                logging.info("Validating credentials for host: %s" % self.hostname)
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except HTTPError as he:
                # 404 means no login session exists; proceed anonymously.
                if he.response.status_code == 404:
                    logging.info("No existing login session found for host: %s" % self.hostname)
            except Exception as e:
                raise DerivaDownloadAuthenticationError(
                    "Unable to validate credentials: %s" % format_exception(e))
        wallet = kwargs.get("wallet", {})

        # 2. Check for bagging config and initialize bag related variables
        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = True if bag_config else False
        if create_bag:
            # bag_name may contain {placeholders} resolved from envars
            bag_name = bag_config.get(
                'bag_name',
                ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")])).format(**self.envars)
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get(
                'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
            bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
        if create_bag:
            bdb.ensure_bag_path_exists(bag_path)
            bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
            if bag_ro:
                # Research Object authorship: prefer bag metadata, fall back to identity fields.
                ro_author_name = bag.info.get(
                    "Contact-Name",
                    None if not identity else
                    identity.get('full_name',
                                 identity.get('display_name',
                                              identity.get('id', None))))
                ro_author_orcid = bag.info.get("Contact-Orcid")
                ro_manifest = ro.init_ro_manifest(author_name=ro_author_name,
                                                  author_orcid=ro_author_orcid)
                bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        # 3. Process the set of queries by locating, instantiating, and invoking
        #    the specified processor(s). Each stage's outputs feed the next as inputs.
        outputs = dict()
        base_path = bag_path if bag_path else self.output_dir
        for processor in catalog_config['query_processors']:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                query_processor = find_query_processor(processor_name, processor_type)
                processor = query_processor(self.envars,
                                            inputs=outputs,
                                            bag=create_bag,
                                            catalog=self.catalog,
                                            store=self.store,
                                            base_path=base_path,
                                            processor_params=processor_params,
                                            remote_file_manifest=remote_file_manifest,
                                            ro_manifest=ro_manifest,
                                            ro_author_name=ro_author_name,
                                            ro_author_orcid=ro_author_orcid,
                                            identity=identity,
                                            wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                # Remove the partially-built bag before propagating the failure.
                if create_bag:
                    bdb.cleanup_bag(bag_path)
                raise

        # 4. Execute anything in the transform processing pipeline, if configured
        transform_processors = self.config.get('transform_processors', [])
        if transform_processors:
            for processor in transform_processors:
                processor_name = processor["processor"]
                processor_type = processor.get('processor_type')
                processor_params = processor.get('processor_params')
                try:
                    transform_processor = find_transform_processor(processor_name, processor_type)
                    processor = transform_processor(self.envars,
                                                    inputs=outputs,
                                                    processor_params=processor_params,
                                                    base_path=base_path,
                                                    bag=create_bag,
                                                    ro_manifest=ro_manifest,
                                                    ro_author_name=ro_author_name,
                                                    ro_author_orcid=ro_author_orcid,
                                                    identity=identity,
                                                    wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        # 5. Create the bag, and archive (serialize) if necessary
        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                # Only pass the manifest if it exists and is non-empty.
                bdb.make_bag(bag_path,
                             algs=bag_algorithms,
                             remote_file_manifest=remote_file_manifest if
                             (remote_file_manifest and os.path.getsize(remote_file_manifest) > 0) else None,
                             update=True)
            except Exception as e:
                logging.fatal("Exception while updating bag manifests: %s" % format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The scratch manifest is always deleted, success or failure.
                if remote_file_manifest and os.path.isfile(remote_file_manifest):
                    os.remove(remote_file_manifest)

            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    bdb.cleanup_bag(bag_path)
                    outputs = {os.path.basename(archive): {LOCAL_PATH_KEY: archive}}
                except Exception as e:
                    logging.error("Exception while creating data bag archive: %s" %
                                  format_exception(e))
                    raise
            else:
                outputs = {os.path.basename(bag_path): {LOCAL_PATH_KEY: bag_path}}

        # 6. Execute anything in the post processing pipeline, if configured
        post_processors = self.config.get('post_processors', [])
        if post_processors:
            for processor in post_processors:
                processor_name = processor["processor"]
                processor_type = processor.get('processor_type')
                processor_params = processor.get('processor_params')
                try:
                    post_processor = find_post_processor(processor_name, processor_type)
                    processor = post_processor(self.envars,
                                               inputs=outputs,
                                               processor_params=processor_params,
                                               identity=identity,
                                               wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        return outputs

    def __del__(self):
        # Best-effort cleanup of any cached HTTP sessions.
        for session in self.sessions.values():
            session.close()
class DerivaDownload(object):
    """Legacy catalog download driver: runs configured queries and optionally bags results.

    NOTE(review): this file appears to contain two `DerivaDownload` definitions;
    if both really live in one module, this later one shadows the earlier — confirm
    the chunking/concatenation before merging.
    """
    def __init__(self, server, output_dir=None, kwargs=None, config=None, config_file=None,
                 credentials=None, credential_file=None):
        # server: dict with at least 'host'; optional 'protocol', 'catalog_id', 'session'.
        # kwargs (despite the name) is a dict of template environment variables.
        self.server = server
        self.hostname = None
        self.output_dir = output_dir if output_dir else "."
        self.envars = kwargs if kwargs else dict()
        self.catalog = None
        self.store = None
        self.config = config
        self.cancelled = False
        self.credentials = credentials if credentials else dict()
        self.metadata = dict()
        self.sessions = dict()

        # Log an identifying banner (class, version, interpreter, platform).
        info = "%s v%s [Python %s, %s]" % (
            self.__class__.__name__, VERSION, platform.python_version(),
            platform.platform(aliased=True))
        logging.info("Initializing downloader: %s" % info)

        if not self.server:
            raise RuntimeError("Server not specified!")

        # server variable initialization
        self.hostname = self.server.get('host', '')
        if not self.hostname:
            raise RuntimeError("Host not specified!")
        protocol = self.server.get('protocol', 'https')
        self.server_url = protocol + "://" + self.hostname
        catalog_id = self.server.get("catalog_id", "1")
        session_config = self.server.get('session')

        # credential initialization
        if credential_file:
            self.credentials = get_credential(self.hostname, credential_file)

        # catalog and file store initialization (drop any stale instances first)
        if self.catalog:
            del self.catalog
        self.catalog = ErmrestCatalog(protocol, self.hostname, catalog_id, self.credentials,
                                      session_config=session_config)
        if self.store:
            del self.store
        self.store = HatracStore(protocol, self.hostname, self.credentials,
                                 session_config=session_config)

        # process config file; a missing file is silently ignored (config stays as passed)
        if config_file and os.path.isfile(config_file):
            self.config = read_config(config_file)

    def setConfig(self, config):
        """Replace the active download configuration dict."""
        self.config = config

    def setCredentials(self, credentials):
        """Install new credentials on the catalog and store bindings and cache them."""
        self.catalog.set_credentials(credentials, self.hostname)
        self.store.set_credentials(credentials, self.hostname)
        self.credentials = credentials

    def download(self, identity=None):
        """Run all configured queries; return a list of produced file (or bag/archive) paths.

        identity: optional pre-validated client identity dict; when absent, the
        current login session is validated against the catalog.
        Raises RuntimeError on configuration or authentication problems.
        """
        if not self.config:
            raise RuntimeError("No configuration specified!")
        if self.config.get("catalog") is None:
            raise RuntimeError("Catalog configuration error!")

        # Validate credentials / resolve identity if not supplied by the caller.
        if not identity:
            logging.info("Validating credentials")
            try:
                if not self.credentials:
                    self.setCredentials(get_credential(self.hostname))
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except Exception as e:
                raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None
        # Scratch manifest used by processors to record remote file references.
        remote_file_manifest = os.path.abspath(
            ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'),
                     str(uuid.uuid4()), ".json"]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))

        # Bagging configuration, when present, redirects output into a BDBag.
        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = True if bag_config else False
        if create_bag:
            bag_name = bag_config.get(
                'bag_name', ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get(
                'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
            bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
        if create_bag:
            bdb.ensure_bag_path_exists(bag_path)
            bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
            if bag_ro:
                # Research Object authorship: prefer bag metadata, fall back to identity.
                ro_author_name = bag.info.get(
                    "Contact-Name",
                    identity.get('full_name',
                                 identity.get('display_name',
                                              identity.get('id', None))))
                ro_author_orcid = bag.info.get("Contact-Orcid")
                ro_manifest = ro.init_ro_manifest(author_name=ro_author_name,
                                                  author_orcid=ro_author_orcid)
                bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        # Execute each configured query via its output-format processor.
        file_list = list()
        base_path = bag_path if bag_path else self.output_dir
        for query in catalog_config['queries']:
            query_path = query['query_path']
            output_format = query['output_format']
            output_processor = query.get("output_format_processor")
            format_args = query.get('output_format_params', None)
            output_path = query.get('output_path', '')
            try:
                download_processor = findProcessor(output_format, output_processor)
                processor = download_processor(self.envars,
                                               bag=create_bag,
                                               catalog=self.catalog,
                                               store=self.store,
                                               query=query_path,
                                               base_path=base_path,
                                               sub_path=output_path,
                                               format_args=format_args,
                                               remote_file_manifest=remote_file_manifest,
                                               ro_manifest=ro_manifest,
                                               ro_author_name=ro_author_name,
                                               ro_author_orcid=ro_author_orcid)
                file_list.extend(processor.process())
            except Exception as e:
                logging.error(format_exception(e))
                # Remove the partially-built bag before propagating the failure.
                if create_bag:
                    bdb.cleanup_bag(bag_path)
                raise

        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                # NOTE(review): a zero-byte manifest is passed through here; the newer
                # downloader guards on getsize() > 0 — confirm bdbag tolerates empty files.
                bdb.make_bag(bag_path,
                             algs=bag_algorithms,
                             remote_file_manifest=remote_file_manifest,
                             update=True)
            except Exception as e:
                logging.fatal("Exception while updating bag manifests: %s", format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The scratch manifest is always deleted, success or failure.
                if remote_file_manifest and os.path.isfile(remote_file_manifest):
                    os.remove(remote_file_manifest)
            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    bdb.cleanup_bag(bag_path)
                    return [archive]
                except Exception as e:
                    # BUG FIX: the message previously had no %s placeholder for the
                    # lazy argument, which made logging raise an internal formatting
                    # error and drop the exception detail.
                    logging.error("Exception while creating data bag archive: %s",
                                  format_exception(e))
                    raise
            else:
                return [bag_path]

        return file_list
class MainWindow(QMainWindow):
    """Main window for the synspy analysis launcher.

    Manages login/session state against a deriva catalog, displays a worklist of
    analysis tasks, downloads input files, launches the viewer subprocess, and
    uploads/records analysis results. Long-running work is delegated to
    background task objects that report back via Qt signals.
    """
    # class-level defaults; instance state is assigned in __init__/configure
    config = None
    credential = None
    config_path = None
    store = None
    catalog = None
    identity = None
    attributes = None
    server = None
    tempdir = None
    progress_update_signal = pyqtSignal(str)
    use_3D_viewer = False
    curator_mode = False

    def __init__(self, config_path=None):
        super(MainWindow, self).__init__()
        self.ui = MainWindowUI(self)
        self.configure(config_path)
        self.authWindow = EmbeddedAuthWindow(
            self,
            config=self.config.get("server"),
            cookie_persistence=False,
            authentication_success_callback=self.onLoginSuccess)
        self.getSession()
        # Until a valid session is confirmed, disable actions that need login.
        if not self.identity:
            self.ui.actionLaunch.setEnabled(False)
            self.ui.actionRefresh.setEnabled(False)
            self.ui.actionOptions.setEnabled(False)
            self.ui.actionLogout.setEnabled(False)

    def configure(self, config_path):
        """Load (or create) the launcher config and initialize catalog/store bindings."""
        # configure logging: route log records into the in-window log browser
        self.ui.logTextBrowser.widget.log_update_signal.connect(self.updateLog)
        self.ui.logTextBrowser.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
        logging.getLogger().addHandler(self.ui.logTextBrowser)
        logging.getLogger().setLevel(logging.INFO)

        # configure Ermrest/Hatrac
        if not config_path:
            config_path = os.path.join(os.path.expanduser(
                os.path.normpath("~/.deriva/synapse/synspy-launcher")), "config.json")
        self.config_path = config_path
        config = read_config(self.config_path, create_default=True, default=DEFAULT_CONFIG)
        protocol = config["server"]["protocol"]
        self.server = config["server"]["host"]
        catalog_id = config["server"]["catalog_id"]
        session_config = config.get("session")
        self.catalog = ErmrestCatalog(protocol, self.server, catalog_id, self.credential,
                                      session_config=session_config)
        self.store = HatracStore(protocol, self.server, self.credential,
                                 session_config=session_config)

        # create working dir (tempdir)
        self.tempdir = tempfile.mkdtemp(prefix="synspy_")

        # determine viewer mode
        self.use_3D_viewer = True if config.get("viewer_mode", "2d").lower() == "3d" else False

        # curator mode? default the key to False if unset, then read it back
        curator_mode = config.get("curator_mode")
        if not curator_mode:
            config["curator_mode"] = False
        self.curator_mode = config.get("curator_mode")

        # save config (persists any defaults we just filled in)
        self.config = config
        write_config(self.config_path, self.config)

    def getSession(self):
        """Kick off an async session-validation query; result lands in onSessionResult."""
        qApp.setOverrideCursor(Qt.WaitCursor)
        self.updateStatus("Validating session.")
        queryTask = SessionQueryTask(self.catalog)
        queryTask.status_update_signal.connect(self.onSessionResult)
        queryTask.query()

    def onLoginSuccess(self, **kwargs):
        """Auth-window callback: install the new credential and re-validate the session."""
        self.authWindow.hide()
        self.credential = kwargs["credential"]
        self.catalog.set_credentials(self.credential, self.server)
        self.store.set_credentials(self.credential, self.server)
        self.getSession()

    def enableControls(self):
        """Re-enable UI actions appropriate to the current authentication state."""
        self.ui.actionLaunch.setEnabled(True)
        self.ui.actionRefresh.setEnabled(True)
        self.ui.actionOptions.setEnabled(self.authWindow.authenticated())
        self.ui.actionLogin.setEnabled(not self.authWindow.authenticated())
        self.ui.actionLogout.setEnabled(self.authWindow.authenticated())
        self.ui.actionExit.setEnabled(True)
        self.ui.workList.setEnabled(True)

    def disableControls(self):
        """Disable all UI actions, typically while a background task runs."""
        self.ui.actionLaunch.setEnabled(False)
        self.ui.actionRefresh.setEnabled(False)
        self.ui.actionOptions.setEnabled(False)
        self.ui.actionLogin.setEnabled(False)
        self.ui.actionLogout.setEnabled(False)
        self.ui.actionExit.setEnabled(False)
        self.ui.workList.setEnabled(False)

    def closeEvent(self, event=None):
        """Qt close hook: stop tasks and remove the working directory."""
        self.disableControls()
        self.cancelTasks()
        shutil.rmtree(self.tempdir)
        if event:
            event.accept()

    def cancelTasks(self):
        """Shut down pending requests and pump the event loop until the pool drains."""
        Request.shutdown()
        self.statusBar().showMessage("Waiting for background tasks to terminate...")
        while True:
            qApp.processEvents()
            if QThreadPool.globalInstance().waitForDone(10):
                break
        self.statusBar().showMessage("All background tasks terminated successfully")

    def is_curator(self):
        """Return True when the logged-in user carries the curators group attribute."""
        for attr in self.attributes:
            if attr.get('id') == CURATORS:
                return True
        return False

    def displayWorklist(self, worklist):
        """Populate the worklist table from catalog rows; hide non-displayed columns.

        Non-pending rows are hidden unless a curator is in curator mode.
        """
        keys = ["RID",
                "RCT",
                "Source Image",
                "Classifier",
                "Due Date",
                "Accepted?",
                "Status",
                "URL",
                "Npz URL",
                "ZYX Slice",
                "Segmentation Mode",
                "Segments URL",
                "Segments Filtered URL",
                "Subject",
                ]
        self.ui.workList.clear()
        self.ui.workList.setRowCount(0)
        self.ui.workList.setColumnCount(0)
        # subset of keys actually shown as columns; the rest are hidden but populated
        displayed = ["RID", "RCT", "Segmentation Mode", "Classifier",
                     "Due Date", "Accepted?", "Status"]
        self.ui.workList.setRowCount(len(worklist))
        self.ui.workList.setColumnCount(len(keys))

        # the "mark incomplete" context action is only available to curators in curator mode
        self.ui.workList.removeAction(self.ui.markIncompleteAction)
        if self.is_curator() and self.curator_mode:
            self.ui.workList.addAction(self.ui.markIncompleteAction)

        rows = 0
        for row in worklist:
            value = row.get("Status")
            if not (value == "analysis pending" or value == "analysis in progress") \
                    and not (self.is_curator() and self.curator_mode):
                self.ui.workList.hideRow(rows)
            cols = 0
            for key in keys:
                item = QTableWidgetItem()
                if key == "Classifier":
                    # display name with the raw Classifier id stashed in UserRole data
                    value = "%s (%s)" % (row['user'][0]['Full_Name'],
                                         row['user'][0]['Display_Name'])
                    item.setData(Qt.UserRole, row['Classifier'])
                elif key == "URL" or key == "Subject":
                    # these come from the joined source_image record
                    value = row["source_image"][0].get(key)
                else:
                    value = row.get(key)
                if isinstance(value, bool):
                    value = str(value)
                if isinstance(value, str) and key == 'RCT':
                    value = value.replace('T', ' ')[0:19]  # drop fractional seconds and TZ
                if isinstance(value, str):
                    item.setText(value)
                    item.setToolTip(value)
                self.ui.workList.setItem(rows, cols, item)
                cols += 1
            rows += 1

        cols = 0
        for key in keys:
            if key not in displayed:
                self.ui.workList.hideColumn(cols)
            cols += 1

        self.ui.workList.setHorizontalHeaderLabels(keys)  # add header names
        self.ui.workList.horizontalHeader().setDefaultAlignment(Qt.AlignLeft)  # set alignment
        for col in range(len(displayed)):
            self.ui.workList.resizeColumnToContents(col)
        self.ui.workList.sortByColumn(2, Qt.DescendingOrder)

    def getCacheDir(self):
        """Return the configured cache directory, creating it if needed.

        Falls back to the current working directory if creation fails.
        """
        cwd = os.getcwd()
        cache_dir = os.path.expanduser(self.config.get("cache_dir", cwd))
        if not os.path.isdir(cache_dir):
            try:
                os.makedirs(cache_dir)
            except OSError as error:
                if error.errno != errno.EEXIST:
                    logging.error(format_exception(error))
                    cache_dir = cwd
        return cache_dir

    def downloadCallback(self, **kwargs):
        """Progress callback for file downloads; forwards status via signal."""
        status = kwargs.get("progress")
        if status:
            self.progress_update_signal.emit(status)
        return True

    def uploadCallback(self, **kwargs):
        """Progress callback for file uploads; formats percent-complete status text."""
        completed = kwargs.get("completed")
        total = kwargs.get("total")
        file_path = kwargs.get("file_path")
        if completed and total:
            file_path = " [%s]" % os.path.basename(file_path) if file_path else ""
            status = "Uploading file%s: %d%% complete" % (
                file_path, round(((completed / total) % 100) * 100))
        else:
            summary = kwargs.get("summary", "")
            file_path = "Uploaded file: [%s] " % os.path.basename(file_path) if file_path else ""
            status = file_path  # + summary
        if status:
            self.progress_update_signal.emit(status)
        return True

    def serverProblemMessageBox(self, text, detail):
        """Warn about a server-side data problem and offer to drop the worklist row."""
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Warning)
        msg.setWindowTitle("Confirm Action")
        msg.setText(text)
        msg.setInformativeText(
            detail + "\n\nWould you like to remove this item from the current worklist?")
        msg.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
        ret = msg.exec_()
        if ret == QMessageBox.No:
            return
        else:
            row = self.ui.workList.getCurrentTableRow()
            self.ui.workList.removeRow(row)
        return

    def retrieveFiles(self):
        # if there is an existing segments file, download it first, otherwise just
        # initiate the input file download
        seg_mode = self.ui.workList.getCurrentTableItemTextByName("Segmentation Mode")
        segments_url = self.ui.workList.getCurrentTableItemTextByName("Segments Filtered URL")
        if segments_url:
            segments_filename = 'ROI_%s_%s_only.csv' % (
                self.ui.workList.getCurrentTableItemTextByName("RID"), seg_mode)
            segments_destfile = os.path.abspath(os.path.join(self.tempdir, segments_filename))
            self.updateStatus("Downloading file: [%s]" % segments_destfile)
            downloadTask = FileRetrieveTask(self.store)
            downloadTask.status_update_signal.connect(self.onRetrieveAnalysisFileResult)
            self.progress_update_signal.connect(self.updateProgress)
            downloadTask.retrieve(segments_url,
                                  destfile=segments_destfile,
                                  progress_callback=self.downloadCallback)
        else:
            self.retrieveInputFile()

    def retrieveInputFile(self):
        """Download the primary input (OME-TIFF for 3D, NPZ for 2D) unless cached."""
        # get the main TIFF file for analysis if not already cached
        if self.use_3D_viewer:
            url = self.ui.workList.getCurrentTableItemTextByName("URL")
            filename = 'Image_%s.ome.tiff' % self.ui.workList.getCurrentTableItemTextByName(
                "Source Image")
        else:
            url = self.ui.workList.getCurrentTableItemTextByName("Npz URL")
            filename = 'ROI_%s.npz' % self.ui.workList.getCurrentTableItemTextByName("RID")
        destfile = os.path.abspath(os.path.join(self.getCacheDir(), filename))
        # 2D mode cannot proceed without an NPZ file on the server
        if not url and not self.use_3D_viewer:
            self.resetUI("Unable to launch 2D viewer due to missing NPZ file for %s." %
                         self.ui.workList.getCurrentTableItemTextByName("RID"))
            self.serverProblemMessageBox(
                "2D viewer requires NPZ data to be present!",
                "The launcher is currently configured to execute the 2D viewer, "
                "which requires NPZ files for input. " +
                "No NPZ file could be found on the server for this task.")
            return
        if not os.path.isfile(destfile):
            self.updateStatus("Downloading file: [%s]" % destfile)
            downloadTask = FileRetrieveTask(self.store)
            downloadTask.status_update_signal.connect(self.onRetrieveInputFileResult)
            self.progress_update_signal.connect(self.updateProgress)
            downloadTask.retrieve(url,
                                  destfile=destfile,
                                  progress_callback=self.downloadCallback)
        else:
            # already cached: jump straight to the success handler
            self.onRetrieveInputFileResult(
                True, "The file [%s] already exists" % destfile, None, destfile)

    def getSubprocessPath(self):
        """Return the path to the viewer executable for the active viewer mode."""
        executable = "synspy-viewer" if self.use_3D_viewer else "synspy-viewer2d"
        base_path = None
        return os.path.normpath(resource_path(executable, base_path))

    def executeViewer(self, file_path):
        """Launch the viewer subprocess with task parameters passed via environment."""
        self.updateStatus("Executing viewer...")
        env = os.environ
        env["SYNSPY_AUTO_DUMP_LOAD"] = "true"
        env["DUMP_PREFIX"] = "./ROI_%s" % self.ui.workList.getCurrentTableItemTextByName("RID")
        env["ZYX_SLICE"] = self.ui.workList.getCurrentTableItemTextByName("ZYX Slice")
        env["ZYX_IMAGE_GRID"] = "0.4, 0.26, 0.26"
        env["SYNSPY_DETECT_NUCLEI"] = str(
            "nucleic" == self.ui.workList.getCurrentTableItemTextByName(
                "Segmentation Mode")).lower()
        output_path = os.path.join(os.path.dirname(self.config_path), "viewer.log")
        # recover the raw Classifier id stashed in the table item's UserRole data
        classifier = self.ui.workList.getTableItemByName(
            self.ui.workList.getCurrentTableRow(), "Classifier").data(Qt.UserRole)
        viewerTask = ViewerTask(self.getSubprocessPath(), self.identity == classifier,
                                proc_output_path=output_path)
        viewerTask.status_update_signal.connect(self.onSubprocessExecuteResult)
        viewerTask.run(file_path, self.tempdir, env)

    def uploadAnalysisResult(self, update_state):
        """Upload the viewer's output CSV to hatrac, then (via signal) update the catalog."""
        qApp.setOverrideCursor(Qt.WaitCursor)
        # generate hatrac upload params
        basename = "ROI_%s" % self.ui.workList.getCurrentTableItemTextByName("RID")
        match = r"%s_.*\.csv$" % basename
        output_files = [f for f in os.listdir(self.tempdir)
                        if os.path.isfile(os.path.join(self.tempdir, f))
                        and re.match(match, f)]
        if not output_files:
            self.resetUI("Could not locate output file from viewer subprocess -- aborting.")
            return
        seg_mode = self.ui.workList.getCurrentTableItemTextByName("Segmentation Mode")
        if seg_mode == "synaptic":
            extension = "_synaptic_only.csv"
        elif seg_mode == "nucleic":
            extension = "_nucleic_only.csv"
        else:
            self.updateStatus("Unknown segmentation mode \"%s\" -- aborting." % seg_mode)
            return
        file_name = basename + extension
        hatrac_path = HATRAC_UPDATE_URL_TEMPLATE % \
            (self.ui.workList.getCurrentTableItemTextByName("Subject"), file_name)
        file_path = os.path.abspath(os.path.join(self.tempdir, file_name))

        # upload to object store
        self.updateStatus("Uploading file %s to server..." % file_name)
        self.progress_update_signal.connect(self.updateProgress)
        uploadTask = FileUploadTask(self.store)
        uploadTask.status_update_signal.connect(self.onUploadFileResult)
        uploadTask.upload(hatrac_path, file_path, update_state, callback=self.uploadCallback)

    def markIncomplete(self):
        """Curator action: reset the selected task's status to 'analysis in progress'."""
        RID = self.ui.workList.getCurrentTableItemTextByName("RID")
        body = [{"RID": RID, "Status": "analysis in progress"}]
        self.updateStatus("Updating task status for %s..." % RID)
        updateTask = CatalogUpdateTask(self.catalog)
        updateTask.status_update_signal.connect(self.onCatalogUpdateResult)
        updateTask.update(WORKLIST_STATUS_UPDATE, json=body)

    @pyqtSlot()
    def taskTriggered(self):
        # clear the log pane and lock the UI while a task runs
        self.ui.logTextBrowser.widget.clear()
        self.disableControls()

    @pyqtSlot(str)
    def updateProgress(self, status):
        self.statusBar().showMessage(status)

    @pyqtSlot(str, str)
    def updateStatus(self, status, detail=None):
        # log with optional detail; status bar only shows the summary
        logging.info(status + ((": %s" % detail) if detail else ""))
        self.statusBar().showMessage(status)

    @pyqtSlot(str, str)
    def resetUI(self, status, detail=None):
        qApp.restoreOverrideCursor()
        self.updateStatus(status, detail)
        self.enableControls()

    @pyqtSlot(str)
    def updateLog(self, text):
        self.ui.logTextBrowser.widget.appendPlainText(text)

    @pyqtSlot(bool, str, str, object)
    def onSessionResult(self, success, status, detail, result):
        """Session-validation result: record identity/attributes and refresh the worklist."""
        qApp.restoreOverrideCursor()
        if success:
            self.identity = result["client"]["id"]
            self.attributes = result["attributes"]
            display_name = result["client"]["full_name"]
            self.setWindowTitle("%s (%s - %s)" %
                                (self.windowTitle(), self.server, display_name))
            self.ui.actionLaunch.setEnabled(True)
            self.ui.actionLogout.setEnabled(True)
            self.ui.actionLogin.setEnabled(False)
            # non-curators cannot remain in curator mode
            if not self.is_curator():
                self.curator_mode = self.config["curator_mode"] = False
            self.on_actionRefresh_triggered()
        else:
            self.updateStatus("Login required.")

    @pyqtSlot()
    def on_actionLaunch_triggered(self):
        self.disableControls()
        qApp.setOverrideCursor(Qt.WaitCursor)
        # create working dir (tempdir): recreate fresh for every launch
        if self.tempdir:
            shutil.rmtree(self.tempdir)
        self.tempdir = tempfile.mkdtemp(prefix="synspy_")
        self.retrieveFiles()

    @pyqtSlot(bool, str, str, str)
    def onRetrieveAnalysisFileResult(self, success, status, detail, file_path):
        """After segments-file download: on success continue with the input file."""
        if not success:
            # remove any partial download before reporting the failure
            try:
                os.remove(file_path)
            except Exception as e:
                logging.warning("Unable to remove file [%s]: %s" %
                                (file_path, format_exception(e)))
            self.resetUI(status, detail)
            self.serverProblemMessageBox(
                "Unable to download required input file",
                "The in-progress analysis file was not downloaded successfully.")
            return
        self.retrieveInputFile()

    @pyqtSlot(bool, str, str, str)
    def onRetrieveInputFileResult(self, success, status, detail, file_path):
        """After input-file download: on success launch the viewer."""
        if not success:
            # remove any partial download before reporting the failure
            try:
                os.remove(file_path)
            except Exception as e:
                logging.warning("Unable to remove file [%s]: %s" %
                                (file_path, format_exception(e)))
            self.resetUI(status, detail)
            self.serverProblemMessageBox(
                "Unable to download required input file",
                "The image input file was not downloaded successfully.")
            return
        self.executeViewer(file_path)

    @pyqtSlot(bool, str, str, bool)
    def onSubprocessExecuteResult(self, success, status, detail, is_owner):
        """After the viewer exits: prompt the owning user to save/complete/discard."""
        qApp.restoreOverrideCursor()
        if not success:
            self.resetUI(status, detail)
            return
        # only the task owner (and not a curator in curator mode) may upload results
        if not is_owner or self.curator_mode:
            self.resetUI(status, detail)
            return

        # prompt for save/complete/discard
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Information)
        msg.setWindowTitle("Confirm Action")
        msg.setText("How would you like to proceed?")
        msg.setInformativeText(
            "Select \"Save Progress\" to save your progress and upload the output to the server.\n\n"
            "Select \"Complete\" to upload the output to the server and mark this task as completed.\n\n"
            "Select \"Discard\" to abort the process and leave the task state unchanged.")
        saveButton = msg.addButton("Save Progress", QMessageBox.ActionRole)
        completeButton = msg.addButton("Complete", QMessageBox.ActionRole)
        discardButton = msg.addButton("Discard", QMessageBox.RejectRole)
        msg.exec_()
        if msg.clickedButton() == discardButton:
            self.resetUI("Aborted.")
            return
        update_state = None
        if msg.clickedButton() == saveButton:
            update_state = ("incomplete", "analysis in progress")
        elif msg.clickedButton() == completeButton:
            update_state = ("complete", "analysis complete")
        self.uploadAnalysisResult(update_state)

    @pyqtSlot(bool, str, str, object)
    def onUploadFileResult(self, success, status, detail, result):
        """After hatrac upload: record the new object URL and status in the catalog."""
        if not success:
            self.resetUI(status, detail)
            self.serverProblemMessageBox(
                "Unable to upload required file(s)",
                "One or more required files were not uploaded successfully.")
            return
        state = result[0]
        RID = self.ui.workList.getCurrentTableItemTextByName("RID")
        body = [{"RID": RID, "Segments Filtered URL": result[1], "Status": state[1]}]
        self.updateStatus("Updating task status for %s..." % RID)
        updateTask = CatalogUpdateTask(self.catalog)
        updateTask.status_update_signal.connect(self.onCatalogUpdateResult)
        updateTask.update(WORKLIST_UPDATE, json=body)

    @pyqtSlot(bool, str, str, object)
    def onCatalogUpdateResult(self, success, status, detail, result):
        if not success:
            self.resetUI(status, detail)
            self.serverProblemMessageBox(
                "Unable to update catalog data",
                "The catalog state was not updated successfully.")
            return
        qApp.restoreOverrideCursor()
        self.on_actionRefresh_triggered()

    @pyqtSlot()
    def on_actionRefresh_triggered(self):
        """Re-query the worklist; curators in curator mode see the full list."""
        if not self.identity:
            self.updateStatus("Unable to get worklist -- not logged in.")
            return
        qApp.setOverrideCursor(Qt.WaitCursor)
        self.disableControls()
        self.updateStatus("Refreshing worklist...")
        queryTask = CatalogQueryTask(self.catalog)
        queryTask.status_update_signal.connect(self.onRefreshResult)
        if self.is_curator() and self.curator_mode:
            queryTask.query(WORKLIST_CURATOR_QUERY)
        else:
            queryTask.query(WORKLIST_QUERY % urlquote(self.identity, ""))

    @pyqtSlot(bool, str, str, object)
    def onRefreshResult(self, success, status, detail, result):
        if success:
            self.displayWorklist(result)
            self.resetUI("Ready.")
        else:
            self.resetUI(status, detail)
        # Launch is only meaningful with a non-empty worklist and a logged-in user.
        if (self.ui.workList.rowCount() > 0) and self.identity:
            self.ui.actionLaunch.setEnabled(True)
        else:
            self.ui.actionLaunch.setEnabled(False)

    @pyqtSlot()
    def on_actionLogin_triggered(self):
        self.authWindow.show()
        self.authWindow.login()

    @pyqtSlot()
    def on_actionLogout_triggered(self):
        # drop session state and restore the logged-out UI
        self.authWindow.logout()
        self.setWindowTitle("%s %s" % (self.ui.title, synspy_version))
        self.ui.workList.clearContents()
        self.ui.workList.setRowCount(0)
        self.identity = None
        self.ui.actionLaunch.setEnabled(False)
        self.ui.actionLogout.setEnabled(False)
        self.ui.actionLogin.setEnabled(True)

    @pyqtSlot()
    def on_actionHelp_triggered(self):
        pass

    @pyqtSlot()
    def on_actionOptions_triggered(self):
        OptionsDialog.getOptions(self)

    @pyqtSlot()
    def on_actionExit_triggered(self):
        self.closeEvent()
        QCoreApplication.quit()
class DerivaUpload(object):
    """
    Base class for upload tasks. Encapsulates a catalog instance and a hatrac store
    instance and provides some common and reusable functions.

    This class is not intended to be instantiated directly, but rather extended by a
    specific implementation.
    """
    DefaultConfigFileName = "config.json"
    DefaultServerListFileName = "servers.json"
    DefaultTransferStateFileName = "transfers.json"

    def __init__(self, config_file=None, credential_file=None, server=None):
        """
        :param config_file: optional path overriding the deployed config file location
        :param credential_file: optional path overriding the default credential store
        :param server: optional server dict; falls back to getDefaultServer()
        """
        self.server_url = None
        self.catalog = None
        self.store = None
        self.config = None
        self.credentials = None
        self.asset_mappings = None
        self.transfer_state = dict()
        self.transfer_state_fp = None
        self.cancelled = False
        self.metadata = dict()
        self.file_list = OrderedDict()
        self.file_status = OrderedDict()
        self.skipped_files = set()
        self.override_config_file = config_file
        self.override_credential_file = credential_file
        self.server = self.getDefaultServer() if not server else server
        self.initialize()

    def __del__(self):
        # Best-effort flush/close of the transfer state file on GC.
        self.cleanupTransferState()

    def initialize(self, cleanup=False):
        """(Re)initialize catalog/store handles, transfer state, and configuration.

        :param cleanup: when True, also invalidate the current configuration and
            credentials before re-initializing; otherwise only clear internal state.
        """
        info = "%s v%s [Python %s, %s]" % (self.__class__.__name__,
                                           VERSION,
                                           platform.python_version(),
                                           platform.platform(aliased=True))
        logging.info("Initializing uploader: %s" % info)

        # cleanup invalidates the current configuration and credentials in addition to clearing internal state
        if cleanup:
            self.cleanup()
        # reset just clears the internal state
        else:
            self.reset()

        if not self.server:
            logging.warning(
                "A server was not specified and an internal default has not been set."
            )
            return

        # server variable initialization
        protocol = self.server.get('protocol', 'https')
        host = self.server.get('host', '')
        self.server_url = protocol + "://" + host
        catalog_id = self.server.get("catalog_id", "1")
        session_config = self.server.get('session')

        # overridden credential initialization
        if self.override_credential_file:
            # FIX: was get_credential(host, self.override_config_file) -- the
            # config file path was being passed where the credential file path
            # belongs (cf. DerivaDownload, which uses its credential_file here).
            self.credentials = get_credential(host, self.override_credential_file)

        # catalog and file store initialization
        if self.catalog:
            del self.catalog
        self.catalog = ErmrestCatalog(protocol,
                                      host,
                                      catalog_id,
                                      self.credentials,
                                      session_config=session_config)
        if self.store:
            del self.store
        self.store = HatracStore(protocol,
                                 host,
                                 self.credentials,
                                 session_config=session_config)

        # transfer state initialization
        self.loadTransferState()

        # Configuration initialization - this is a bit complex because we allow for:
        #   1. Run-time overriding of the config file location.
        #   2. Sub-classes of this class to bundle their own default configuration
        #      files in an arbitrary location.
        #   3. The updating of already deployed configuration files if bundled
        #      internal defaults are newer.
        config_file = self.override_config_file if self.override_config_file else None
        # 1. If we don't already have a valid (i.e., overridden) path to a config file...
        if not (config_file and os.path.isfile(config_file)):
            # 2. Get the currently deployed config file path, which could possibly be overridden by subclass
            config_file = self.getDeployedConfigFilePath()
            # 3. If the deployed default path is not valid, OR, it is valid AND is older than the bundled default
            if (not (config_file and os.path.isfile(config_file))
                    or self.isFileNewer(self.getDefaultConfigFilePath(),
                                        self.getDeployedConfigFilePath())):
                # 4. If we can locate a bundled default config file,
                if os.path.isfile(self.getDefaultConfigFilePath()):
                    # 4.1 Copy the bundled default config file to the deployment-specific config path
                    copy_config(self.getDefaultConfigFilePath(), config_file)
                else:
                    # 4.2 Otherwise, fallback to writing a failsafe default based on internal hardcoded settings
                    write_config(config_file, DefaultConfig)
        # 5. Finally, read the resolved configuration file into a config object
        self._update_internal_config(read_config(config_file))

    def _update_internal_config(self, config):
        """This updates the internal state of the uploader based on the config."""
        self.config = config
        # uploader initialization from configuration
        self.asset_mappings = self.config.get('asset_mappings', [])
        mu.add_types(self.config.get('mime_overrides'))

    def cancel(self):
        """Request cancellation of the current upload run (checked cooperatively)."""
        self.cancelled = True

    def reset(self):
        """Clear per-run state (metadata, file lists, statuses) but keep config/credentials."""
        self.metadata.clear()
        self.file_list.clear()
        self.file_status.clear()
        self.skipped_files.clear()
        self.cancelled = False

    def cleanup(self):
        """Reset internal state and additionally drop configuration and credentials."""
        self.reset()
        self.config = None
        self.credentials = None
        self.cleanupTransferState()

    def setServer(self, server):
        """Switch to a different server, re-initializing (with cleanup if it changed)."""
        cleanup = self.server != server
        self.server = server
        self.initialize(cleanup)

    def setCredentials(self, credentials):
        """Install new credentials on both the catalog and the hatrac store."""
        host = self.server['host']
        self.credentials = credentials
        self.catalog.set_credentials(self.credentials, host)
        self.store.set_credentials(self.credentials, host)

    @classmethod
    def getDefaultServer(cls):
        """Return the server marked "default" in the server list, else the first, else {}."""
        servers = cls.getServers()
        for server in servers:
            # keys are matched case-insensitively
            lower = {k.lower(): v for k, v in server.items()}
            if lower.get("default", False):
                return server
        return servers[0] if len(servers) else {}

    @classmethod
    def getServers(cls):
        """This method must be implemented by subclasses."""
        raise NotImplementedError(
            "This method must be implemented by a subclass.")

    @classmethod
    def getVersion(cls):
        """This method must be implemented by subclasses."""
        raise NotImplementedError(
            "This method must be implemented by a subclass.")

    @classmethod
    def getConfigPath(cls):
        """This method must be implemented by subclasses."""
        raise NotImplementedError(
            "This method must be implemented by a subclass.")

    @classmethod
    def getDeployedConfigPath(cls):
        """Return the normalized, user-expanded deployment config directory."""
        return os.path.expanduser(os.path.normpath(cls.getConfigPath()))

    def getVersionCompatibility(self):
        """Return the "version_compatibility" spec from config (empty list if absent)."""
        return self.config.get("version_compatibility", list())

    def isVersionCompatible(self):
        """Check this uploader's version against the config's compatibility spec.

        An empty spec means "compatible with everything".
        """
        compatibility = self.getVersionCompatibility()
        if len(compatibility) > 0:
            return vu.is_compatible(self.getVersion(), compatibility)
        else:
            return True

    @classmethod
    def getFileDisplayName(cls, file_path, asset_mapping=None):
        """Return a user-facing name for a file (base name by default)."""
        return os.path.basename(file_path)

    @staticmethod
    def isFileNewer(src, dst):
        """Return True if src exists, dst exists, and src's mtime is newer than dst's."""
        if not (os.path.isfile(src) and os.path.isfile(dst)):
            return False

        # This comparison wont work with PyInstaller single-file bundles because the bundle
        # is extracted to a temp dir and every timestamp for every file in the bundle is
        # reset to the bundle extraction/creation time.
        if getattr(sys, 'frozen', False):
            prefix = os.path.sep + "_MEI"
            if prefix in src:
                return False

        src_mtime = os.path.getmtime(os.path.abspath(src))
        dst_mtime = os.path.getmtime(os.path.abspath(dst))
        return src_mtime > dst_mtime

    @staticmethod
    def getFileSize(file_path):
        return os.path.getsize(file_path)

    @staticmethod
    def guessContentType(file_path):
        return mu.guess_content_type(file_path)

    @staticmethod
    def getFileHashes(file_path, hashes=frozenset(['md5'])):
        return hu.compute_file_hashes(file_path, hashes)

    @staticmethod
    def getCatalogTable(asset_mapping, metadata_dict=None):
        """Resolve the target "schema:table" for an asset.

        Prefers the asset_mapping's "target_table" pair; falls back to
        case-insensitive "schema"/"table" keys in metadata_dict.
        :raises ValueError: if neither source yields both names.
        """
        schema_name, table_name = asset_mapping.get('target_table',
                                                    [None, None])
        if not (schema_name and table_name):
            metadata_dict_lower = {
                k.lower(): v
                for k, v in metadata_dict.items()
            }
            schema_name = metadata_dict_lower.get("schema")
            table_name = metadata_dict_lower.get("table")
        if not (schema_name and table_name):
            raise ValueError(
                "Unable to determine target catalog table for asset type.")
        return '%s:%s' % (urlquote(schema_name), urlquote(table_name))

    @staticmethod
    def interpolateDict(src, dst, allowNone=False):
        """Format every value in dst using src as the substitution namespace.

        Values whose template keys are missing are left as-is; values that still
        look like unresolved templates ("{...}") become None. None-valued entries
        are dropped from the result unless allowNone is True.

        NOTE(review): this intentionally prunes None-valued keys from *src* in
        place (callers pass self.metadata and later formatting relies on the
        pruned state), so the mutation is preserved here.
        """
        if not (isinstance(src, dict) and isinstance(dst, dict)):
            raise ValueError(
                "Invalid input parameter type(s): (src = %s, dst = %s), expected (dict, dict)"
                % (type(src).__name__, type(dst).__name__))

        dst = dst.copy()
        # prune None values from the src, we don't want those to be replaced with the string 'None' in the dest
        empty = [k for k, v in src.items() if v is None]
        for k in empty:
            del src[k]
        # perform the string replacement for the values in the destination dict
        for k, v in dst.items():
            try:
                value = v.format(**src)
            except KeyError:
                value = v
            if value:
                # a value that still looks like "{template}" never resolved
                if value.startswith('{') and value.endswith('}'):
                    value = None
            dst.update({k: value})
        # remove all None valued entries in the dest, if disallowed
        if not allowNone:
            empty = [k for k, v in dst.items() if v is None]
            for k in empty:
                del dst[k]
        return dst

    @staticmethod
    def pruneDict(src, dst, stringify=True):
        """Return a copy of dst whose values are replaced by src's values for the same keys.

        Missing keys become None; non-None values are stringified unless
        stringify is False.
        """
        dst = dst.copy()
        for k in dst.keys():
            value = src.get(k)
            dst[k] = str(value) if (stringify and value is not None) else value
        return dst

    def getCurrentConfigFilePath(self):
        """Return the overridden config path if set, else the deployed one."""
        return self.override_config_file if self.override_config_file else \
            self.getDeployedConfigFilePath()

    def getDefaultConfigFilePath(self):
        """Return the path of the bundled default config file."""
        return os.path.normpath(
            resource_path(os.path.join("conf", self.DefaultConfigFileName)))

    def getDeployedConfigFilePath(self):
        """Return the per-host deployed config file path."""
        return os.path.join(self.getDeployedConfigPath(),
                            self.server.get('host', ''),
                            self.DefaultConfigFileName)

    def getDeployedTransferStateFilePath(self):
        """Return the per-host transfer (checkpoint) state file path."""
        return os.path.join(self.getDeployedConfigPath(),
                            self.server.get('host', ''),
                            self.DefaultTransferStateFileName)

    def getRemoteConfig(self):
        """Fetch the bulk-upload annotation object from the remote catalog."""
        catalog_config = CatalogConfig.fromcatalog(self.catalog)
        return catalog_config.annotation_obj(
            "tag:isrd.isi.edu,2017:bulk-upload")

    def getUpdatedConfig(self):
        """Compare the deployed config against the catalog's bulk-upload annotation.

        If the remote config differs (by md5 of its canonical JSON form), load it
        and return it; returns None when up-to-date or when the check is skipped.
        """
        # if we are using an overridden config file, skip the update check
        if self.override_config_file:
            return

        logging.info("Checking for updated configuration...")
        remote_config = self.getRemoteConfig()
        if not remote_config:
            logging.info(
                "Remote configuration not present, using default local configuration file."
            )
            return

        deployed_config_file_path = self.getDeployedConfigFilePath()
        if os.path.isfile(deployed_config_file_path):
            current_md5 = hu.compute_file_hashes(deployed_config_file_path,
                                                 hashes=['md5'])['md5'][0]
        else:
            logging.info("Local config not found.")
            current_md5 = None

        # serialize the remote config deterministically (sorted keys) so the
        # md5 comparison is stable across runs
        tempdir = tempfile.mkdtemp(prefix="deriva_upload_")
        if os.path.exists(tempdir):
            updated_config_path = os.path.abspath(
                os.path.join(tempdir, DerivaUpload.DefaultConfigFileName))
            with io.open(updated_config_path,
                         'w',
                         newline='\n',
                         encoding='utf-8') as config:
                config.write(
                    json.dumps(remote_config,
                               ensure_ascii=False,
                               sort_keys=True,
                               separators=(',', ': '),
                               indent=2))
            new_md5 = hu.compute_file_hashes(updated_config_path,
                                             hashes=['md5'])['md5'][0]
            if current_md5 != new_md5:
                logging.info("Updated configuration found.")
                config = read_config(updated_config_path)
                self._update_internal_config(config)
            else:
                logging.info("Configuration is up-to-date.")
                config = None
            shutil.rmtree(tempdir, ignore_errors=True)
            return config

    def getFileStatusAsArray(self):
        """Flatten self.file_status into a list of {"File": path, ...status} dicts."""
        result = list()
        for key in self.file_status.keys():
            item = {"File": key}
            item.update(self.file_status[key])
            result.append(item)
        return result

    def validateFile(self, root, path, name):
        """Return (asset_group, asset_mapping, groupdict, file_path) for a file,
        or None if no asset mapping matches it."""
        file_path = os.path.normpath(os.path.join(path, name))
        asset_group, asset_mapping, groupdict = self.getAssetMapping(
            file_path)
        if not asset_mapping:
            return None
        return asset_group, asset_mapping, groupdict, file_path

    def scanDirectory(self, root, abort_on_invalid_input=False):
        """Recursively scan root, matching files to asset mappings.

        Populates self.file_list (grouped by asset mapping index) and
        self.file_status (Pending, or Paused if checkpoint state exists).
        :param root: directory to scan
        :param abort_on_invalid_input: raise ValueError on the first unmatched file
        :raises ValueError: if root is not a directory, or on invalid input when requested
        """
        root = os.path.abspath(root)
        if not os.path.isdir(root):
            raise ValueError("Invalid directory specified: [%s]" % root)

        logging.info("Scanning files in directory [%s]..." % root)
        file_list = OrderedDict()
        for path, dirs, files in walk(root):
            for file_name in files:
                file_path = os.path.normpath(os.path.join(path, file_name))
                file_entry = self.validateFile(root, path, file_name)
                if not file_entry:
                    logging.info(
                        "Skipping file: [%s] -- Invalid file type or directory location."
                        % file_path)
                    self.skipped_files.add(file_path)
                    if abort_on_invalid_input:
                        raise ValueError("Invalid input detected, aborting.")
                else:
                    asset_group = file_entry[0]
                    group_list = file_list.get(asset_group, [])
                    group_list.append(file_entry)
                    file_list[asset_group] = group_list

        # make sure that file entries in both self.file_list and self.file_status are ordered by the declared order of
        # the asset_mapping for the file
        for group in sorted(file_list.keys()):
            self.file_list[group] = file_list[group]
            for file_entry in file_list[group]:
                file_path = file_entry[3]
                logging.info("Including file: [%s]." % file_path)
                status = self.getTransferStateStatus(file_path)
                if status:
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Paused, status)._asdict()
                else:
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Pending, "Pending")._asdict()

    def getAssetMapping(self, file_path):
        """Find the first asset mapping whose dir/ext/file patterns all match file_path.

        :return: (mapping_index, mapping, merged_groupdict) or (None, None, None)
        """
        asset_group = -1
        for asset_type in self.asset_mappings:
            asset_group += 1
            groupdict = dict()
            dir_pattern = asset_type.get('dir_pattern', '')
            ext_pattern = asset_type.get('ext_pattern', '')
            file_pattern = asset_type.get('file_pattern', '')
            # patterns are written against forward-slash paths
            path = file_path.replace("\\", "/")
            if dir_pattern:
                match = re.search(dir_pattern, path)
                if not match:
                    logging.debug(
                        "The dir_pattern \"%s\" failed to match the input path [%s]"
                        % (dir_pattern, path))
                    continue
                groupdict.update(match.groupdict())
            if ext_pattern:
                match = re.search(ext_pattern, path, re.IGNORECASE)
                if not match:
                    logging.debug(
                        "The ext_pattern \"%s\" failed to match the input path [%s]"
                        % (ext_pattern, path))
                    continue
                groupdict.update(match.groupdict())
            if file_pattern:
                match = re.search(file_pattern, path)
                if not match:
                    logging.debug(
                        "The file_pattern \"%s\" failed to match the input path [%s]"
                        % (file_pattern, path))
                    continue
                groupdict.update(match.groupdict())
            return asset_group, asset_type, groupdict

        return None, None, None

    def uploadFiles(self, status_callback=None, file_callback=None):
        """Upload every scanned file, tracking per-file state.

        :param status_callback: called after each file's state transition
        :param file_callback: forwarded to the per-file transfer for progress
        :raises RuntimeError: if any file ends in Failed or Timeout state
        """
        for group, assets in self.file_list.items():
            for asset_group_num, asset_mapping, groupdict, file_path in assets:
                if self.cancelled:
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Cancelled, "Cancelled by user")._asdict()
                    continue
                try:
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Running, "In-progress")._asdict()
                    if status_callback:
                        status_callback()
                    self.uploadFile(file_path, asset_mapping, groupdict,
                                    file_callback)
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Success, "Complete")._asdict()
                except HatracJobPaused:
                    status = self.getTransferStateStatus(file_path)
                    if status:
                        self.file_status[file_path] = FileUploadState(
                            UploadState.Paused,
                            "Paused: %s" % status)._asdict()
                    # keep checkpoint state so the job can be resumed
                    continue
                except HatracJobTimeout:
                    status = self.getTransferStateStatus(file_path)
                    if status:
                        self.file_status[file_path] = FileUploadState(
                            UploadState.Timeout, "Timeout")._asdict()
                    continue
                except HatracJobAborted:
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Aborted, "Aborted by user")._asdict()
                except Exception:
                    # FIX: was a bare "except:" which also swallowed
                    # KeyboardInterrupt/SystemExit into a Failed status
                    (etype, value, traceback) = sys.exc_info()
                    self.file_status[file_path] = FileUploadState(
                        UploadState.Failed, format_exception(value))._asdict()
                # success/abort/failure all invalidate the resume checkpoint
                self.delTransferState(file_path)
                if status_callback:
                    status_callback()

        failed_uploads = dict()
        for key, value in self.file_status.items():
            if (value["State"] == UploadState.Failed) or (
                    value["State"] == UploadState.Timeout):
                failed_uploads[key] = value["Status"]

        if self.skipped_files:
            logging.warning(
                "The following file(s) were skipped because they did not satisfy the matching criteria "
                "of the configuration:\n\n%s\n" %
                '\n'.join(sorted(self.skipped_files)))

        if failed_uploads:
            logging.warning(
                "The following file(s) failed to upload due to errors:\n\n%s\n"
                % '\n'.join([
                    "%s -- %s" % (key, failed_uploads[key])
                    for key in sorted(failed_uploads.keys())
                ]))
            raise RuntimeError(
                "One or more file(s) failed to upload due to errors.")

    def uploadFile(self, file_path, asset_mapping, match_groupdict,
                   callback=None):
        """
        Primary API subclass function.

        Dispatches to a table (bulk entity) upload or a hatrac asset upload
        based on the mapping's "asset_type".
        :param file_path:
        :param asset_mapping:
        :param match_groupdict:
        :param callback:
        :return:
        """
        logging.info("Processing file: [%s]" % file_path)

        if asset_mapping.get("asset_type", "file") == "table":
            self._uploadTable(file_path, asset_mapping, match_groupdict)
        else:
            self._uploadAsset(file_path, asset_mapping, match_groupdict,
                              callback)

    def _uploadAsset(self, file_path, asset_mapping, match_groupdict,
                     callback=None):
        """Upload one file to hatrac and create/update its catalog record."""

        # 1. Populate metadata by querying the catalog
        self._queryFileMetadata(file_path, asset_mapping, match_groupdict)

        # 2. If "create_record_before_upload" specified in asset_mapping, check for an existing record, creating a new
        #    one if necessary. Otherwise delay this logic until after the file upload.
        record = None
        if stob(asset_mapping.get("create_record_before_upload", False)):
            record = self._getFileRecord(asset_mapping)

        # 3. Perform the Hatrac upload
        self._getFileHatracMetadata(asset_mapping)
        hatrac_options = asset_mapping.get("hatrac_options", {})
        versioned_uri = \
            self._hatracUpload(self.metadata["URI"],
                               file_path,
                               md5=self.metadata.get("md5_base64"),
                               sha256=self.metadata.get("sha256_base64"),
                               content_type=self.guessContentType(file_path),
                               content_disposition=self.metadata.get("content-disposition"),
                               chunked=True,
                               create_parents=stob(hatrac_options.get("create_parents", True)),
                               allow_versioning=stob(hatrac_options.get("allow_versioning", True)),
                               callback=callback)
        logging.debug("Hatrac upload successful. Result object URI: %s" %
                      versioned_uri)
        if stob(hatrac_options.get("versioned_uris", True)):
            self.metadata["URI"] = versioned_uri
        else:
            # FIX: was rsplit(":")[0], which splits on *every* colon and would
            # truncate a URI whose path itself contains ':'; only the trailing
            # ":version" suffix should be stripped.
            self.metadata["URI"] = versioned_uri.rsplit(":", 1)[0]
        self.metadata["URI_urlencoded"] = urlquote(self.metadata["URI"])

        # 3. Check for an existing record and create a new one if necessary
        if not record:
            record = self._getFileRecord(asset_mapping)

        # 4. Update an existing record, if necessary
        column_map = asset_mapping.get("column_map", {})
        updated_record = self.interpolateDict(self.metadata, column_map)
        if updated_record != record:
            logging.info("Updating catalog for file [%s]" %
                         self.getFileDisplayName(file_path))
            self._catalogRecordUpdate(self.metadata['target_table'], record,
                                      updated_record)

    def _uploadTable(self, file_path, asset_mapping, match_groupdict,
                     callback=None):
        """Bulk-insert the contents of a csv/json file as entities in the target table."""
        if self.cancelled:
            return None

        self._initFileMetadata(file_path, asset_mapping, match_groupdict)
        try:
            default_columns = asset_mapping.get("default_columns")
            if not default_columns:
                default_columns = self.catalog.getDefaultColumns(
                    {}, self.metadata['target_table'])
            default_param = (
                '?defaults=%s' %
                ','.join(default_columns)) if len(default_columns) > 0 else ''
            file_ext = self.metadata['file_ext']
            if file_ext == 'csv':
                headers = {'content-type': 'text/csv'}
            elif file_ext == 'json':
                headers = {'content-type': 'application/json'}
            else:
                raise CatalogCreateError(
                    "Unsupported file type for catalog bulk upload: %s" %
                    file_ext)
            with open(file_path, "rb") as fp:
                result = self.catalog.post(
                    '/entity/%s%s' %
                    (self.metadata['target_table'], default_param),
                    fp,
                    headers=headers)
                return result
        except Exception:
            # FIX: was a bare "except:"; only convert genuine errors
            (etype, value, traceback) = sys.exc_info()
            raise CatalogCreateError(format_exception(value))

    def _getFileRecord(self, asset_mapping):
        """
        Helper function that queries the catalog to get a record linked to the asset,
        or create it if it doesn't exist.

        :return: the file record
        :raises ConfigurationError: on record_query_template substitution failure
        """
        column_map = asset_mapping.get("column_map", {})
        rqt = asset_mapping['record_query_template']
        try:
            path = rqt.format(**self.metadata)
        except KeyError as e:
            raise ConfigurationError(
                "Record query template substitution error: %s" %
                format_exception(e))
        result = self.catalog.get(path).json()
        if result:
            self._updateFileMetadata(result[0])
            return self.pruneDict(result[0], column_map)
        else:
            row = self.interpolateDict(self.metadata, column_map)
            result = self._catalogRecordCreate(self.metadata['target_table'],
                                               row)
            if result:
                self._updateFileMetadata(result[0])
            return self.interpolateDict(self.metadata,
                                        column_map,
                                        allowNone=True)

    def _urlEncodeMetadata(self, safe_overrides=None):
        """Add a "<key>_urlencoded" companion for every metadata entry.

        :param safe_overrides: optional per-key "safe characters" map for urlquote
        """
        urlencoded = dict()
        if not safe_overrides:
            safe_overrides = dict()
        for k, v in self.metadata.items():
            if k.endswith("_urlencoded"):
                continue
            urlencoded[k + "_urlencoded"] = urlquote(str(v),
                                                     safe_overrides.get(k, ""))
        self._updateFileMetadata(urlencoded)

    def _initFileMetadata(self, file_path, asset_mapping, match_groupdict):
        """Reset self.metadata and seed it with pattern groups, target table, name, size."""
        self.metadata.clear()
        self._updateFileMetadata(match_groupdict)

        self.metadata['target_table'] = self.getCatalogTable(
            asset_mapping, match_groupdict)
        self.metadata["file_name"] = self.getFileDisplayName(file_path)
        self.metadata["file_size"] = self.getFileSize(file_path)

        self._urlEncodeMetadata(
            asset_mapping.get("url_encoding_safe_overrides"))

    def _updateFileMetadata(self, src, strict=False):
        """Merge src into self.metadata.

        :param strict: when True, drop (with a warning) any reserved key names
        :raises ValueError: if src is not a dict
        """
        if not (isinstance(src, dict)):
            # FIX: the ValueError was previously constructed but never raised,
            # silently accepting invalid input.
            raise ValueError(
                "Invalid input parameter type(s): (src = %s), expected (dict)"
                % type(src).__name__)
        if strict:
            # FIX: iterate a snapshot of the keys -- deleting from src while
            # iterating src.keys() raises RuntimeError on Python 3.
            for k in list(src.keys()):
                if k in UploadMetadataReservedKeyNames:
                    logging.warning(
                        "Context metadata update specified reserved key name [%s], "
                        "ignoring value: %s " % (k, src[k]))
                    del src[k]
        self.metadata.update(src)

    def _queryFileMetadata(self, file_path, asset_mapping, match_groupdict):
        """
        Helper function that queries the catalog to get required metadata for a given file/asset
        """
        file_name = self.getFileDisplayName(file_path)
        logging.info("Computing metadata for file: [%s]." % file_name)
        self._initFileMetadata(file_path, asset_mapping, match_groupdict)

        logging.info("Computing checksums for file: [%s]. Please wait..." %
                     file_name)
        hashes = self.getFileHashes(
            file_path, asset_mapping.get('checksum_types', ['md5', 'sha256']))
        for alg, checksum in hashes.items():
            alg = alg.lower()
            # checksum is (hex_digest, base64_digest)
            self.metadata[alg] = checksum[0]
            self.metadata[alg + "_base64"] = checksum[1]

        for uri in asset_mapping.get("metadata_query_templates", []):
            try:
                path = uri.format(**self.metadata)
            except KeyError as e:
                raise RuntimeError(
                    "Metadata query template substitution error: %s" %
                    format_exception(e))
            result = self.catalog.get(path).json()
            if result:
                self._updateFileMetadata(result[0], True)
                self._urlEncodeMetadata(
                    asset_mapping.get("url_encoding_safe_overrides"))
            else:
                raise RuntimeError(
                    "Metadata query did not return any results: %s" % path)

        self._getFileExtensionMetadata(self.metadata.get("file_ext"))

        for k, v in asset_mapping.get("column_value_templates", {}).items():
            try:
                self.metadata[k] = v.format(**self.metadata)
            except KeyError as e:
                logging.warning("Column value template substitution error: %s"
                                % format_exception(e))
                continue
        self._urlEncodeMetadata(
            asset_mapping.get("url_encoding_safe_overrides"))

    def _getFileExtensionMetadata(self, ext):
        """Merge any config-declared metadata for this file extension."""
        ext_map = self.config.get("file_ext_mappings", {})
        entry = ext_map.get(ext)
        if entry:
            self._updateFileMetadata(entry)

    def _getFileHatracMetadata(self, asset_mapping):
        """Resolve the hatrac URI (required) and content-disposition (optional) templates.

        :raises ConfigurationError: on missing keys or substitution failure
        """
        try:
            hatrac_templates = asset_mapping["hatrac_templates"]
            # URI is required
            self.metadata["URI"] = hatrac_templates["hatrac_uri"].format(
                **self.metadata)
            # overridden content-disposition is optional
            content_disposition = hatrac_templates.get("content-disposition")
            self.metadata["content-disposition"] = \
                None if not content_disposition else content_disposition.format(**self.metadata)
            self._urlEncodeMetadata(
                asset_mapping.get("url_encoding_safe_overrides"))
        except KeyError as e:
            raise ConfigurationError("Hatrac template substitution error: %s"
                                     % format_exception(e))

    def _hatracUpload(self,
                      uri,
                      file_path,
                      md5=None,
                      sha256=None,
                      content_type=None,
                      content_disposition=None,
                      chunked=True,
                      create_parents=True,
                      allow_versioning=True,
                      callback=None):
        """Upload (or resume uploading) a file to hatrac; returns the versioned URI."""

        # check if there is already an in-progress transfer for this file,
        # and if so, that the local file has not been modified since the original upload job was created
        can_resume = False
        transfer_state = self.getTransferState(file_path)
        if transfer_state:
            content_md5 = transfer_state.get("content-md5")
            content_sha256 = transfer_state.get("content-sha256")
            if content_md5 or content_sha256:
                if (md5 == content_md5) or (sha256 == content_sha256):
                    can_resume = True

        if transfer_state and can_resume:
            logging.info(
                "Resuming upload (%s) of file: [%s] to host %s. Please wait..."
                % (self.getTransferStateStatus(file_path), file_path,
                   transfer_state.get("host")))
            path = transfer_state["target"]
            job_id = transfer_state['url'].rsplit("/", 1)[1]
            if not (transfer_state["total"] == transfer_state["completed"]):
                self.store.put_obj_chunked(
                    path,
                    file_path,
                    job_id,
                    callback=callback,
                    start_chunk=transfer_state["completed"])
            return self.store.finalize_upload_job(path, job_id)
        else:
            logging.info("Uploading file: [%s] to host %s. Please wait..." %
                         (self.getFileDisplayName(file_path),
                          self.server_url))
            return self.store.put_loc(
                uri,
                file_path,
                md5=md5,
                sha256=sha256,
                content_type=content_type,
                content_disposition=content_disposition,
                chunked=chunked,
                create_parents=create_parents,
                allow_versioning=allow_versioning,
                callback=callback)

    def _catalogRecordCreate(self, catalog_table, row, default_columns=None):
        """
        Insert one row into the target catalog table.

        :param catalog_table:
        :param row:
        :param default_columns: columns to let the server default; auto-detected if None
        :return: the server's response rows, or None if cancelled
        :raises CatalogCreateError: on any failure
        """
        if self.cancelled:
            return None

        try:
            missing = self.catalog.validateRowColumns(row, catalog_table)
            if missing:
                raise CatalogCreateError(
                    "Unable to update catalog entry because one or more specified columns do not exist in the "
                    "target table: [%s]" % ','.join(missing))
            if not default_columns:
                default_columns = self.catalog.getDefaultColumns(
                    row, catalog_table)
            default_param = (
                '?defaults=%s' %
                ','.join(default_columns)) if len(default_columns) > 0 else ''
            # for default in default_columns:
            #     row[default] = None
            create_uri = '/entity/%s%s' % (catalog_table, default_param)
            logging.debug(
                "Attempting catalog record create [%s] with data: %s" %
                (create_uri, json.dumps(row)))
            return self.catalog.post(create_uri, json=[row]).json()
        except Exception:
            # FIX: was a bare "except:"
            (etype, value, traceback) = sys.exc_info()
            raise CatalogCreateError(format_exception(value))

    def _catalogRecordUpdate(self, catalog_table, old_row, new_row):
        """
        Update a row via the attributegroup API, mapping old column values to new ones.

        :param catalog_table:
        :param new_row:
        :param old_row:
        :return: the server's response rows, or None if cancelled
        :raises CatalogUpdateError: on any failure
        """
        if self.cancelled:
            return None

        try:
            keys = sorted(list(new_row.keys()))
            old_keys = sorted(list(old_row.keys()))
            if keys != old_keys:
                raise RuntimeError(
                    "Cannot update catalog - "
                    "new row column list and old row column list do not match: New: %s != Old: %s"
                    % (keys, old_keys))
            # alias columns as o0..oN (old values, used as the group key) and
            # n0..nN (new values) per the ERMrest attributegroup protocol
            combined_row = {
                'o%d' % i: old_row[keys[i]]
                for i in range(len(keys))
            }
            combined_row.update(
                {'n%d' % i: new_row[keys[i]]
                 for i in range(len(keys))})
            update_uri = '/attributegroup/%s/%s;%s' % (
                catalog_table, ','.join([
                    "o%d:=%s" % (i, urlquote(keys[i]))
                    for i in range(len(keys))
                ]), ','.join([
                    "n%d:=%s" % (i, urlquote(keys[i]))
                    for i in range(len(keys))
                ]))
            logging.debug(
                "Attempting catalog record update [%s] with data: %s" %
                (update_uri, json.dumps(combined_row)))
            return self.catalog.put(update_uri, json=[combined_row]).json()
        except Exception:
            # FIX: was a bare "except:"
            (etype, value, traceback) = sys.exc_info()
            raise CatalogUpdateError(format_exception(value))

    def defaultFileCallback(self, **kwargs):
        """Default per-chunk transfer callback: checkpoints progress and honors cancel.

        Returns -1 to signal the transfer layer to stop when cancelled, else True.
        """
        completed = kwargs.get("completed")
        total = kwargs.get("total")
        file_path = kwargs.get("file_path")
        file_name = os.path.basename(file_path) if file_path else ""
        job_info = kwargs.get("job_info", {})
        job_info.update()
        if completed and total:
            file_name = " [%s]" % file_name
            job_info.update({
                "completed": completed,
                "total": total,
                "host": kwargs.get("host")
            })
            # NOTE(review): "% 100" is a no-op here since completed/total <= 1
            status = "Uploading file%s: %d%% complete" % (
                file_name,
                round(((float(completed) / float(total)) % 100) * 100))
            self.setTransferState(file_path, job_info)
        else:
            summary = kwargs.get("summary", "")
            file_name = "Uploaded file: [%s] " % file_name
            status = file_name  # + summary
        if status:
            # logging.debug(status)
            pass
        if self.cancelled:
            return -1

        return True

    def loadTransferState(self):
        """Open (creating if needed) the per-host transfer checkpoint file.

        On any failure, checkpointing is disabled with a warning rather than
        aborting the uploader.
        """
        transfer_state_file_path = self.getDeployedTransferStateFilePath()
        transfer_state_dir = os.path.dirname(transfer_state_file_path)
        try:
            if not os.path.isdir(transfer_state_dir):
                try:
                    os.makedirs(transfer_state_dir)
                except OSError as error:
                    # tolerate a concurrent mkdir
                    if error.errno != errno.EEXIST:
                        raise

            if not os.path.isfile(transfer_state_file_path):
                with open(transfer_state_file_path, "w") as tsfp:
                    json.dump(self.transfer_state, tsfp)

            # keep the file handle open for in-place rewrites in writeTransferState()
            self.transfer_state_fp = \
                open(transfer_state_file_path, 'r+')
            self.transfer_state = json.load(self.transfer_state_fp,
                                            object_pairs_hook=OrderedDict)
        except Exception as e:
            logging.warning(
                "Unable to read transfer state file, transfer checkpointing will not be available. "
                "Error: %s" % format_exception(e))

    def getTransferState(self, file_path):
        """Return the checkpoint dict for file_path, or None."""
        return self.transfer_state.get(file_path)

    def setTransferState(self, file_path, transfer_state):
        """Record (and persist) checkpoint state for file_path."""
        self.transfer_state[file_path] = transfer_state
        self.writeTransferState()

    def delTransferState(self, file_path):
        """Remove (and persist) any checkpoint state for file_path."""
        transfer_state = self.getTransferState(file_path)
        if transfer_state:
            del self.transfer_state[file_path]
            self.writeTransferState()

    def writeTransferState(self):
        """Rewrite the open checkpoint file with the current transfer state."""
        if not self.transfer_state_fp:
            return
        try:
            self.transfer_state_fp.seek(0, 0)
            self.transfer_state_fp.truncate()
            json.dump(self.transfer_state, self.transfer_state_fp, indent=2)
            self.transfer_state_fp.flush()
        except Exception as e:
            logging.warning("Unable to write transfer state file: %s" %
                            format_exception(e))

    def cleanupTransferState(self):
        """Flush and close the checkpoint file handle, if open."""
        if self.transfer_state_fp and not self.transfer_state_fp.closed:
            try:
                self.transfer_state_fp.flush()
                self.transfer_state_fp.close()
            except Exception as e:
                logging.warning(
                    "Unable to flush/close transfer state file: %s" %
                    format_exception(e))

    def getTransferStateStatus(self, file_path):
        """Return a "NN% complete" string for a checkpointed file, or None."""
        transfer_state = self.getTransferState(file_path)
        if transfer_state:
            return "%d%% complete" % (round(
                ((float(transfer_state["completed"]) /
                  float(transfer_state["total"])) % 100) * 100))
        return None