def main():
    tokens = None
    client = NativeClient(client_id=CLIENT_ID, app_name=APP_NAME)
    try:
        # if we already have tokens, load and use them
        tokens = client.load_tokens(requested_scopes=SCOPES)
    except Exception:
        pass

    if not tokens:
        # if we need to get tokens, start the Native App authentication
        # process; we need to specify that we want refresh tokens
        tokens = client.login(requested_scopes=SCOPES, refresh_tokens=True)
        try:
            client.save_tokens(tokens)
        except Exception:
            pass

    transfer = setup_transfer_client(tokens['transfer.api.globus.org'])

    try:
        data = load_data_from_file(DATA_FILE)
        if len(data) > 0:
            task_data = data['task']
            task = transfer.get_task(task_data['task_id'])
            if task['status'] not in PREVIOUS_TASK_RUN_CASES:
                print('The last transfer status is {}, skipping run...'.format(
                    task['status']
                ))
                sys.exit(1)
    except KeyError:
        # ignore if there is no previous task
        pass

    check_endpoint_path(transfer, SOURCE_ENDPOINT, SOURCE_PATH)
    if CREATE_DESTINATION_FOLDER:
        create_destination_directory(transfer, DESTINATION_ENDPOINT,
                                     DESTINATION_PATH)
    else:
        check_endpoint_path(transfer, DESTINATION_ENDPOINT, DESTINATION_PATH)

    tdata = TransferData(
        transfer,
        SOURCE_ENDPOINT,
        DESTINATION_ENDPOINT,
        label=TRANSFER_LABEL,
        sync_level="checksum"
    )
    tdata.add_item(SOURCE_PATH, DESTINATION_PATH, recursive=True)

    task = transfer.submit_transfer(tdata)
    save_data_to_file(DATA_FILE, 'task', task.data)

    print('Transfer has been started from\n  {}:{}\nto\n  {}:{}'.format(
        SOURCE_ENDPOINT,
        SOURCE_PATH,
        DESTINATION_ENDPOINT,
        DESTINATION_PATH
    ))
    url_string = 'https://globus.org/app/transfer?' + \
        six.moves.urllib.parse.urlencode({
            'origin_id': SOURCE_ENDPOINT,
            'origin_path': SOURCE_PATH,
            'destination_id': DESTINATION_ENDPOINT,
            'destination_path': DESTINATION_PATH
        })
    print('Visit the link below to see the changes:\n{}'.format(url_string))
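# The snippet above relies on module-level constants defined elsewhere in
# the script. A minimal sketch of that configuration follows; every value
# here (IDs, paths, labels, the status list) is a hypothetical placeholder,
# not taken from the original source.
CLIENT_ID = '00000000-0000-0000-0000-000000000000'  # hypothetical native-app client ID
APP_NAME = 'My Sync Script'
SCOPES = 'urn:globus:auth:scope:transfer.api.globus.org:all'
SOURCE_ENDPOINT = '11111111-1111-1111-1111-111111111111'       # hypothetical
DESTINATION_ENDPOINT = '22222222-2222-2222-2222-222222222222'  # hypothetical
SOURCE_PATH = '/data/outbound/'
DESTINATION_PATH = '/data/inbound/'
TRANSFER_LABEL = 'Scheduled sync'
DATA_FILE = 'transfer-data.json'  # where the last task id is persisted
CREATE_DESTINATION_FOLDER = True
# statuses of the previous task that allow a new run to proceed (assumed)
PREVIOUS_TASK_RUN_CASES = ['SUCCEEDED', 'FAILED']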
def submit_transfer():
    """
    - Take the data returned by the Browse Endpoint helper page
      and make a Globus transfer request.
    - Send the user to the transfer status page with the task id
      from the transfer.
    """
    browse_endpoint_form = request.form
    dirselect = session['form']['dirselect']
    selected = session['form']['datasets']
    if dirselect:
        filtered_datasets = [ds for ds in datasets if ds['id'] in selected]
    else:
        path = session['form']['path']
        myid = session['form']['id']
        filtered_datasets = [
            {'name': name, 'path': path, 'id': myid}
            for name, path, myid in zip(selected, path, myid)
        ]

    transfer_tokens = session['tokens']['transfer.api.globus.org']

    authorizer = RefreshTokenAuthorizer(
        transfer_tokens['refresh_token'],
        load_portal_client(),
        access_token=transfer_tokens['access_token'],
        expires_at=transfer_tokens['expires_at_seconds'])

    transfer = TransferClient(authorizer=authorizer)

    source_endpoint_id = app.config['DATASET_ENDPOINT_ID']
    source_endpoint_base = app.config['DATASET_ENDPOINT_BASE']
    destination_endpoint_id = browse_endpoint_form['endpoint_id']
    destination_folder = browse_endpoint_form.get('folder[0]')

    transfer_data = TransferData(transfer_client=transfer,
                                 source_endpoint=source_endpoint_id,
                                 destination_endpoint=destination_endpoint_id,
                                 label=browse_endpoint_form.get('label'))

    for ds in filtered_datasets:
        print("printing ds")
        print(ds)
        if dirselect:
            source_path = source_endpoint_base + ds['path']
        else:
            source_path = source_endpoint_base + ds['path'] + "/" + ds['name']
        dest_path = browse_endpoint_form['path']
        if destination_folder:
            dest_path += destination_folder + '/'
        if dirselect:
            dest_path += ds['path'] + '/'
        else:
            dest_path += ds['path'] + '/' + ds['name']
        transfer_data.add_item(source_path=source_path,
                               destination_path=dest_path,
                               recursive=dirselect)

    transfer.endpoint_autoactivate(source_endpoint_id)
    transfer.endpoint_autoactivate(destination_endpoint_id)

    task_id = transfer.submit_transfer(transfer_data)['task_id']
    flash('Transfer request submitted successfully. Task ID: ' + task_id)
    return redirect(url_for('transfer_status', task_id=task_id))
def globus_transfer(  # noqa: C901
        remote_endpoint, remote_path, name, transfer_type,
        non_blocking=False):
    """
    Read the local globus endpoint UUID from ~/.zstash.ini.
    If the ini file does not exist, create an ini file with empty values,
    and try to find the local endpoint UUID based on the FQDN.
    """
    ini_path = os.path.expanduser("~/.zstash.ini")
    ini = configparser.ConfigParser()
    local_endpoint = None
    if ini.read(ini_path):
        if "local" in ini.sections():
            local_endpoint = ini["local"].get("globus_endpoint_uuid")
    else:
        ini["local"] = {"globus_endpoint_uuid": ""}
        try:
            with open(ini_path, "w") as f:
                ini.write(f)
        except Exception as e:
            logger.error(e)
            sys.exit(1)
    if not local_endpoint:
        fqdn = socket.getfqdn()
        for pattern in regex_endpoint_map.keys():
            if re.fullmatch(pattern, fqdn):
                local_endpoint = regex_endpoint_map.get(pattern)
                break
    if not local_endpoint:
        logger.error(
            "{} does not have the local Globus endpoint set".format(ini_path))
        sys.exit(1)

    if remote_endpoint.upper() in hpss_endpoint_map.keys():
        remote_endpoint = hpss_endpoint_map.get(remote_endpoint.upper())

    if transfer_type == "get":
        src_ep = remote_endpoint
        src_path = os.path.join(remote_path, name)
        dst_ep = local_endpoint
        dst_path = os.path.join(os.getcwd(), name)
    else:
        src_ep = local_endpoint
        src_path = os.path.join(os.getcwd(), name)
        dst_ep = remote_endpoint
        dst_path = os.path.join(remote_path, name)

    subdir = os.path.basename(os.path.normpath(remote_path))
    subdir_label = re.sub("[^A-Za-z0-9_ -]", "", subdir)
    filename = name.split(".")[0]
    label = subdir_label + " " + filename

    native_client = NativeClient(
        client_id="6c1629cf-446c-49e7-af95-323c6412397f",
        app_name="Zstash",
        default_scopes="openid urn:globus:auth:scope:transfer.api.globus.org:all",
    )
    native_client.login(no_local_server=True, refresh_tokens=True)
    transfer_authorizer = native_client.get_authorizers().get(
        "transfer.api.globus.org")
    tc = TransferClient(transfer_authorizer)

    for ep_id in [src_ep, dst_ep]:
        r = tc.endpoint_autoactivate(ep_id, if_expires_in=600)
        if r.get("code") == "AutoActivationFailed":
            logger.error(
                "The {} endpoint is not activated or the current activation "
                "expires soon. Please go to "
                "https://app.globus.org/file-manager/collections/{} "
                "and (re)activate the endpoint.".format(ep_id, ep_id))
            sys.exit(1)

    td = TransferData(
        tc,
        src_ep,
        dst_ep,
        label=label,
        sync_level="checksum",
        verify_checksum=True,
        preserve_timestamp=True,
        fail_on_quota_errors=True,
    )
    td.add_item(src_path, dst_path)
    try:
        task = tc.submit_transfer(td)
    except TransferAPIError as e:
        if e.code == "NoCredException":
            logger.error(
                "{}. Please go to https://app.globus.org/endpoints and "
                "activate the endpoint.".format(e.message))
        else:
            logger.error(e)
        sys.exit(1)
    except Exception as e:
        logger.error("Exception: {}".format(e))
        sys.exit(1)

    if non_blocking:
        return

    try:
        task_id = task.get("task_id")
        # A Globus transfer job (task) can be in one of three states:
        # ACTIVE, SUCCEEDED, FAILED. Every 20 seconds the script polls the
        # status of the transfer job (task) from the Globus Transfer
        # service, with a 20 second timeout limit. If the task is still
        # ACTIVE when the time runs out, task_wait returns False;
        # otherwise it returns True.
        while not tc.task_wait(task_id, 20, 20):
            pass
        # The transfer job (task) has finished (SUCCEEDED or FAILED);
        # check which.
        task = tc.get_task(task_id)
        if task["status"] == "SUCCEEDED":
            logger.info(
                "Globus transfer {}, from {}{} to {}{} succeeded".format(
                    task_id, src_ep, src_path, dst_ep, dst_path))
        else:
            logger.error("Transfer FAILED")
    except TransferAPIError as e:
        if e.code == "NoCredException":
            logger.error(
                "{}. Please go to https://app.globus.org/endpoints and "
                "activate the endpoint.".format(e.message))
        else:
            logger.error(e)
        sys.exit(1)
    except Exception as e:
        logger.error("Exception: {}".format(e))
        sys.exit(1)
def __init__(
    self,
    endpoint1,
    endpoint2,
    label,
    sync_level="checksum",
    verify_checksum=False,
    encrypt_data=False,
):
    """
    Parameters
    ----------
    endpoint1 : :py:class:models.Endpoint
        The endpoint to transfer from
    endpoint2 : :py:class:models.Endpoint
        The endpoint to transfer to
    sync_level : int or string [default: "checksum"]
        "exists", "size", "mtime", or "checksum"
        For compatibility, this can also be 0, 1, 2, or 3.
        The meanings are as follows:

        0, exists
            Determine whether or not to transfer based on file existence.
            If the destination file is absent, do the transfer.
        1, size
            Determine whether or not to transfer based on the size of the
            file. If the destination file size does not match the source,
            do the transfer.
        2, mtime
            Determine whether or not to transfer based on modification
            times. If the source has a newer modified time than the
            destination, do the transfer.
        3, checksum
            Determine whether or not to transfer based on checksums of
            file contents. If source and destination contents differ, as
            determined by a checksum of their contents, do the transfer.
    verify_checksum : bool [default: False]
        When true, after transfer verify that the source and destination
        file checksums match. If they don't, re-transfer the entire file
        and keep trying until it succeeds. This will create CPU load on
        both the origin and destination of the transfer, and may even be
        a bottleneck if the network speed is high enough.
    encrypt_data : bool [default: False]
        When true, all files will be TLS-protected during transfer.
    """
    if "Endpoint" not in str(endpoint1.__class__):
        raise AttributeError(
            "Positional argument `endpoint1` expected to be "
            "`:py:class:Endpoint`, received `:py:class:{0}` instead".format(
                type(endpoint1)))
    if "Endpoint" not in str(endpoint2.__class__):
        raise AttributeError(
            "Positional argument `endpoint2` expected to be "
            "`:py:class:Endpoint`, received `:py:class:{0}` instead".format(
                type(endpoint2)))
    self.endpoint1 = endpoint1
    self.endpoint2 = endpoint2
    self.endpoint1.transfer_client.get_submission_id()
    self.transfer_data = TransferData(
        self.endpoint1.transfer_client,
        self.endpoint1.endpoint_id,
        self.endpoint2.endpoint_id,
        label=label,
        sync_level=sync_level,
        encrypt_data=encrypt_data,
    )
    self.add_transfers = []
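# A hedged sketch (not part of the class above) of one way to normalize the
# two sync_level spellings the docstring documents, so callers can pass
# either 0-3 or the string names.
_SYNC_LEVELS = {0: "exists", 1: "size", 2: "mtime", 3: "checksum"}

def normalize_sync_level(level):
    """Map 0/1/2/3 to their string names; pass known strings through."""
    if isinstance(level, int):
        if level not in _SYNC_LEVELS:
            raise ValueError("unknown sync_level: {0!r}".format(level))
        return _SYNC_LEVELS[level]
    if level in _SYNC_LEVELS.values():
        return level
    raise ValueError("unknown sync_level: {0!r}".format(level))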
async def create_transfer_globus(transferObject: TransferBase,
                                 transfer_client: TransferClient,
                                 isFolder: bool = False):
    """This function verifies if globus authentication is present in session"""
    # transfer_client = await get_transfer_client(request)
    source = transferObject.source
    target = transferObject.target
    source_name = ''
    source_path = ''
    # example source:
    # globus://fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/eva/PRJEB6057/MOCH.population_sites.CHIR1_0.20140307_EVA_ss_IDs.fixed.vcf.gz
    if source:
        source_endpoint_id = source.split(':')[1].replace('/', '')
        source_path = source.split(':')[2]
        source_path_array = source_path.split('/')
        source_name = source_path_array[len(source_path_array) - 1]
    if target:
        target_endpoint_id = target.split(':')[1].replace('/', '')
        target_path = target.split(':')[2]
        if target_path.endswith('/'):
            if source_name:
                target_path = target_path + source_name
    transfer_response = None
    # source path ends with '/'
    if source_name == '':
        isFolder = True
    if transferObject.options:
        if 'recursive' in transferObject.options:
            if transferObject.options['recursive'] == "True":
                isFolder = True
    time = datetime.now().strftime("%d-%m-%Y %H-%M-%S")
    try:
        tdata = TransferData(transfer_client,
                             source_endpoint_id,
                             target_endpoint_id,
                             label='RDSDS ' + time,
                             sync_level="checksum")
        if isFolder:
            tdata.add_item(source_path, target_path, recursive=True)
        else:
            tdata.add_item(source_path, target_path)
        transfer_result = transfer_client.submit_transfer(tdata)
        transfer_result_json = json.loads(str(transfer_result))
        transfer_response = {'globus_response': transfer_result_json}
        transfer_response['status'] = 200
        rdsds_tracking_id = 'globus-' + transfer_result["task_id"]
        transfer_response['rdsds_tracking_id'] = rdsds_tracking_id
        transfer_response['globus_status_url'] = (
            'https://app.globus.org/activity/'
            + transfer_result["task_id"] + '/overview')
        return transfer_response
    except GlobusAPIError as e:
        # Error response from the REST service; check the code and message
        # for details.
        return handle_globus_api_error(e)
    except NetworkError:
        logging.error("Network Failure. "
                      "Possibly a firewall or connectivity issue")
        raise
    except GlobusError:
        logging.exception("Totally unexpected GlobusError!")
        raise
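# The split(':') indexing above is fragile if a path itself contains a
# colon. A small alternative sketch using urllib.parse for the same
# 'globus://<endpoint-uuid>:/<path>' shape shown in the example comment;
# the helper name is ours, not part of the original service.
from urllib.parse import urlparse

def parse_globus_uri(uri: str):
    """Return (endpoint_id, path) parsed from a globus:// URI."""
    parsed = urlparse(uri)
    if parsed.scheme != 'globus':
        raise ValueError('not a globus:// URI: {0!r}'.format(uri))
    # netloc is '<endpoint-uuid>:' because of the colon before the path
    return parsed.netloc.rstrip(':'), parsed.path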
def transfer_command(
    batch,
    sync_level,
    recursive,
    destination,
    source,
    checksum_algorithm,
    external_checksum,
    label,
    preserve_mtime,
    verify_checksum,
    encrypt,
    submission_id,
    dry_run,
    delete,
    deadline,
    skip_activation_check,
    notify,
    perf_cc,
    perf_p,
    perf_pp,
    perf_udt,
):
    """
    Executor for `globus transfer`
    """
    source_endpoint, cmd_source_path = source
    dest_endpoint, cmd_dest_path = destination

    if recursive and batch:
        raise click.UsageError(
            "You cannot use --recursive in addition to --batch. "
            "Instead, use --recursive on lines of --batch input "
            "which need it")
    if external_checksum and batch:
        raise click.UsageError(
            "You cannot use --external-checksum in addition to --batch. "
            "Instead, use --external-checksum on lines of --batch input "
            "which need it")
    if recursive and external_checksum:
        raise click.UsageError(
            "--recursive and --external-checksum are mutually exclusive")

    if (cmd_source_path is None or cmd_dest_path is None) and (not batch):
        raise click.UsageError(
            "transfer requires either SOURCE_PATH and DEST_PATH or --batch")

    # because python can't handle multiple **kwargs expansions in a single
    # call, we need to get a little bit clever.
    # both the performance options (of which there are a few) and the
    # notification options (also there are a few) have elements which
    # should be omitted in some cases.
    # notify comes to us clean, perf opts need more care.
    # put them together into a dict before passing to TransferData
    kwargs = {}
    perf_opts = dict(
        (k, v) for (k, v) in dict(
            perf_cc=perf_cc, perf_p=perf_p,
            perf_pp=perf_pp, perf_udt=perf_udt).items()
        if v is not None)
    kwargs.update(perf_opts)
    kwargs.update(notify)

    client = get_client()
    transfer_data = TransferData(
        client,
        source_endpoint,
        dest_endpoint,
        label=label,
        sync_level=sync_level,
        verify_checksum=verify_checksum,
        preserve_timestamp=preserve_mtime,
        encrypt_data=encrypt,
        submission_id=submission_id,
        delete_destination_extra=delete,
        deadline=deadline,
        skip_activation_check=skip_activation_check,
        **kwargs)

    if batch:

        @click.command()
        @click.option("--external-checksum")
        @click.option("--recursive", "-r", is_flag=True)
        @click.argument("source_path", type=TaskPath(base_dir=cmd_source_path))
        @click.argument("dest_path", type=TaskPath(base_dir=cmd_dest_path))
        def process_batch_line(dest_path, source_path, recursive,
                               external_checksum):
            """
            Parse a line of batch input and turn it into a transfer
            submission item.
            """
            if recursive and external_checksum:
                raise click.UsageError(
                    "--recursive and --external-checksum "
                    "are mutually exclusive")
            transfer_data.add_item(
                str(source_path),
                str(dest_path),
                external_checksum=external_checksum,
                checksum_algorithm=checksum_algorithm,
                recursive=recursive,
            )

        shlex_process_stdin(
            process_batch_line,
            ("Enter transfers, line by line, as\n\n"
             "    [--recursive] [--external-checksum TEXT] "
             "SOURCE_PATH DEST_PATH\n"),
        )
    else:
        transfer_data.add_item(
            cmd_source_path,
            cmd_dest_path,
            external_checksum=external_checksum,
            checksum_algorithm=checksum_algorithm,
            recursive=recursive,
        )

    if dry_run:
        formatted_print(
            transfer_data,
            response_key="DATA",
            fields=(
                ("Source Path", "source_path"),
                ("Dest Path", "destination_path"),
                ("Recursive", "recursive"),
                ("External Checksum", "external_checksum"),
            ),
        )
        # exit safely
        return

    # autoactivate after parsing all args and putting things together;
    # skip this if skip-activation-check is given
    if not skip_activation_check:
        autoactivate(client, source_endpoint, if_expires_in=60)
        autoactivate(client, dest_endpoint, if_expires_in=60)

    res = client.submit_transfer(transfer_data)
    formatted_print(
        res,
        text_format=FORMAT_TEXT_RECORD,
        fields=(("Message", "message"), ("Task ID", "task_id")),
    )
def transfer(self):
    # this isn't very scalable and there isn't much wisdom in wasting a
    # thread on a transfer that is directed by another machine, but we
    # waste an entire process or more on the gridftp server processes
    # anyway, so this may not quite be the bottleneck
    self._maybeStartServer()
    userEndpointId = self.server.getUserEndpointId(self.user)
    tc = self.clients.getUserTransferClient(self.user)

    tmpName = str(uuid.uuid4())
    transfer = TransferData(tc, self._getSourceEndpointId(), userEndpointId,
                            label=str(self.transferId))
    transfer['notify_on_succeeded'] = False
    transfer['notify_on_failed'] = False
    transfer['notify_on_inactive'] = False
    transfer.add_item(self._getSourcePath(), tmpName)
    res = tc.submit_transfer(transfer)
    if res['code'] != 'Accepted':
        raise Exception('Transfer submission failed: %s - %s' %
                        (res['code'], res['message']))
    taskId = res['task_id']
    self._updateTransfer(tmpName, taskId)
    while True:
        task = tc.get_task(taskId)
        status = task['status']
        if status == 'ACTIVE':
            # update bytes
            pass
        elif status == 'INACTIVE':
            # credential expiration
            # TODO: deal with this properly or ensure it does not happen
            msg = 'Credential expired for Globus task %s, transfer %s.' % (
                taskId, self.transferId)
            logger.warn(msg)
            raise Exception(msg)
        elif status == 'SUCCEEDED':
            dir = os.path.dirname(self.psPath)
            try:
                os.makedirs(dir)
            except OSError:
                if not os.path.exists(dir):
                    raise Exception(
                        'Could not create transfer destination directory: '
                        '%s' % dir)
            shutil.move(
                '%s/%s' % (self.server.getUserDir(self.user), tmpName),
                self.psPath)
            return
        elif status == 'FAILED':
            if task['fatal_error']:
                raise Exception('Globus transfer %s failed: %s' % (
                    self.transferId, task['fatal_error']['description']))
            else:
                raise Exception(
                    'Globus transfer %s failed for unknown reasons' %
                    self.transferId)
        else:
            raise Exception(
                'Unknown globus task status %s for transfer %s' %
                (status, self.transferId))
        time.sleep(10)
def trigger_stage_out(self, jobspec):
    # make logger
    tmpLog = self.make_logger(_logger,
                              'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(
            jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(
        tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(
            str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path;
    # will get them from AGIS eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
    self.srcEndpoint = queueConfig.stager['srcEndpoint']
    self.Globus_srcPath = self.basePath
    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
    self.dstEndpoint = queueConfig.stager['dstEndpoint']
    # test the endpoints and create the transfer data class
    errMsg = None
    try:
        # test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
            tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
            tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errMsg = ''
            if not tmpStatsrc:
                errMsg += ' source Endpoint not activated '
            if not tmpStatdst:
                errMsg += ' destination Endpoint not activated '
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
        # both endpoints activated, now prepare to transfer data
        tdata = TransferData(self.tc,
                             self.srcEndpoint,
                             self.dstEndpoint,
                             label=label,
                             sync_level="checksum")
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        tmpRetVal = (errStat, errMsg)
        return tmpRetVal
    # loop over all files
    fileAttrs = jobspec.get_output_file_attributes()
    lfns = []
    for fileSpec in jobspec.outFiles:
        scope = fileAttrs[fileSpec.lfn]['scope']
        hash = hashlib.md5()
        hash.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
            endPoint=self.Globus_dstPath,
            scope=correctedscope,
            hash1=hash_hex[0:2],
            hash2=hash_hex[2:4],
            lfn=fileSpec.lfn)
        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL,
                                                        dstURL=dstURL))
        # add files to transfer object - tdata
        if os.access(srcURL, os.R_OK):
            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
            tdata.add_item(srcURL, dstURL)
            lfns.append(fileSpec.lfn)
        else:
            errMsg = "source file {} does not exist".format(srcURL)
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
    # submit transfer
    try:
        transfer_result = self.tc.submit_transfer(tdata)
        # check status code and message
        tmpLog.debug(str(transfer_result))
        if transfer_result['code'] == "Accepted":
            # succeeded - set transfer ID which is used for later lookup
            transferID = transfer_result['task_id']
            tmpLog.debug('successfully submitted id={0}'.format(transferID))
            jobspec.set_groups_to_files(
                {transferID: {'lfns': lfns, 'groupStatus': 'active'}})
            # set
            for fileSpec in jobspec.outFiles:
                if fileSpec.fileAttributes is None:
                    fileSpec.fileAttributes = {}
                fileSpec.fileAttributes['transferID'] = transferID
        else:
            tmpRetVal = (False, transfer_result['message'])
    except Exception as e:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        if errMsg is None:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = "{0} {1}".format(errtype.__name__, errvalue)
        tmpRetVal = (errStat, errMsg)
    # return
    tmpLog.debug('done')
    return tmpRetVal
def run_agent():
    dataset_name = "hopv"
    local_ep = ""
    dest_ep = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
    dest_path = "/sample_data/" + dataset_name + "_train.csv"
    timeout = False
    timeout_intervals = 10
    interval_time = 10
    verbose = True

    search_client = globus_auth.login("https://search.api.globus.org/",
                                      "globus_search")
    transfer_client = transfer_auth.login()

    if not local_ep:
        pgr_res = transfer_client.endpoint_search(filter_scope="my-endpoints")
        ep_candidates = pgr_res.data
        if len(ep_candidates) < 1:  # nothing found
            raise GlobusError("Error: No local endpoints found")
        elif len(ep_candidates) == 1:  # exactly one candidate
            if ep_candidates[0]["gcp_connected"] == False:
                # is GCP, is not on
                raise GlobusError("Error: Globus Connect is not running")
            else:  # is GCServer, or GCP and connected
                local_ep = ep_candidates[0]["id"]
        else:  # >1 found
            # filter out disconnected GCP
            ep_connections = [
                candidate for candidate in ep_candidates
                if candidate["gcp_connected"] is not False
            ]
            # recheck list
            if len(ep_connections) < 1:  # nothing found
                raise GlobusError("Error: No local endpoints running")
            elif len(ep_connections) == 1:  # exactly one candidate
                if ep_connections[0]["gcp_connected"] == False:
                    # is GCP, is not on
                    raise GlobusError("Error: Globus Connect is not active")
                else:  # is GCServer, or GCP and connected
                    local_ep = ep_connections[0]["id"]
            else:  # >1 found
                # prompt user
                print("Multiple endpoints found:")
                count = 0
                for ep in ep_connections:
                    count += 1
                    print(count, ": ", ep["display_name"], "\t", ep["id"])
                print("\nPlease choose the endpoint on this machine")
                ep_num = 0
                while ep_num == 0:
                    usr_choice = input("Enter the number of the correct "
                                       "endpoint (-1 to cancel): ")
                    try:
                        ep_choice = int(usr_choice)
                        if ep_choice == -1:
                            # user wants to quit; will break out of while
                            # to exit program
                            ep_num = -1
                        elif ep_choice in range(1, count + 1):
                            # valid selection; break out of while,
                            # return valid ID
                            ep_num = ep_choice
                        else:  # invalid number
                            print("Invalid selection")
                    except ValueError:
                        print("Invalid input")
                if ep_num == -1:
                    print("Cancelling")
                    sys.exit()
                local_ep = ep_connections[ep_num - 1]["id"]

    # fetch and aggregate records into training set
    count = 0
    num_processed = 0
    data_list = []
    while True:
        query = {
            "q": ("mdf_source_name:" + dataset_name +
                  " AND mdf_node_type:record AND "
                  "globus_scroll_id:(>=" + str(count) + " AND <" +
                  str(count + 10000) + ")"),
            "advanced": True,
            "limit": 10000
        }
        raw_res = search_client.structured_search(query)
        search_res = gmeta_pop(raw_res, True)
        for res in search_res:
            data_dict = json.loads(res["data"]["hopv-experimental_data"])
            data_list.append(data_dict)
        num_ret = len(search_res)
        if num_ret:
            num_processed += num_ret
            count += 10000
        else:
            break
    if verbose:
        print("Processed:", len(data_list), "/", num_processed, "|",
              len(data_list) - num_processed)
    df = pd.DataFrame(data_list)
    df.to_csv(os.path.join(os.getcwd(), "temp_train.csv"))

    # upload to NCSA endpoint
    try:
        tdata = TransferData(transfer_client,
                             local_ep,
                             dest_ep,
                             verify_checksum=True,
                             notify_on_succeeded=False,
                             notify_on_failed=False,
                             notify_on_inactive=False)
        tdata.add_item(os.path.join(os.getcwd(), "temp_train.csv"), dest_path)
        res = transfer_client.submit_transfer(tdata)
        if res["code"] != "Accepted":
            raise GlobusError("Failed to transfer files: Transfer " +
                              res["code"])
        else:
            intervals = 0
            while not transfer_client.task_wait(
                    res["task_id"],
                    timeout=interval_time,
                    polling_interval=interval_time):
                for event in transfer_client.task_event_list(res["task_id"]):
                    if event["is_error"]:
                        transfer_client.cancel_task(res["task_id"])
                        raise GlobusError("Error: " + event["description"])
                if timeout and intervals >= timeout_intervals:
                    transfer_client.cancel_task(res["task_id"])
                    raise GlobusError("Transfer timed out.")
                intervals += 1
    except Exception:
        raise
    finally:
        os.remove(os.path.join(os.getcwd(), "temp_train.csv"))

    # update dataset entry
    query = {
        "q": ("mdf_source_name:" + dataset_name +
              " AND mdf_node_type:dataset"),
        "advanced": True
    }
    raw_res = search_client.structured_search(query)
    search_res = gmeta_pop(raw_res)
    if len(search_res) != 1:
        raise ValueError("Incorrect number of results: " +
                         str(len(search_res)))
    ingest = search_res[0]
    ingest["globus_subject"] = raw_res["gmeta"][0]["subject"]
    ingest["acl"] = ["public"]
    ingest["http://materialsdatafacility.org/#training_set"] = {
        "http://materialsdatafacility.org/#endpoint": dest_ep,
        "http://materialsdatafacility.org/#path": dest_path,
        "http://materialsdatafacility.org/#https":
            "https://data.materialsdatafacility.org" + dest_path
    }
    gmeta = format_gmeta([format_gmeta(ingest)])
    gmeta = json.loads(
        json.dumps(gmeta).replace(
            "mdf-publish.publication.community",
            "http://globus.org/publish-terms/#publication/community"))
    search_client.ingest(gmeta)

    # check ingest
    query = {
        "q": ("mdf_source_name:" + dataset_name +
              " AND mdf_node_type:dataset"),
        "advanced": True
    }
    raw_res = search_client.structured_search(query)
    search_res = gmeta_pop(raw_res, True)
    if verbose:
        print("Verification:\n",
              json.dumps(search_res[0]["training_set"],
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': ')))
    # (fragment of a test script; begins inside a try block. The line
    # checking the source endpoint is truncated above and is restored here
    # following the pattern used elsewhere in this codebase.)
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
            tmpLog, tc, srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
            tmpLog, tc, dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            sys.exit(2)
        # We are sending test files from our destination machine to the
        # source machine; both endpoints activated, now prepare to
        # transfer data
        tdata = TransferData(tc, dstEndpoint, srcEndpoint,
                             sync_level="checksum")
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        sys.exit(1)

    # create JobSpec
    jobSpec = JobSpec()
    jobSpec.jobParams = {
        'scopeLog': 'panda',
        'logFile': 'log',
    }
    jobSpec.computingSite = queueName
    jobSpec.PandaID = job_id
    jobSpec.modificationTime = datetime.datetime.now()
    realDataset = 'panda.sgotest.' + uuid.uuid4().hex
    ddmEndPointIn = 'BNL-OSG2_DATADISK'
def execute(self, event):
    """
    Start the transfer

    Parameters:
        event (thread.event): event to trigger job cancel
    """
    # reject if job isn't valid
    self.prevalidate()
    if self.status != JobStatus.VALID:
        logging.error('Transfer job in invalid state')
        logging.error(str(self))
        return
    if not check_logged_in():
        self.status = JobStatus.INVALID
        logging.error('Transfer failed, not logged into globus')
        return
    self.start_time = datetime.now()
    # get source and destination UUIDs
    srcendpoint = self.config.get('source_endpoint')
    dstendpoint = self.config.get('destination_endpoint')
    message = 'Starting setup for transfer job from {src} to {dst}'.format(
        src=srcendpoint, dst=dstendpoint)
    logging.info(message)
    # log into globus and activate endpoints
    endpoints = [srcendpoint, dstendpoint]
    setup_globus(endpoints=endpoints,
                 event_list=self.event_list,
                 no_ui=not self.config.get('ui', True),
                 src=self.config.get('source_email'),
                 dst=self.config.get('source_email'),
                 display_event=self.config.get('display_event'))

    client = get_client()
    # task_label = "{start} to {end}".format(
    #     start=self.file_list[0]['name'],
    #     end=self.file_list[-1]['name'])
    task_label = 'Autotransfer of {number} files at {time}'.format(
        number=len(self.file_list), time=time.strftime("%I-%M"))
    try:
        transfer_task = TransferData(client,
                                     srcendpoint,
                                     dstendpoint,
                                     sync_level='checksum',
                                     label=task_label)
    except Exception as e:
        logging.error('Error creating transfer task')
        logging.error(format_debug(e))
        self.status = JobStatus.FAILED
        return
    if not self.config['file_list']:
        logging.error('Unable to transfer files without a source list')
        self.status = JobStatus.FAILED
        return

    for datafile in self.config['file_list']:
        transfer_task.add_item(source_path=datafile['remote_path'],
                               destination_path=datafile['local_path'],
                               recursive=False)

    # start the transfer
    task_id = None
    result = None
    try:
        result = client.submit_transfer(transfer_task)
        task_id = result["task_id"]
        logging.info('starting transfer with task id %s', task_id)
    except Exception as e:
        if result:
            logging.error("result: %s", str(result))
        logging.error("Could not submit the transfer")
        logging.error(format_debug(e))
        self.status = JobStatus.FAILED
        return

    # check the status of the transfer periodically
    number_transfered = -1
    while True:
        try:
            while True:
                try:
                    status = client.get_task(task_id)
                except Exception:
                    time.sleep(1)
                else:
                    break
            if status['status'] == 'SUCCEEDED':
                logging.info('progress %d/%d',
                             status['files_transferred'], status['files'])
                percent_complete = 100.0
                self.display_status(
                    percent_complete=percent_complete,
                    task_id=task_id,
                    num_completed=int(status['files_transferred']) +
                    int(status['files_skipped']),
                    num_total=status['files'])
                message = 'Transfer job completed'
                self.status = JobStatus.COMPLETED
                return
            elif status['status'] == 'FAILED':
                logging.error('Error transfering files %s',
                              status.get('nice_status_details'))
                self.status = JobStatus.FAILED
                return
            elif status['status'] == 'ACTIVE':
                if number_transfered < status['files_transferred']:
                    number_transfered = status['files_transferred']
                    logging.info('progress %d/%d',
                                 status['files_transferred'],
                                 status['files'])
                percent_complete = (
                    float(status['files_transferred'] +
                          float(status['files_skipped'])) /
                    float(status['files'])) * 100
                self.display_status(
                    percent_complete=percent_complete,
                    task_id=task_id,
                    num_completed=int(status['files_transferred']) +
                    int(status['files_skipped']),
                    num_total=status['files'])
                self.status = JobStatus.RUNNING
            if event and event.is_set():
                client.cancel_task(task_id)
                # self.error_cleanup()
                return
        except Exception as e:
            logging.error(format_debug(e))
            client.cancel_task(task_id)
            # self.error_cleanup()
            return
        time.sleep(5)
def validate():
    params = request.json
    crawl_id = params["crawl_id"]
    globus_eid = params["globus_eid"]
    transfer_token = params["transfer_token"]
    source_destination = params["source_destination"]
    dataset_info = params["dataset_info"]  # to be implemented later

    client = boto3.client('sqs',
                          aws_access_key_id=os.environ["aws_access"],
                          aws_secret_access_key=os.environ["aws_secret"],
                          region_name='us-east-1')
    try:
        response = client.get_queue_url(
            QueueName=f'validate_{crawl_id}',
            QueueOwnerAWSAccountId=os.environ["aws_account_id"])
    except Exception:
        # TODO: catch SQS.Client.exceptions.QueueDoesNotExist instead
        abort(400, "Invalid crawl ID")

    try:
        authorizer = AccessTokenAuthorizer(transfer_token)
        tc = TransferClient(authorizer=authorizer)
    except Exception:
        # TODO: catch the specific auth exception
        abort(400, "Invalid transfer token")

    crawl_queue = response["QueueUrl"]

    date = datetime.datetime.now()
    file_name = date.strftime("%m_%d_%Y-%H_%M_%S") + ".txt"
    try:
        with open(file_name, "w") as f:
            while True:
                sqs_response = client.receive_message(
                    QueueUrl=crawl_queue,
                    MaxNumberOfMessages=1,  # to be toggled
                    WaitTimeSeconds=1)
                if "Messages" not in sqs_response:
                    # xtract_status = requests.get(
                    #     f"{eb_url}/get_extract_status",
                    #     json={"crawl_id": crawl_id})
                    # xtract_content = json.loads(xtract_status.content)
                    # if xtract_content["IDLE"] == 0 and
                    #         xtract_content["PENDING"] == 0:
                    break
                del_list = []
                for message in sqs_response["Messages"]:
                    message_body = message["Body"]
                    # process message_body
                    f.write(message_body)
                    del_list.append({
                        'ReceiptHandle': message["ReceiptHandle"],
                        'Id': message["MessageId"]
                    })
                if len(del_list) > 0:
                    client.delete_message_batch(QueueUrl=crawl_queue,
                                                Entries=del_list)

        tdata = TransferData(
            tc,
            "5ecf6444-affc-11e9-98d4-0a63aa6b37da",  # TODO: add source endpoint
            globus_eid,
            label=f"{crawl_id}")
        tdata.add_item(os.path.abspath(file_name),
                       os.path.join(source_destination, file_name))

        tc.endpoint_autoactivate(
            "5ecf6444-affc-11e9-98d4-0a63aa6b37da")  # TODO: add source endpoint
        tc.endpoint_autoactivate(globus_eid)

        submit_result = tc.submit_transfer(tdata)
        while True:
            result = tc.get_task(submit_result['task_id'])
            if result.data["status"] == "SUCCEEDED":
                break
            elif result.data["status"] == "FAILED":
                raise RuntimeError  # TODO: change this
            else:
                time.sleep(0.5)
    except Exception as e:
        print(e)
        abort(400, "Failed to validate")
    finally:
        os.remove(file_name)

    return "[200] Submitted"
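# The first TODO above asks for the specific SQS exception; boto3 exposes
# modeled error classes on the client object. A minimal sketch, assuming
# `client` and `crawl_id` exist as in the route above (the account-id kwarg
# is omitted here for brevity):
try:
    response = client.get_queue_url(QueueName=f'validate_{crawl_id}')
except client.exceptions.QueueDoesNotExist:
    # only queue-not-found maps to a bad crawl ID; other errors propagate
    abort(400, "Invalid crawl ID")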
# (fragment: the authorizer construction that this closing line belongs to
# is truncated above)
    on_refresh=update_tokens_file_on_refresh)

try:
    tc = TransferClient(authorizer=authorizer)
except Exception:
    print("ERROR: TransferClient() call failed! Unable to call the Globus "
          "transfer interface with the provided auth info!")
    sys.exit(-1)
# print(transfer)

# Now we should have auth; try setting up a transfer.
tdata = TransferData(tc,
                     source_endpoint_id,
                     destination_endpoint_id,
                     label="DCDE Relion transfer",
                     sync_level="size")
tdata.add_item(source_dir, dest_dir, recursive=True)
transfer_result = tc.submit_transfer(tdata)
print("task_id =", transfer_result["task_id"])

while not tc.task_wait(transfer_result['task_id'],
                       timeout=1200,
                       polling_interval=10):
    print(".", end="")
print("\n{} completed!".format(transfer_result['task_id']))

os.listdir(path=dest_dir)
def check_status(self, jobspec):
    # make logger
    tmpLog = core_utils.make_logger(_logger,
                                    'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='check_status')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(
            jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug(
        'jobspec.get_groups_of_output_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used, to avoid submitting
    # duplicated transfer requests
    if self.dummy_transfer_id in groups:
        # lock for 120 sec
        if not self.have_db_lock:
            tmpLog.debug(
                'attempt to set DB lock for self.id - {0} '
                'self.dummy_transfer_id - {1}'.format(
                    self.id, self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.get_object_lock(
                self.dummy_transfer_id, lock_interval=120)
        if not self.have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by
        # another thread before getting the lock
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        groups = jobspec.get_groups_of_output_files()
        # the dummy transfer ID is still there
        if self.dummy_transfer_id in groups:
            groupUpdateTime = groups[
                self.dummy_transfer_id]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(
                self.dummy_transfer_id)
            # submit transfer if there are more than 10 files or the group
            # was made more than 10 min ago
            msgStr = 'self.dummy_transfer_id = {0} number of files = {1}'.format(
                self.dummy_transfer_id, len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - \
                    datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID;
                # set the Globus destination Endpoint id and path
                # (will get them from AGIS eventually)
                from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                queueConfigMapper = QueueConfigMapper()
                queueConfig = queueConfigMapper.get_queue(
                    jobspec.computingSite)
                # self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                self.srcEndpoint = queueConfig.stager['srcEndpoint']
                self.Globus_srcPath = self.basePath
                self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                self.dstEndpoint = queueConfig.stager['dstEndpoint']
                # test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
                        tmpLog, self.tc, self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
                        tmpLog, self.tc, self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug(
                            'attempt to release DB lock for self.id - {0} '
                            'self.dummy_transfer_id - {1}'.format(
                                self.id, self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(
                            self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(
                                self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None, errMsg)
                        return tmpRetVal
                    # both endpoints activated, now prepare to transfer data
                    tdata = TransferData(self.tc,
                                         self.srcEndpoint,
                                         self.dstEndpoint,
                                         sync_level="checksum")
                except Exception:
                    errStat, errMsg = globus_utils.handle_globus_exception(
                        tmpLog)
                    # release process lock
                    tmpLog.debug(
                        'attempt to release DB lock for self.id - {0} '
                        'self.dummy_transfer_id - {1}'.format(
                            self.id, self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(
                        self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(
                            self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                for fileSpec in fileSpecs:
                    attrs = jobspec.get_output_file_attributes()
                    msgStr = "len(jobSpec.get_output_file_attributes()) = {0} type - {1}".format(
                        len(attrs), type(attrs))
                    tmpLog.debug(msgStr)
                    for key, value in attrs.iteritems():
                        msgStr = "output file attributes - {0} {1}".format(
                            key, value)
                        tmpLog.debug(msgStr)
                    msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(
                        fileSpec.lfn, fileSpec.scope)
                    tmpLog.debug(msgStr)
                    scope = fileSpec.scope
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    srcURL = fileSpec.path
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
                        endPoint=self.Globus_dstPath,
                        scope=correctedscope,
                        hash1=hash_hex[0:2],
                        hash2=hash_hex[2:4],
                        lfn=fileSpec.lfn)
                    tmpLog.debug('src={srcURL} dst={dstURL}'.format(
                        srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    if os.access(srcURL, os.R_OK):
                        tmpLog.debug("tdata.add_item({},{})".format(
                            srcURL, dstURL))
                        tdata.add_item(srcURL, dstURL)
                    else:
                        errMsg = "source file {} does not exist".format(
                            srcURL)
                        # release process lock
                        tmpLog.debug(
                            'attempt to release DB lock for self.id - {0} '
                            'self.dummy_transfer_id - {1}'.format(
                                self.id, self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(
                            self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(
                                self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                        tmpRetVal = (False, errMsg)
                        return tmpRetVal
                # submit transfer
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded - set transfer ID which is used for
                        # later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug(
                            'successfully submitted id={0}'.format(
                                transferID))
                        # set status for files
                        self.dbInterface.set_file_group(
                            fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(
                            transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug(
                            'attempt to release DB lock for self.id - {0} '
                            'self.dummy_transfer_id - {1}'.format(
                                self.id, self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(
                            self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(
                                self.dummy_transfer_id)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat, errMsg = globus_utils.handle_globus_exception(
                        tmpLog)
                    # release process lock
                    tmpLog.debug(
                        'attempt to release DB lock for self.id - {0} '
                        'self.dummy_transfer_id - {1}'.format(
                            self.id, self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(
                        self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(
                            self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug(
                'attempt to release DB lock for self.id - {0} '
                'self.dummy_transfer_id - {1}'.format(
                    self.id, self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.release_object_lock(
                self.dummy_transfer_id)
            if not self.have_db_lock:
                msgStr += ' - Could not release DB lock for {}'.format(
                    self.dummy_transfer_id)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    for transferID in groups:
        if transferID != self.dummy_transfer_id:
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(
                tmpLog, self.tc, transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task'
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(
                    transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by transferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug(
                    'transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec, 'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(
                transferID, transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
def trigger_preparation(self, jobspec):
    # get logger
    tmpLog = self.make_logger(_logger,
                              'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_preparation')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(
            jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(
        tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(
            str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path;
    # will get them from AGIS eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
    self.srcEndpoint = queueConfig.preparator['srcEndpoint']
    self.Globus_dstPath = self.basePath
    # self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
    self.dstEndpoint = queueConfig.preparator['dstEndpoint']
    # get input files
    files = []
    lfns = []
    inFiles = jobspec.get_input_file_attributes(skip_ready=True)
    for inLFN, inFile in iteritems(inFiles):
        # set path to each file
        inFile['path'] = mover_utils.construct_file_path(
            self.basePath, inFile['scope'], inLFN)
        dstpath = inFile['path']
        # check if path exists; if not, create it
        if not os.access(self.basePath, os.F_OK):
            os.makedirs(self.basePath)
        # create the file paths for the Globus source and destination
        # endpoints
        Globus_srcpath = mover_utils.construct_file_path(
            self.Globus_srcPath, inFile['scope'], inLFN)
        Globus_dstpath = mover_utils.construct_file_path(
            self.Globus_dstPath, inFile['scope'], inLFN)
        files.append({
            'scope': inFile['scope'],
            'name': inLFN,
            'Globus_dstPath': Globus_dstpath,
            'Globus_srcPath': Globus_srcpath
        })
        lfns.append(inLFN)
    tmpLog.debug('files[] {0}'.format(files))
    try:
        # test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
            tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
            tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            return False, errStr
        # both endpoints activated, now prepare to transfer data
        if len(files) > 0:
            tdata = TransferData(self.tc,
                                 self.srcEndpoint,
                                 self.dstEndpoint,
                                 label=label,
                                 sync_level="checksum")
            # loop over all input files and add them
            for myfile in files:
                tdata.add_item(myfile['Globus_srcPath'],
                               myfile['Globus_dstPath'])
            # submit
            transfer_result = self.tc.submit_transfer(tdata)
            # check status code and message
            tmpLog.debug(str(transfer_result))
            if transfer_result['code'] == "Accepted":
                # succeeded - set transfer ID which is used for later lookup
                transferID = transfer_result['task_id']
                jobspec.set_groups_to_files(
                    {transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                tmpLog.debug('done')
                return True, ''
            else:
                return False, transfer_result['message']
        # if no files to transfer, return True
        return True, 'No files to transfer'
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        return errStat, {}
import uuid

from globus_sdk import TransferClient, TransferData

# simple usage, ok
tc = TransferClient()
TransferData(tc, "srcep", "destep")

# can set sync level
TransferData(tc, "srcep", "destep", sync_level=1)
TransferData(tc, "srcep", "destep", sync_level="exists")
# unknown int values are allowed
TransferData(tc, "srcep", "destep", sync_level=100)
# unknown str values are rejected (Literal)
TransferData(tc, "srcep", "destep", sync_level="sizes")  # type: ignore[arg-type]

# TransferData.add_filter_rule
tdata = TransferData(tc, uuid.UUID(), uuid.UUID())
tdata.add_filter_rule("*.tgz")
tdata.add_filter_rule("*.tgz", method="exclude")
tdata.add_filter_rule("*.tgz", type="file")
# bad values rejected (Literal)
tdata.add_filter_rule("*.tgz", type="files")  # type: ignore[arg-type]
tdata.add_filter_rule("*.tgz", method="include")  # type: ignore[arg-type]
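# A short hedged sketch of what the filter-rule calls above mean at
# runtime: on a recursive transfer, an "exclude" rule skips files whose
# names match the pattern. The endpoint IDs below are placeholders.
tc_example = TransferClient()
tdata_example = TransferData(tc_example, "src-endpoint-id", "dest-endpoint-id",
                             label="logs without archives")
tdata_example.add_item("/data/logs/", "/backup/logs/", recursive=True)
tdata_example.add_filter_rule("*.tgz", method="exclude", type="file")  # skip tarballs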
def check_stage_in_status(self, jobspec): # make logger tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), method_name='check_stage_in_status') tmpLog.debug('start') # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found tmpLog.error('jobspec.computingSite is not defined') return False, 'jobspec.computingSite is not defined' else: tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) # show the dummy transfer id and set to a value with the jobspec.computingSite if needed. tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id)) if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') : old_dummy_transfer_id = self.dummy_transfer_id self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.computingSite) tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id)) # default return tmpRetVal = (True, '') # set flag if have db lock have_db_lock = False queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # test we have a Globus Transfer Client if not self.tc : errStr = 'failed to get Globus Transfer Client' tmpLog.error(errStr) return False, errStr # set transferID to None transferID = None # get transfer groups groups = jobspec.get_groups_of_input_files(skip_ready=True) tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups)) # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests for dummy_transferID in groups: # skip if valid transfer ID not dummy one if validate_transferid(dummy_transferID) : continue # lock for 120 sec tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120) tmpLog.debug(' DB lock result - {0}'.format(have_db_lock)) if not have_db_lock: # escape since locked by another thread msgStr = 'escape since locked by another thread' tmpLog.debug(msgStr) return None, msgStr # refresh group information since that could have been updated by another thread before getting the lock tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)') self.dbInterface.refresh_file_group_info(jobspec) tmpLog.debug('after self.dbInterface.refresh_file_group_info(jobspec)') # get transfer groups again with refreshed info tmpLog.debug('groups = jobspec.get_groups_of_input_files(skip_ready=True)') groups = jobspec.get_groups_of_input_files(skip_ready=True) tmpLog.debug('after db lock and refresh - jobspec.get_groups_of_input_files(skip_ready=True) = : {0}'.format(groups)) # the dummy transfer ID is still there if dummy_transferID in groups: groupUpdateTime = groups[dummy_transferID]['groupUpdateTime'] # get files with the dummy transfer ID across jobs fileSpecs_allgroups = self.dbInterface.get_files_with_group_id(dummy_transferID) msgStr = 'dummy_transferID = {0} self.dbInterface.get_files_with_group_id(dummy_transferID) number of files = {1}'.format(dummy_transferID,len(fileSpecs_allgroups)) tmpLog.debug(msgStr) fileSpecs = jobspec.get_input_file_specs(dummy_transferID, skip_ready=True) msgStr = 'dummy_transferID = {0} jobspec.get_input_file_specs(dummy_transferID,skip_ready=True) number of files = {1}'.format(dummy_transferID,len(fileSpecs)) tmpLog.debug(msgStr) # submit 
transfer if there are more than 10 files or the group was made before more than 10 min if len(fileSpecs) >= 10 or \ groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): tmpLog.debug('prepare to transfer files') # submit transfer and get a real transfer ID # set the Globus destination Endpoint id and path will get them from Agis eventually self.Globus_srcPath = queueConfig.preparator['Globus_srcPath'] self.srcEndpoint = queueConfig.preparator['srcEndpoint'] self.Globus_dstPath = self.basePath #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath'] self.dstEndpoint = queueConfig.preparator['dstEndpoint'] # Test the endpoints and create the transfer data class errMsg = None try: # Test endpoints for activation tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint) tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint) if tmpStatsrc and tmpStatdst: errStr = 'source Endpoint and destination Endpoint activated' tmpLog.debug(errStr) else: errMsg = '' if not tmpStatsrc : errMsg += ' source Endpoint not activated ' if not tmpStatdst : errMsg += ' destination Endpoint not activated ' # release process lock tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) have_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not have_db_lock: errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) tmpLog.error(errMsg) tmpRetVal = (None,errMsg) return tmpRetVal # both endpoints activated now prepare to transfer data tdata = None tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="exists") # sync_level="checksum") tmpLog.debug('size of tdata[DATA] - {}'.format(len(tdata['DATA']))) except: errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transferID) tmpLog.error(errMsg) tmpRetVal = (errStat, errMsg) return tmpRetVal # loop over all files ifile = 0 for fileSpec in fileSpecs: # only print to log file first 25 files if ifile < 25 : msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) if ifile == 25 : msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) # end debug log file test scope = 'panda' if fileSpec.scope is not None : scope = fileSpec.scope hash = hashlib.md5() if sys.version_info.major == 2: hash.update('%s:%s' % (scope, fileSpec.lfn)) if sys.version_info.major == 3: hash_string = "{0}:{1}".format(scope, fileSpec.lfn) hash.update(bytes(hash_string, 'utf-8')) hash_hex = hash.hexdigest() correctedscope = "/".join(scope.split('.')) #srcURL = fileSpec.path srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn) dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn) # add files to transfer object - tdata if ifile < 25 : 
tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL)) tdata.add_item(srcURL,dstURL) ifile += 1 # submit transfer tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA']))) try: transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) if transfer_result['code'] == "Accepted": # succeeded # set transfer ID which are used for later lookup transferID = transfer_result['task_id'] tmpLog.debug('successfully submitted id={0}'.format(transferID)) # set status for files self.dbInterface.set_file_group(fileSpecs, transferID, 'running') msgStr = 'submitted transfer with ID={0}'.format(transferID) tmpLog.debug(msgStr) else: # release process lock tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: tmpLog.debug('Released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) have_db_lock = False else: errMsg = 'Could not release DB lock for {}'.format(dummy_transferID) tmpLog.error(errMsg) tmpRetVal = (None, transfer_result['message']) return tmpRetVal except Exception as e: errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: tmpLog.debug('Released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) have_db_lock = False else : errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) tmpLog.error(errMsg) return errStat, errMsg else: msgStr = 'wait until enough files are pooled' tmpLog.debug(msgStr) # release the lock tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) have_db_lock = False else: msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) tmpLog.error(msgStr) # return None to retry later return None, msgStr # release the db lock if needed if have_db_lock: tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) have_db_lock = False else: msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) tmpLog.error(msgStr) return None, msgStr # check transfer with real transfer IDs # get transfer groups tmpLog.debug("groups = jobspec.get_groups_of_input_files(skip_ready=True)") groups = jobspec.get_groups_of_input_files(skip_ready=True) tmpLog.debug('Number of transfer groups (skip_ready)- {0}'.format(len(groups))) tmpLog.debug('transfer groups any state (skip_ready)- {0}'.format(groups)) tmpLog.debug("groups = jobspec.get_groups_of_input_files()") groups = jobspec.get_groups_of_input_files() tmpLog.debug('Number of transfer groups - {0}'.format(len(groups))) tmpLog.debug('transfer groups any state - {0}'.format(groups)) tmpLog.debug("groups = jobspec.get_groups_of_input_files(skip_ready=True)") groups = 
jobspec.get_groups_of_input_files(skip_ready=True) if len(groups) == 0: tmpLog.debug("jobspec.get_groups_of_input_files(skip_ready=True) returned no files ") tmpLog.debug("check_stage_in_status return status - True ") return True,'' for transferID in groups: # allow only valid UUID if validate_transferid(transferID) : # get transfer task tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID) # return a temporary error when failed to get task if not tmpStat: errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc),str(transferID)) tmpLog.error(errStr) return None, errStr # return a temporary error when task is missing if transferID not in transferTasks: errStr = 'transfer task ID - {} is missing'.format(transferID) tmpLog.error(errStr) return None, errStr # succeeded in finding a transfer task by tranferID if transferTasks[transferID]['status'] == 'SUCCEEDED': tmpLog.debug('transfer task {} succeeded'.format(transferID)) self.set_FileSpec_status(jobspec,'finished') return True, '' # failed if transferTasks[transferID]['status'] == 'FAILED': errStr = 'transfer task {} failed'.format(transferID) tmpLog.error(errStr) self.set_FileSpec_status(jobspec,'failed') return False, errStr # another status tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) tmpLog.debug(tmpStr) return None, tmpStr # end of loop over transfer groups tmpLog.debug('End of loop over transfers groups - ending check_stage_in_status function') return None,'no valid transfer id found'
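# ---------------------------------------------------------------------------
# Hedged sketch (not part of the harvester snippet above): the submit/poll
# pattern above, reduced to direct globus_sdk calls. The DB-lock and file
# bookkeeping helpers (dbInterface, dummy_transferID, set_FileSpec_status)
# are harvester internals and are omitted here; the tri-state return
# convention (True = done, False = failed, None = retry later) is kept.
# Function names and the RuntimeError policy are illustrative assumptions.

import globus_sdk


def submit_and_track(tc, tdata):
    """Submit a TransferData document; return its task_id or raise."""
    result = tc.submit_transfer(tdata)
    if result['code'] != 'Accepted':
        raise RuntimeError('submission rejected: {0}'.format(result['message']))
    return result['task_id']


def check_transfer(tc, transfer_id):
    """(True, msg) on success, (False, msg) on failure, (None, msg) to retry."""
    try:
        task = tc.get_task(transfer_id)
    except globus_sdk.GlobusAPIError as exc:
        # temporary error: let the caller retry on the next polling cycle
        return None, 'failed to get transfer task {0}: {1}'.format(transfer_id, exc)
    status = task['status']
    if status == 'SUCCEEDED':
        return True, ''
    if status == 'FAILED':
        return False, 'transfer task {0} failed'.format(transfer_id)
    # ACTIVE / INACTIVE: still in flight, poll again later
    return None, 'transfer task {0} status: {1}'.format(transfer_id, status)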
def transfer_command(
    batch,
    sync_level,
    recursive,
    destination,
    source,
    checksum_algorithm,
    external_checksum,
    label,
    preserve_mtime,
    verify_checksum,
    encrypt,
    submission_id,
    dry_run,
    delete,
    deadline,
    skip_activation_check,
    notify,
    perf_cc,
    perf_p,
    perf_pp,
    perf_udt,
):
    """
    Copy a file or directory from one endpoint to another as an asynchronous
    task.

    'globus transfer' has two modes. Single target, which transfers one file
    or one directory, and batch, which takes in several lines to transfer
    multiple files or directories. See "Batched Input" below for more
    information.

    'globus transfer' will always place the dest files in a consistent,
    deterministic location. The contents of a source directory will be placed
    inside the dest directory. A source file will be copied to the dest file
    path, which must not be an existing directory. All intermediate / parent
    directories on the dest will be automatically created if they don't exist.

    If the files or directories given as input are symbolic links, they are
    followed. However, no other symbolic links are followed and no symbolic
    links are ever created on the dest.

    \b
    === Batched Input

    If you use `SOURCE_PATH` and `DEST_PATH` without the `--batch` flag, you
    will submit a single-file or single-directory transfer task. This has
    behavior similar to `cp` and `cp -r` across endpoints.

    Using `--batch`, `globus transfer` can submit a task which transfers
    multiple files or directories. Paths to transfer are taken from stdin.
    Lines are split on spaces, respecting quotes, and every line is treated
    as a file or directory to transfer.

    \b
    Lines are of the form
    [--recursive] [--external-checksum TEXT] SOURCE_PATH DEST_PATH\n

    Skips empty lines and allows comments beginning with "#".

    \b
    If you use `--batch` and a commandline SOURCE_PATH and/or DEST_PATH, these
    paths will be used as dir prefixes to any paths on stdin.

    \b
    === Sync Levels

    Sync Levels are ways to decide whether or not files are copied, with the
    following definitions:

    EXISTS: Determine whether or not to transfer based on file existence. If
    the destination file is absent, do the transfer.

    SIZE: Determine whether or not to transfer based on the size of the file.
    If destination file size does not match the source, do the transfer.

    MTIME: Determine whether or not to transfer based on modification times.
    If source has a newer modified time than the destination, do the transfer.

    CHECKSUM: Determine whether or not to transfer based on checksums of file
    contents. If source and destination contents differ, as determined by a
    checksum of their contents, do the transfer.

    If a transfer fails, CHECKSUM must be used to restart the transfer. All
    other levels can lead to data corruption.

    {AUTOMATIC_ACTIVATION}
    """
    source_endpoint, cmd_source_path = source
    dest_endpoint, cmd_dest_path = destination

    if recursive and batch:
        raise click.UsageError(
            "You cannot use --recursive in addition to --batch. "
            "Instead, use --recursive on lines of --batch input "
            "which need it"
        )
    if external_checksum and batch:
        raise click.UsageError(
            "You cannot use --external-checksum in addition to --batch. "
            "Instead, use --external-checksum on lines of --batch input "
            "which need it"
        )
    if recursive and external_checksum:
        raise click.UsageError(
            "--recursive and --external-checksum are mutually exclusive"
        )

    if (cmd_source_path is None or cmd_dest_path is None) and (not batch):
        raise click.UsageError(
            "transfer requires either SOURCE_PATH and DEST_PATH or --batch"
        )

    # because python can't handle multiple **kwargs expansions in a single
    # call, we need to get a little bit clever
    # both the performance options (of which there are a few) and the
    # notification options (also there are a few) have elements which should
    # be omitted in some cases
    # notify comes to us clean, perf opts need more care
    # put them together into a dict before passing to TransferData
    kwargs = {}
    perf_opts = dict(
        (k, v)
        for (k, v) in dict(
            perf_cc=perf_cc, perf_p=perf_p, perf_pp=perf_pp, perf_udt=perf_udt
        ).items()
        if v is not None
    )
    kwargs.update(perf_opts)
    kwargs.update(notify)

    client = get_client()
    transfer_data = TransferData(
        client,
        source_endpoint,
        dest_endpoint,
        label=label,
        sync_level=sync_level,
        verify_checksum=verify_checksum,
        preserve_timestamp=preserve_mtime,
        encrypt_data=encrypt,
        submission_id=submission_id,
        delete_destination_extra=delete,
        deadline=deadline,
        skip_activation_check=skip_activation_check,
        **kwargs
    )

    if batch:

        @click.command()
        @click.option("--external-checksum")
        @click.option("--recursive", "-r", is_flag=True)
        @click.argument("source_path", type=TaskPath(base_dir=cmd_source_path))
        @click.argument("dest_path", type=TaskPath(base_dir=cmd_dest_path))
        def process_batch_line(dest_path, source_path, recursive, external_checksum):
            """
            Parse a line of batch input and turn it into a transfer submission
            item.
            """
            if recursive and external_checksum:
                raise click.UsageError(
                    "--recursive and --external-checksum are mutually exclusive"
                )
            transfer_data.add_item(
                str(source_path),
                str(dest_path),
                external_checksum=external_checksum,
                checksum_algorithm=checksum_algorithm,
                recursive=recursive,
            )

        shlex_process_stdin(
            process_batch_line,
            (
                "Enter transfers, line by line, as\n\n"
                "    [--recursive] [--external-checksum TEXT] SOURCE_PATH DEST_PATH\n"
            ),
        )
    else:
        transfer_data.add_item(
            cmd_source_path,
            cmd_dest_path,
            external_checksum=external_checksum,
            checksum_algorithm=checksum_algorithm,
            recursive=recursive,
        )

    if dry_run:
        formatted_print(
            transfer_data,
            response_key="DATA",
            fields=(
                ("Source Path", "source_path"),
                ("Dest Path", "destination_path"),
                ("Recursive", "recursive"),
                ("External Checksum", "external_checksum"),
            ),
        )
        # exit safely
        return

    # autoactivate after parsing all args and putting things together
    # skip this if skip-activation-check is given
    if not skip_activation_check:
        autoactivate(client, source_endpoint, if_expires_in=60)
        autoactivate(client, dest_endpoint, if_expires_in=60)

    res = client.submit_transfer(transfer_data)
    formatted_print(
        res,
        text_format=FORMAT_TEXT_RECORD,
        fields=(("Message", "message"), ("Task ID", "task_id")),
    )