def __getTransferClient(self):
    self.transferClient = globus_sdk.TransferClient(authorizer=self.authorizer)

auth_code = input('Please enter the code you get after login here: ').strip()
token_response = client.oauth2_exchange_code_for_tokens(auth_code)

# the useful values that you want at the end of this
globus_auth_data = token_response.by_resource_server['auth.globus.org']
globus_transfer_data = token_response.by_resource_server[
    'transfer.api.globus.org']
globus_auth_token = globus_auth_data['access_token']
globus_transfer_token = globus_transfer_data['access_token']

# a GlobusAuthorizer is an auxiliary object we use to wrap the token. In
# more advanced scenarios, other types of GlobusAuthorizers give us
# expressive power
authorizer = globus_sdk.AccessTokenAuthorizer(globus_transfer_token)
tc = globus_sdk.TransferClient(authorizer=authorizer)
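
# A hedged aside on the "expressive power" note above: had oauth2_start_flow()
# been called with refresh_tokens=True, the token response would also carry a
# refresh token, and a RefreshTokenAuthorizer could renew access tokens
# automatically instead of letting the client expire after a few hours.
# Sketch only; the guard keeps it inert for the plain access-token flow shown here.
if globus_transfer_data.get('refresh_token'):
    refresh_authorizer = globus_sdk.RefreshTokenAuthorizer(
        globus_transfer_data['refresh_token'],
        client,
        access_token=globus_transfer_data['access_token'],
        expires_at=globus_transfer_data['expires_at_seconds'])
    long_lived_tc = globus_sdk.TransferClient(authorizer=refresh_authorizer)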

terraref = tc.get_endpoint(parser.get("globus", "terraref_endpoint"))
workbench = tc.get_endpoint(parser.get("globus", "workbench_endpoint"))

print("Terraref Endpoint name:", terraref["display_name"]
      or terraref["canonical_name"])

for dir in working_dirs:
    for date in dates:
        dir_to_transfer = "/".join([root_dir, dir, date])
        print(dir_to_transfer)

        tdata = globus_sdk.TransferData(tc,
                                        source_endpoint=terraref["id"],
                                        destination_endpoint=workbench["id"],
Example #3
def login(credentials=None, clear_old_tokens=False, **kwargs):
    """Login to Globus services

    Arguments:
    credentials (str or dict): A string filename, string JSON, or dictionary with credential and config information.
                               By default, looks in ~/mdf/credentials/globus_login.json.
        Contains:
        app_name (str): Name of script/client. This will form the name of the token cache file.
        services (list of str): Services to authenticate with (can be transfer, search, search_ingest, or mdf).
        index: The default Search index. Only required if services contains 'search' or 'search_ingest'.
    clear_old_tokens (bool): If True, delete old token file if it exists, forcing user to re-login.
                             If False, use existing token file if there is one.
                             Default False.

    Returns:
    dict: The clients and authorizers requested, indexed by service name.
          For example, if login() is told to auth with 'search' then the search client will be in the 'search' field.
    """
    NATIVE_CLIENT_ID = "98bfc684-977f-4670-8669-71f8337688e4"
    DEFAULT_CRED_FILENAME = "globus_login.json"
    DEFAULT_CRED_PATH = os.path.expanduser("~/mdf/credentials")
    SCOPES = {
        "transfer": "urn:globus:auth:scope:transfer.api.globus.org:all",
        "search": "urn:globus:auth:scope:search.api.globus.org:search",
        "search_ingest": "urn:globus:auth:scope:search.api.globus.org:all",
        "mdf":
        "urn:globus:auth:scope:data.materialsdatafacility.org:all"  # urn:globus:auth:scope:api.materialsdatafacility.org:all"
    }

    def _get_tokens(client, scopes, app_name, force_refresh=False):
        token_path = os.path.join(DEFAULT_CRED_PATH, app_name + "_tokens.json")
        if force_refresh:
            if os.path.exists(token_path):
                os.remove(token_path)
        if os.path.exists(token_path):
            with open(token_path, "r") as tf:
                tokens = json.load(tf)
        else:
            os.makedirs(DEFAULT_CRED_PATH, exist_ok=True)
            client.oauth2_start_flow(requested_scopes=scopes,
                                     refresh_tokens=True)
            authorize_url = client.oauth2_get_authorize_url()

            print(
                "It looks like this is the first time you're accessing this client.\nPlease log in to Globus at this link:\n",
                authorize_url)
            auth_code = input(
                "Copy and paste the authorization code here: ").strip()
            print("Thanks!")

            token_response = client.oauth2_exchange_code_for_tokens(auth_code)
            tokens = token_response.by_resource_server

            os.umask(0o077)
            with open(token_path, "w") as tf:
                json.dump(tokens, tf)

        return tokens

    if type(credentials) is str:
        try:
            with open(credentials) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                creds = json.loads(credentials)
            except json.JSONDecodeError:
                raise ValueError("Credential string unreadable")
    elif type(credentials) is dict:
        creds = credentials
    else:
        try:
            with open(os.path.join(os.getcwd(),
                                   DEFAULT_CRED_FILENAME)) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                with open(
                        os.path.join(DEFAULT_CRED_PATH,
                                     DEFAULT_CRED_FILENAME)) as cred_file:
                    creds = json.load(cred_file)
            except IOError:
                raise ValueError(
                    "Credentials/configuration must be passed as a filename string, JSON string, or dictionary, or provided in '"
                    + DEFAULT_CRED_FILENAME + "' or '" + DEFAULT_CRED_PATH +
                    "'.")

    native_client = globus_sdk.NativeAppAuthClient(NATIVE_CLIENT_ID,
                                                   app_name=creds["app_name"])

    servs = []
    for serv in creds.get("services", []):
        if type(serv) is str:
            servs += serv.lower().strip().split(" ")
        else:
            servs += list(serv)
    scopes = " ".join([SCOPES[sc] for sc in servs])

    all_tokens = _get_tokens(native_client,
                             scopes,
                             creds["app_name"],
                             force_refresh=clear_old_tokens)

    clients = {}
    if "transfer" in servs:
        transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["transfer.api.globus.org"]["refresh_token"],
            native_client)
        clients["transfer"] = globus_sdk.TransferClient(
            authorizer=transfer_authorizer)
    if "search_ingest" in servs:
        ingest_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["search.api.globus.org"]["refresh_token"],
            native_client)
        clients["search_ingest"] = SearchClient(
            default_index=(creds.get("index", None)
                           or kwargs.get("index", None)),
            authorizer=ingest_authorizer)
    elif "search" in servs:
        search_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["search.api.globus.org"]["refresh_token"],
            native_client)
        clients["search"] = SearchClient(
            default_index=(creds.get("index", None)
                           or kwargs.get("index", None)),
            authorizer=search_authorizer)
    if "mdf" in servs:
        mdf_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["data.materialsdatafacility.org"]["refresh_token"],
            native_client)
        clients["mdf"] = mdf_authorizer

    return clients
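
# Hedged usage sketch for login() above. The app_name, services list, and index
# are placeholder values, not taken from a real configuration.
mdf_clients = login(credentials={
    "app_name": "example_globus_script",  # names the cached token file
    "services": ["transfer", "search"],   # which clients/authorizers to build
    "index": "mdf"                        # default Search index (placeholder)
})
transfer_client = mdf_clients["transfer"]  # globus_sdk.TransferClient
search_client = mdf_clients["search"]      # SearchClient bound to the default index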
Example #4
def login(globus_client_id):
    token = _login(globus_client_id, refresh_tokens=False)
    authorizer = globus.AccessTokenAuthorizer(token['transfer_token'])
    tc = globus.TransferClient(authorizer=authorizer)
    return tc
Example #5
    def _authTransferStart(self):
        """     _authTransferStart(self)
        DESCRIPTION:
            This function handles all of the administrative details of initializing self's authorization and transfer client
            objects. Globus requires a specific set of steps to be completed in order to authorize transfer requests. The user
            may be required to visit some URLs in order to retrieve some authorization tokens for the Globus Python SDK.
        ARGUMENTS:
            self
        EFFECTS:
            Creates two files, self._refreshPath and self._clientIDPath if needed in order to save the refresh token and
            the Globus client ID.
        RETURN:
            dict with two entries:
                'tc' (TransferClient)                    -- The TransferClient object returned by Globus. This is used to
                                                            submit transfer requests.
                'authorizer' (RefreshTokenAuthorizer)    -- The Authorizer object from Globus used to create the transfer
                                                            client.
        """
        if not os.path.isfile(self._clientIDPath):
            URL = "http://globus-sdk-python.readthedocs.io/en/latest/tutorial/"
            print(
                "Please go to this URL and follow steps 1 and 2 to obtain a Client ID: {}"
                .format(URL))
            get_input = getattr(__builtins__, 'raw_input', input)
            self._clientID = get_input("Please enter the Client ID: ").strip()

            # Save the Client ID in a file
            with open(self._clientIDPath, "w") as f:
                f.write(self._clientID)

        else:
            # Open the Client ID file and read it in
            with open(self._clientIDPath, "r") as f:
                self._clientID = f.readline().strip()

        client = globus_sdk.NativeAppAuthClient(self._clientID)

        if not self.refreshIsValid():
            # The refresh token either doesn't exist or it's not valid

            client.oauth2_start_flow(refresh_tokens=True)

            print('Please go to this URL and login: {0}'.format(
                client.oauth2_get_authorize_url()))

            get_input = getattr(__builtins__, 'raw_input', input)
            auth_code = get_input('Please enter the code here: ').strip()
            token_response = client.oauth2_exchange_code_for_tokens(auth_code)

            # Get the data from the transfer API
            globus_transfer_data = token_response.by_resource_server[
                'transfer.api.globus.org']

            # Get the refresh token
            transfer_rt = globus_transfer_data['refresh_token']

            # Save the refresh token
            with open(self._refreshPath, 'w') as f:
                f.write(transfer_rt)

            # Get the access token
            transfer_at = globus_transfer_data['access_token']
            # Get the expiration time
            expires_at_s = globus_transfer_data['expires_at_seconds']

            # Now we've got the data we need, but what do we do?
            # That "GlobusAuthorizer" from before is about to come to the rescue

            # Create a refresh authorizer
            authorizer = globus_sdk.RefreshTokenAuthorizer(
                transfer_rt,
                client,
                access_token=transfer_at,
                expires_at=expires_at_s)

            # and try using `tc` to make TransferClient calls. Everything should just
            # work -- for days and days, months and months, even years
            tc = globus_sdk.TransferClient(authorizer=authorizer)

            # Prompt the user if the endpoints need to be activated
            self._autoActivate(self._srcEndpoint, tc)
            self._autoActivate(self._destEndpoint, tc)

        else:
            refreshToken = self._getRefreshToken()
            # Build the authorizer directly from the saved refresh token
            authorizer = globus_sdk.RefreshTokenAuthorizer(
                refreshToken, client)
            tc = globus_sdk.TransferClient(authorizer=authorizer)

        return {'tc': tc, 'authorizer': authorizer}
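
    # The _autoActivate() helper called above is not shown in this excerpt. A
    # minimal, hypothetical sketch of such a helper, built on the SDK's
    # endpoint_autoactivate() call (not the original implementation):
    def _autoActivate(self, endpointID, tc):
        resp = tc.endpoint_autoactivate(endpointID, if_expires_in=3600)
        if resp["code"] == "AutoActivationFailed":
            # Auto-activation failed, so ask the user to activate it manually.
            print("Endpoint {0} could not be auto-activated. Please activate it at "
                  "https://app.globus.org/file-manager/collections/{0} and re-run."
                  .format(endpointID))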
Example #6
def confidential_login(credentials=None):
    """Login to Globus services as a confidential client (a client with its own login information).

    Arguments:
    credentials (str or dict): A string filename, string JSON, or dictionary with credential and config information.
                               By default, looks in ~/mdf/credentials/confidential_globus_login.json.
        Contains:
        client_id (str): The ID of the client.
        client_secret (str): The client's secret for authentication.
        services (list of str): Services to authenticate with (can be transfer, search, search_ingest, or mdf).
        index: The default Search index. Only required if services contains 'search' or 'search_ingest'.

    Returns:
    dict: The clients and authorizers requested, indexed by service name.
          For example, if login() is told to auth with 'search' then the search client will be in the 'search' field.
    """
    DEFAULT_CRED_FILENAME = "confidential_globus_login.json"
    DEFAULT_CRED_PATH = os.path.expanduser("~/mdf/credentials")
    SCOPES = {
        "transfer": "urn:globus:auth:scope:transfer.api.globus.org:all",
        "search": "urn:globus:auth:scope:search.api.globus.org:search",
        "search_ingest": "urn:globus:auth:scope:search.api.globus.org:all",
        "mdf":
        "urn:globus:auth:scope:data.materialsdatafacility.org:all"  # urn:globus:auth:scope:api.materialsdatafacility.org:all"
    }
    # Read credentials
    if type(credentials) is str:
        try:
            with open(credentials) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                creds = json.loads(credentials)
            except json.JSONDecodeError:
                raise ValueError("Credentials unreadable or missing")
    elif type(credentials) is dict:
        creds = credentials
    else:
        try:
            with open(os.path.join(os.getcwd(),
                                   DEFAULT_CRED_FILENAME)) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                with open(
                        os.path.join(DEFAULT_CRED_PATH,
                                     DEFAULT_CRED_FILENAME)) as cred_file:
                    creds = json.load(cred_file)
            except IOError:
                raise ValueError(
                    "Credentials/configuration must be passed as a filename string, JSON string, or dictionary, or provided in '"
                    + DEFAULT_CRED_FILENAME + "' or '" + DEFAULT_CRED_PATH +
                    "'.")

    conf_client = globus_sdk.ConfidentialAppAuthClient(creds["client_id"],
                                                       creds["client_secret"])
    servs = []
    for serv in creds["services"]:
        serv = serv.lower().strip()
        if type(serv) is str:
            servs += serv.split(" ")
        else:
            servs += list(serv)
    scopes = " ".join([SCOPES[sc] for sc in servs])

    conf_authorizer = globus_sdk.ClientCredentialsAuthorizer(
        conf_client, scopes)

    clients = {}
    if "transfer" in servs:
        clients["transfer"] = globus_sdk.TransferClient(
            authorizer=conf_authorizer)
    if "search_ingest" in servs:
        clients["search_ingest"] = SearchClient(default_index=creds.get(
            "index", None),
                                                authorizer=conf_authorizer)
    elif "search" in servs:
        clients["search"] = SearchClient(default_index=creds.get(
            "index", None),
                                         authorizer=conf_authorizer)
    if "mdf" in servs:
        clients["mdf"] = conf_authorizer

    return clients
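
# Hedged usage sketch for confidential_login() above; the client_id and
# client_secret are placeholders for a registered confidential app's real
# credentials. Unlike the native-app login(), no browser step is needed because
# ClientCredentialsAuthorizer authenticates with the client's own secret.
app_clients = confidential_login(credentials={
    "client_id": "<confidential-app-client-id>",
    "client_secret": "<confidential-app-client-secret>",
    "services": ["transfer"]
})
app_tc = app_clients["transfer"]  # TransferClient acting as the app's own identity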
Example #7
def direct(
        files,
        force=False,
        local_path_prefix=sdconfig.sandbox_folder,
        verify_checksum=False,
        network_bandwidth_test=False,
        debug=True,
        verbosity=0):
    """
    Returns:
        a list of files that cannot be transferred by Globus because
        they haven't been published with globus: or gsiftp: access.
        After all Globus transfer jobs are complete, Synda will download
        the files using the HTTP protocol.
    """

    globus_transfers = {}
    """
    globus_transfers = {
        <src_endpoint>: {
            "items": [
                {
                    "src_path": <src_path>,
                    "dst_path": <dst_path>
                }...
            ],
            "task_id": <task_id>
        }
    }
    """
    non_globus_files = []

    for file_ in files:
        if file_.get("attached_parameters").get("protocol") != sdconst.TRANSFER_PROTOCOL_GLOBUS:
            non_globus_files.append(file_)
            continue
        src_endpoint, src_path, path = map_to_globus(file_.get("url"))
        if src_endpoint is None:
            non_globus_files.append(file_)
            continue
        dst_path = os.path.join(dst_directory, file_.get("dataset_path"), file_.get("filename"))
        if src_endpoint not in globus_transfers:
            globus_transfers[src_endpoint] = {"task_id": None, "items": []}
        globus_transfers.get(src_endpoint).get("items").append({
                "src_path": src_path,
                "dst_path": dst_path
        })
        sdlog.info("SDDMGLOB-001", "src_endpoint: %s, src_path: %s, dst_path: %s" % (src_endpoint, src_path, dst_path))

    # create a TransferClient object
    authorizer = get_native_app_authorizer(client_id=client_id)
    tc = globus_sdk.TransferClient(authorizer=authorizer)

    for src_endpoint in globus_transfers:

        # activate the ESGF endpoint
        resp = tc.endpoint_autoactivate(src_endpoint, if_expires_in=36000)
        if resp["code"] == "AutoActivationFailed":
            requirements_data = fill_delegate_proxy_activation_requirements(
                    resp.data, sdconfig.esgf_x509_proxy)
            r = tc.endpoint_activate(src_endpoint, requirements_data)
            if r["code"] != "Activated.ClientProxyCredential":
                sdlog.error("SDGLOBUS-028", "Error: Cannot activate the source endpoint: (%s)" % src_endpoint)
                raise FatalException()

        # submit a transfer job
        td = globus_sdk.TransferData(tc, src_endpoint, dst_endpoint)

        for item in globus_transfers.get(src_endpoint).get("items"):
            td.add_item(item.get("src_path"), item.get("dst_path"))

        try:
            task = tc.submit_transfer(td)
            task_id = task.get("task_id")
            print("Submitted Globus transfer: {}".format(task_id))
            globus_transfers.get(src_endpoint)["task_id"] = task_id
        except Exception as e:
            raise Exception("Globus transfer from {} to {} failed due to error: {}".format(
                src_endpoint, dst_endpoint, e))

    # monitor the transfer jobs
    threads = []
    for src_endpoint in globus_transfers:
        task_id = globus_transfers.get(src_endpoint).get("task_id")
        thread = threading.Thread(target=globus_wait, args=(tc, task_id, src_endpoint,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    return non_globus_files
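
# globus_wait() is used by the monitoring threads above but is not shown in this
# excerpt. A plausible minimal version (a sketch, not the original helper) simply
# blocks until the Transfer task leaves the ACTIVE state:
def globus_wait(tc, task_id, src_endpoint):
    # task_wait() returns False on timeout and True once the task terminates,
    # so keep polling until the task is no longer ACTIVE.
    while not tc.task_wait(task_id, timeout=60, polling_interval=15):
        pass
    task = tc.get_task(task_id)
    print("Globus transfer {} from {} finished with status {}".format(
        task_id, src_endpoint, task["status"]))
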
def share_data(args):

    user_source_endpoint = args.source_endpoint or source_endpoint
    user_shared_endpoint = args.shared_endpoint or shared_endpoint
    if not user_shared_endpoint:
        eprint('Invalid shared endpoint')
        sys.exit(1)

    user_source_path = args.source_path or source_path
    user_destination_path = args.destination_path or destination_path
    if not user_source_path.startswith('/'):
        eprint('Source path must be absolute')
        sys.exit(1)
    if not user_destination_path.startswith('/'):
        eprint('Destination path must be absolute')
        sys.exit(1)

    if args.auth == 'native':
        # get an authorizer if it is a Native App
        authorizer = get_native_app_authorizer(client_id=CLIENT_ID)
    elif args.auth == 'client-credentials':
        secret = args.client_secret or CLIENT_SECRET
        if not secret:
            eprint('--auth client-credentials chosen, but no secret provided!'
                   ' Set "--client-secret <your secret>"')
            sys.exit(1)
        # get an authorizer if it is a Confidential App
        authorizer = get_confidential_app_authorizer(client_id=CLIENT_ID,
                                                     client_secret=secret)
    else:
        raise ValueError('Invalid Authenticator, this script only understands '
                         'Native and Client Credential')

    # look for an identity uuid for the specified identity username
    username_uuid = None
    if args.username:
        ac = globus_sdk.AuthClient(authorizer=authorizer)
        r = ac.get_identities(usernames=args.username)
        if not len(r['identities']):
            eprint('No such identity username \'{}\''.format(args.username))
            exit(1)
        username_uuid = r['identities'][0]['id']

    # create a TransferClient object
    tc = globus_sdk.TransferClient(authorizer=authorizer)

    # check if a destination directory exists at all
    try:
        tc.operation_ls(user_shared_endpoint, path=user_destination_path)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)

    dirname, leaf = os.path.split(user_source_path)
    if leaf == '':
        _, leaf = os.path.split(dirname)
    destination_directory = os.path.join(user_destination_path, leaf) + '/'
    """
    check if a directory with the same name was already transferred to the
    destination path if it was and --delete option is specified, delete the
    directory
    """
    try:
        tc.operation_ls(user_shared_endpoint, path=destination_directory)
        if not args.delete:
            eprint('Destination directory exists. Delete the directory or '
                   'use --delete option')
            sys.exit(1)
        print('Destination directory, {}, exists and will be deleted'.format(
            destination_directory))
        ddata = globus_sdk.DeleteData(tc,
                                      user_shared_endpoint,
                                      label='Share Data Example',
                                      recursive=True)
        ddata.add_item(destination_directory)
        print('Submitting a delete task')
        task = tc.submit_delete(ddata)
        print('\ttask_id: {}'.format(task['task_id']))
        tc.task_wait(task['task_id'])
    except TransferAPIError as e:
        if e.code != u'ClientError.NotFound':
            eprint(e)
            sys.exit(1)

    # create a destination directory
    try:
        print(
            'Creating destination directory {}'.format(destination_directory))
        tc.operation_mkdir(user_shared_endpoint, destination_directory)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)

    # grant group/user read access to the destination directory
    if args.user_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "identity",
            "principal": args.user_uuid,
            "path": destination_directory,
            "permissions": "r",
        }

        try:
            print(
                'Granting user, {}, read access to the destination directory'.
                format(args.user_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)

    if username_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "identity",
            "principal": username_uuid,
            "path": destination_directory,
            "permissions": "r",
        }

        try:
            print(
                'Granting user, {}, read access to the destination directory'.
                format(username_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)

    if args.group_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "group",
            "principal": args.group_uuid,
            "path": destination_directory,
            "permissions": "r",
        }

        try:
            print('Granting group, {}, read access to '.format(
                args.group_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)

    # transfer data - source directory recursively
    tdata = globus_sdk.TransferData(tc,
                                    user_source_endpoint,
                                    user_shared_endpoint,
                                    label='Share Data Example')
    tdata.add_item(user_source_path, destination_directory, recursive=True)
    try:
        print('Submitting a transfer task')
        task = tc.submit_transfer(tdata)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)
    print('\ttask_id: {}'.format(task['task_id']))
    print('You can monitor the transfer task programmatically using Globus SDK'
          ', or go to the Web UI, https://www.globus.org/app/activity/{}.'.
          format(task['task_id']))
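
# eprint() is used throughout share_data() but is not defined in this excerpt.
# It is presumably the usual print-to-stderr wrapper; a minimal version
# (assumes 'import sys', which share_data() already relies on for sys.exit):
def eprint(*args, **kwargs):
    # Route error messages to stderr so they do not mix with normal output.
    print(*args, file=sys.stderr, **kwargs)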
Example #9
def clean():

    # constants
    SDK_USER_ID = "84942ca8-17c4-4080-9036-2f58e0093869"
    GO_EP1_ID = "ddb59aef-6d04-11e5-ba46-22000b92c6ec"
    GO_EP2_ID = "ddb59af0-6d04-11e5-ba46-22000b92c6ec"
    # TODO: remove EP3 when EP1 and EP2 support symlinks
    GO_EP3_ID = "4be6107f-634d-11e7-a979-22000bf2d287"
    CLIENT_ID = 'd0f1d9b0-bd81-4108-be74-ea981664453a'
    SCOPES = 'urn:globus:auth:scope:transfer.api.globus.org:all'
    get_input = getattr(__builtins__, 'raw_input', input)

    # create an authorized transfer client
    client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)
    client.oauth2_start_flow(requested_scopes=SCOPES)
    url = client.oauth2_get_authorize_url()

    print("Login with SDK Tester: \n{0}".format(url))
    auth_code = get_input("Enter auth code: ").strip()

    # get tokens and make a transfer client
    tokens = client.oauth2_exchange_code_for_tokens(
        auth_code).by_resource_server
    globus_transfer_data = tokens['transfer.api.globus.org']
    transfer_rt = globus_transfer_data['refresh_token']
    transfer_at = globus_transfer_data['access_token']
    expires_at_s = globus_transfer_data['expires_at_seconds']

    authorizer = globus_sdk.RefreshTokenAuthorizer(transfer_rt,
                                                   client,
                                                   access_token=transfer_at,
                                                   expires_at=expires_at_s)
    tc = globus_sdk.TransferClient(authorizer=authorizer)

    # prevent accidental cleaning of a personal account
    auth_client = globus_sdk.AuthClient(authorizer=authorizer)
    res = auth_client.get('/p/whoami')
    if res['identities'][0]["id"] != SDK_USER_ID:  # assume the primary ID
        print("The primary ID was not the SDK Tester, stopping clean")
        return

    # now clean test assets

    # clean SDK Tester's home /~/ on go#ep1 go#ep2 and go#ep3
    ep_ids = [GO_EP1_ID, GO_EP2_ID, GO_EP3_ID]
    task_ids = []
    file_deletions = 0
    for ep_id in ep_ids:
        kwargs = {"notify_on_succeeded": False}  # prevent email spam
        ddata = globus_sdk.DeleteData(tc, ep_id, recursive=True, **kwargs)
        r = tc.operation_ls(ep_id)
        for item in r:
            ddata.add_item("/~/" + item["name"])
            print("deleting {0}: {1}".format(item["type"], item["name"]))
            file_deletions += 1
        if len(ddata["DATA"]):
            r = tc.submit_delete(ddata)
            task_ids.append(r["task_id"])

    # clean SDK Tester's bookmarks
    bookmark_deletions = 0
    r = tc.bookmark_list()
    for bookmark in r:
        tc.delete_bookmark(bookmark["id"])
        print("deleting bookmark: {0}".format(bookmark["name"]))
        bookmark_deletions += 1

    # clean endpoints owned by SDK Tester
    endpoint_deletions = 0
    cleaning = True
    while (cleaning):
        cleaning = False
        r = tc.endpoint_search(filter_scope="my-endpoints", num_results=None)
        for ep in r:
            tc.delete_endpoint(ep["id"])
            print("deleting endpoint: {0}".format(ep["display_name"]))
            endpoint_deletions += 1
            cleaning = True

    # wait for deletes to complete
    for task_id in task_ids:
        tc.task_wait(task_id, polling_interval=1)

    print("{0} files or folders cleaned".format(file_deletions))
    print("{0} endpoints cleaned".format(endpoint_deletions))
    print("{0} bookmarks cleaned".format(bookmark_deletions))
Example #10
def get_globus_tc(transfer_token):

    authorizer = globus_sdk.AccessTokenAuthorizer(transfer_token)
    tc = globus_sdk.TransferClient(authorizer=authorizer)
    return tc
Example #11
def client():
    return globus_sdk.TransferClient()
Example #12
def main(args):

    # Obtain Globus tokens
    cli = NativeClient(client_id=client_id, app_name="Data Stager")
    cli.login(no_local_server=True,
              requested_scopes=scopes,
              refresh_tokens=True,
              force=args.login)
    authorizers = cli.get_authorizers()
    if args.login:
        sys.exit(0)

    # Determine source and destination Globus endpoints and directories
    source_endpoint = args.source
    hostname = socket.gethostname()
    if not source_endpoint:
        source_endpoint = None
        for h, ep in hostname_endpoint.items():
            if hostname.startswith(h):
                source_endpoint = ep
                break
    if not source_endpoint:
        logger.error("The source Globus endpoint is required")
        sys.exit(1)

    try:
        destination_endpoint, destination_dir = args.destination.split(":", 1)
    except ValueError:
        logger.error("Globus destination endpoint and path are incorrect")
        sys.exit(1)
    for name, ep in name_endpoint.items():
        if destination_endpoint == name:
            destination_endpoint = ep
            break

    # Try to activate source and destination Globus endpoints
    tc = globus_sdk.TransferClient(
        authorizer=authorizers["transfer.api.globus.org"])
    resp = tc.endpoint_autoactivate(source_endpoint, if_expires_in=36000)
    if resp["code"] == "AutoActivationFailed":
        logger.error(
            "The source endpoint is not active. Please go to https://app.globus.org/file-manager/collections/{} to activate the endpoint."
            .format(source_endpoint))
        sys.exit(1)
    logger.info("The source Globus endpoint has been activated")

    resp = tc.endpoint_autoactivate(destination_endpoint, if_expires_in=36000)
    if resp["code"] == "AutoActivationFailed":
        logger.error(
            "The destination endpoint is not active. Please go to https://app.globus.org/file-manager/collections/{} to activate the endpoint."
            .format(destination_endpoint))
        sys.exit(1)
    logger.info("The destination Globus endpoint has been activated")

    # Load pattern file if provided
    global patterns
    if args.pattern_file:
        with open(args.pattern_file, "r") as f:
            patterns = json.load(f)

    components = []
    if args.component:
        components = args.component.split(",")

    # Data file patterns
    file_patterns = []
    for c in components:
        p = patterns.get(c)
        if isinstance(p, str):
            file_patterns.append(p)
        elif isinstance(p, list):
            file_patterns = file_patterns + p
    file_patterns = file_patterns + args.files
    if not file_patterns:
        file_patterns = ["*"]
    logger.debug("File patterns: {}".format(file_patterns))

    # Restart file patterns
    p = patterns.get("restart")
    if isinstance(p, str):
        restart_patterns = [p]
    elif isinstance(p, list):
        restart_patterns = p
    logger.debug("Restart file patterns: {}".format(restart_patterns))

    # Namelist file patterns
    p = patterns.get("namelist")
    if isinstance(p, str):
        namelist_patterns = [p]
    elif isinstance(p, list):
        namelist_patterns = p
    logger.debug("Namelist file patterns: {}".format(namelist_patterns))

    # Create temporary directory for all zstash files, etc.
    tmp_directory = tempfile.mkdtemp(prefix="stager-", dir=".")
    os.chdir(tmp_directory)

    # Download and open database
    logger.info('Opening index database')
    config.hpss = args.zstash
    hpss_get(config.hpss, DB_FILENAME)
    con = sqlite3.connect(DB_FILENAME, detect_types=sqlite3.PARSE_DECLTYPES)
    cur = con.cursor()

    # Retrieve some configuration settings from database
    for attr in dir(config):
        value = getattr(config, attr)
        if not callable(value) and not attr.startswith("__"):
            cur.execute(u"select value from config where arg=?", (attr, ))
            value = cur.fetchone()[0]
            setattr(config, attr, value)
    config.maxsize = int(config.maxsize)
    config.keep = bool(int(config.keep))

    # The command line arg should always have precedence
    config.keep = True
    if args.zstash is not None:
        config.hpss = args.zstash

    logger.info("Local path: {}".format(config.path))
    logger.info("HPSS path: {}".format(config.hpss))
    logger.info("Max size: {}".format(config.maxsize))

    # Find matching files
    file_matches = []
    for p in file_patterns:
        cur.execute(u"select * from files where name GLOB ? or tar GLOB ?",
                    (p, p))
        file_matches = file_matches + cur.fetchall()

    restart_matches = []
    for p in restart_patterns:
        cur.execute(
            u"select * from files where name GLOB ? or tar GLOB ? limit 1",
            (p, p))
        restart_matches = cur.fetchall()
        if restart_matches:
            break

    namelist_matches = []
    for p in namelist_patterns:
        cur.execute(
            u"select * from files where name GLOB ? or tar GLOB ? limit 1",
            (p, p))
        namelist_matches = cur.fetchall()
        if namelist_matches:
            break

    logger.debug("Matching files: {}".format(file_matches))
    logger.debug("Matching restart file: {}".format(restart_matches))
    logger.debug("Matching namelist file: {}".format(namelist_matches))

    matches = file_matches + restart_matches + namelist_matches

    # Sort by the filename, tape (so the tar archive), and order within tapes (offset).
    matches.sort(key=lambda x: (x[1], x[5], x[6]))
    """
    Based off the filenames, keep only the last instance of a file.
    This is because we may have different versions of the same file across many tars.
    """
    insert_idx, iter_idx = 0, 1
    for iter_idx in range(1, len(matches)):
        # If the filenames are unique, just increment insert_idx.
        # iter_idx will increment after this iteration.
        if matches[insert_idx][1] != matches[iter_idx][1]:
            insert_idx += 1
        # Always copy over the value at the correct location.
        matches[insert_idx] = matches[iter_idx]

    matches = matches[:insert_idx + 1]
    logger.info(
        "{} matching files including restart and namelist files".format(
            len(matches)))

    # Sort by tape and offset, so that we make sure that extract the files by tape order.
    matches.sort(key=lambda x: (x[5], x[6]))

    # Retrieve from tapes
    if args.workers > 1:
        logger.debug("Running zstash with multiprocessing")
        failures = multiprocess_extract(args.workers, matches, True)
    else:
        failures = extractFiles(matches, True)

    # Close database
    logger.debug('Closing index database')
    con.close()

    if failures:
        logger.error("Encountered an error for files:")
        for fail in failures:
            logger.error("{} in {}".format(fail[1], fail[5]))
        broken_tars = sorted(set([f[5] for f in failures]))
        logger.error("The following tar archives had errors:")
        for tar in broken_tars:
            logger.error(tar)
        sys.exit(1)

    # Create a manifest file
    manifest = []
    for m in matches:
        manifest.append({"filename": m[1], "length": m[2], "md5": m[4]})
    if args.m:
        manifest_name = args.m + "-"
    else:
        manifest_name = ""
    manifest_name += "manifest.json"
    with open(manifest_name, "w+") as f:
        f.write(json.dumps(manifest))

    # Transfer the files downloaded from the zstash archive
    if args.t:
        label = args.t
    else:
        label = "E3SM Data Stager on {}".format(hostname)
    td = globus_sdk.TransferData(tc,
                                 source_endpoint,
                                 destination_endpoint,
                                 label=label)

    cwd = os.getcwd()
    source_path = os.path.join(cwd, manifest_name)
    destination_path = os.path.join(destination_dir, manifest_name)
    td.add_item(source_path, destination_path)
    for m in matches:
        source_path = os.path.join(cwd, m[1])
        destination_path = os.path.join(destination_dir, m[1])
        td.add_item(source_path, destination_path)

    try:
        task = tc.submit_transfer(td)
        task_id = task.get("task_id")
        logger.info("Submitted Globus transfer: {}".format(task_id))
    except Exception as e:
        logger.error("Globus transfer failed due to error: {}".format(e))
        sys.exit(1)

    if not args.block:
        logger.info(
            "You can monitor the status of the transfer at https://app.globus.org/activity/{}"
            .format(task_id))
        sys.exit(0)
    """
    A Globus transfer job (task) can be in one of the three states: ACTIVE, SUCCEEDED, FAILED.
    The Data Stager polls a status of the transfer job (task) from the Globus Transfer service
    every 15 seconds with 60 second timeout limit. If the task is ACTIVE after time runs out,
    'tc.task_wait()' returns False, and True otherwise.
    """
    last_event_time = None
    while not tc.task_wait(task_id, 60, 15):
        task = tc.get_task(task_id)
        # Get the last error Globus event
        events = tc.task_event_list(task_id,
                                    num_results=1,
                                    filter="is_error:1")
        try:
            event = next(events)
        except StopIteration:
            continue
        # Log the error event if it was not yet logged
        if event["time"] != last_event_time:
            last_event_time = event["time"]
            logger.warn(
                "Non-critical Globus Transfer error event: {} at {}".format(
                    event["description"], event["time"]))
            logger.warn("Globus Transfer error details: {}".format(
                event["details"]))
    """
    The Globus transfer job (task) has been terminated (is not ACTIVE). Check if the transfer
    SUCCEEDED or FAILED.
    """
    task = tc.get_task(task_id)
    if task["status"] == "SUCCEEDED":
        logger.info("Globus transfer {} succeeded".format(task_id))
    else:
        logger.error("Globus Transfer task: {}".format(task_id))
        events = tc.task_event_list(task_id,
                                    num_results=1,
                                    filter="is_error:1")
        event = next(events)
        logger.error("Globus transfer {} failed due to error: {}".format(
            task_id, event["details"]))
        sys.exit(1)

    if args.e:
        logger.info("Deleting downloaded zstash archives and extracted files")
        os.chdir("..")
        shutil.rmtree(tmp_directory)
Example #13
    native_auth_client.oauth2_start_flow()

    print("Login Here:\n\n{0}".format(
        native_auth_client.oauth2_get_authorize_url()))

    # Authorization code
    auth_code = input("Input auth code: ").strip()

    # Create transfer client
    token_response = native_auth_client.oauth2_exchange_code_for_tokens(
        auth_code)
    transfer_access_token = token_response.by_resource_server[
        'transfer.api.globus.org']['access_token']
    transfer_authorizer = globus_sdk.AccessTokenAuthorizer(
        transfer_access_token)
    transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer)

    deep_blue_crawl_df = pd.read_csv(args.crawl_csv)
    file_uuid_mapping = dict()
    for index, row in deep_blue_crawl_df.iterrows():
        file_uuid_mapping[row[0]] = row[4]

    # Filter files
    filtered_files = deep_blue_crawl_df[
        deep_blue_crawl_df.file_uuid.str.endswith(
            args.compression_extension)].sort_values(by=["size_bytes"])

    max_size_threshold = args.max_transfer_size  # Just to make sure we don't blow up the Jetstream instance
    transferred_files = []
    batch_n = 1
def submission_driver(metadata, sub_conf, source_id, access_token, user_id):
    """The driver function for MOC.
    Modifies the status database as steps are completed.

    Arguments:
    metadata (dict): The JSON passed to /submit.
    sub_conf (dict): Submission configuration information.
    source_id (str): The source name of this submission.
    access_token (str): The Globus Auth access token for the submitting user.
    user_id (str): The Globus ID of the submitting user.
    """
    # Setup
    utils.update_status(source_id, "sub_start", "P", except_on_fail=True)
    utils.modify_status_entry(source_id, {
        "pid": os.getpid(),
        "hibernating": False
    },
                              except_on_fail=True)
    try:
        # Connect auth
        # CAAC required for user auth later
        mdf_conf_client = globus_sdk.ConfidentialAppAuthClient(
            CONFIG["API_CLIENT_ID"], CONFIG["API_CLIENT_SECRET"])
        mdf_creds = mdf_toolbox.dict_merge(CONFIG["GLOBUS_CREDS"],
                                           {"services": ["transfer"]})
        mdf_clients = mdf_toolbox.confidential_login(**mdf_creds)
        mdf_transfer_client = mdf_clients["transfer"]

        # User auth
        # When coming from curation, the access token (from the curator) is not used
        access_token = access_token.replace("Bearer ", "")
        dependent_grant = mdf_conf_client.oauth2_get_dependent_tokens(
            access_token)
        # Get specifically Transfer's access token
        for grant in dependent_grant.data:
            if grant["resource_server"] == "transfer.api.globus.org":
                user_transfer_token = grant["access_token"]
        user_transfer_authorizer = globus_sdk.AccessTokenAuthorizer(
            user_transfer_token)
        user_transfer_client = globus_sdk.TransferClient(
            authorizer=user_transfer_authorizer)
    except Exception as e:
        utils.update_status(source_id,
                            "sub_start",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cancel the previous version(s)
    source_info = utils.split_source_id(source_id)
    scan_res = utils.scan_table(table_name="status",
                                fields=["source_id", "active"],
                                filters=[("source_id", "^",
                                          source_info["source_name"]),
                                         ("source_id", "<", source_id)])
    if not scan_res["success"]:
        utils.update_status(source_id,
                            "sub_start",
                            "F",
                            text=scan_res["error"],
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    old_source_ids = [
        oldsub["source_id"] for oldsub in scan_res["results"]
        if oldsub["active"]
    ]
    if old_source_ids:
        utils.update_status(
            source_id,
            "sub_start",
            "M",
            text=("The following submissions will be cancelled: {}".format(
                old_source_ids)),
            except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "P", except_on_fail=True)

        for old_source_id in old_source_ids:
            cancel_res = utils.cancel_submission(old_source_id, wait=True)
            if not cancel_res["stopped"]:
                utils.update_status(
                    source_id,
                    "sub_start",
                    "F",
                    text=cancel_res.get(
                        "error", ("Unable to cancel previous "
                                  "submission '{}'").format(old_source_id)),
                    except_on_fail=True)
                utils.complete_submission(source_id)
                return
            if cancel_res["success"]:
                logger.info("{}: Cancelled source_id {}".format(
                    source_id, old_source_id))
            else:
                logger.debug("{}: Stopped source_id {}".format(
                    source_id, old_source_id))
        utils.update_status(source_id, "old_cancel", "S", except_on_fail=True)
    else:
        utils.update_status(source_id, "sub_start", "S", except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "N", except_on_fail=True)

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status",
                                                 {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    local_path = os.path.join(CONFIG["LOCAL_PATH"], source_id) + "/"
    feedstock_file = os.path.join(CONFIG["FEEDSTOCK_PATH"],
                                  source_id + ".json")
    curation_state_file = os.path.join(CONFIG["CURATION_DATA"],
                                       source_id + ".json")
    service_data = os.path.join(CONFIG["SERVICE_DATA"], source_id) + "/"
    os.makedirs(service_data, exist_ok=True)
    num_files = 0
    # Curation skip point
    if type(sub_conf["curation"]) is not str:
        # If we're extracting, download data locally, then set canon source to local
        # This allows non-Globus sources (because the data is downloaded to Connect's EP first)
        if not sub_conf["no_extract"]:
            utils.update_status(source_id,
                                "data_download",
                                "P",
                                except_on_fail=True)
            try:
                # Download from user
                for dl_res in utils.download_data(
                        user_transfer_client,
                        sub_conf["data_sources"],
                        CONFIG["LOCAL_EP"],
                        local_path,
                        admin_client=mdf_transfer_client,
                        user_id=user_id):
                    if not dl_res["success"]:
                        msg = "During data download: " + dl_res["error"]
                        utils.update_status(source_id,
                                            "data_download",
                                            "T",
                                            text=msg,
                                            except_on_fail=True)
                if not dl_res["success"]:
                    raise ValueError(dl_res["error"])
                num_files = dl_res["total_files"]

            except Exception as e:
                utils.update_status(source_id,
                                    "data_download",
                                    "F",
                                    text=repr(e),
                                    except_on_fail=True)
                utils.complete_submission(source_id)
                return

            utils.update_status(
                source_id,
                "data_download",
                "M",
                text=(
                    "{} files will be grouped and extracted (from {} archives)"
                    .format(num_files, dl_res["num_extracted"])),
                except_on_fail=True)
            canon_data_sources = [
                "globus://{}{}".format(CONFIG["LOCAL_EP"], local_path)
            ]

        # If we're not extracting, set canon source to only source
        # Also create local dir with no data to "extract" for dataset entry
        else:
            utils.update_status(source_id,
                                "data_download",
                                "N",
                                except_on_fail=True)
            os.makedirs(local_path)
            canon_data_sources = sub_conf["data_sources"]

        # Move data from canon source(s) to canon dest (if different)
        utils.update_status(source_id,
                            "data_transfer",
                            "P",
                            except_on_fail=True)
        # If not extracting, set up user TC for backup use
        if sub_conf["no_extract"]:
            backup_user_id = user_id
            backup_user_client = user_transfer_client
        else:
            backup_user_id = None
            backup_user_client = None
        for data_source in canon_data_sources:
            if data_source != sub_conf["canon_destination"]:
                logger.debug("Data transfer: '{}' to '{}'".format(
                    data_source, sub_conf["canon_destination"]))
                try:
                    for backup_res in utils.backup_data(
                            mdf_transfer_client,
                            data_source,
                            sub_conf["canon_destination"],
                            acl=sub_conf["storage_acl"],
                            data_client=backup_user_client,
                            data_user=backup_user_id):
                        if not backup_res["success"]:
                            msg = ("During data download: {}".format(
                                backup_res.get("error", "Unknown error")))
                            utils.update_status(source_id,
                                                "data_transfer",
                                                "T",
                                                text=msg,
                                                except_on_fail=True)
                    if not backup_res["success"]:
                        raise ValueError(backup_res.get("error"))
                    elif not backup_res[
                            sub_conf["canon_destination"]]["success"]:
                        raise ValueError(
                            backup_res[sub_conf["canon_destination"]]["error"])
                except Exception as e:
                    err_text = (
                        "Transfer from '{}' to primary/canon destination '{}' failed: {}"
                        .format(data_source, sub_conf["canon_destination"],
                                str(e)))
                    utils.update_status(source_id,
                                        "data_transfer",
                                        "F",
                                        text=err_text,
                                        except_on_fail=True)
                    return
        utils.update_status(source_id,
                            "data_transfer",
                            "S",
                            except_on_fail=True)

        # Add file info data
        sub_conf["index"]["file"] = {
            "globus_host": sub_conf["canon_destination"],
            "http_host": utils.lookup_http_host(sub_conf["canon_destination"]),
            "local_path": local_path,
        }
        extract_params = {
            "dataset":
            metadata,
            "extractors":
            sub_conf["index"],
            "service_data":
            service_data,
            "feedstock_file":
            feedstock_file,
            "group_config":
            mdf_toolbox.dict_merge(sub_conf["extraction_config"],
                                   CONFIG["GROUPING_RULES"]),
            "validation_info": {
                "project_blocks": sub_conf.get("project_blocks", []),
                "required_fields": sub_conf.get("required_fields", []),
                "allowed_nulls": CONFIG["SCHEMA_NULLS"],
                "base_acl": sub_conf["acl"]
            }
        }

        # NOTE: Cancellation point
        if utils.read_table("status", source_id).get("status",
                                                     {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        # Extract data
        utils.update_status(source_id, "extracting", "P", except_on_fail=True)
        try:
            extract_res = start_extractors(local_path, extract_params)
            if not extract_res["success"]:
                utils.update_status(source_id,
                                    "extracting",
                                    "F",
                                    text=extract_res["error"],
                                    except_on_fail=True)
                return
            dataset = extract_res["dataset"]
            num_records = extract_res["num_records"]
            num_groups = extract_res["num_groups"]
            extensions = extract_res["extensions"]
        except Exception as e:
            utils.update_status(source_id,
                                "extracting",
                                "F",
                                text=repr(e),
                                except_on_fail=True)
            utils.complete_submission(source_id)
            return
        else:
            utils.modify_status_entry(source_id, {"extensions": extensions})
            # If nothing in dataset, panic
            if not dataset:
                utils.update_status(source_id,
                                    "extracting",
                                    "F",
                                    text="Could not process dataset entry",
                                    except_on_fail=True)
                utils.complete_submission(source_id)
                return
            # If not extracting, show status as skipped
            # Also check if records were extracted inappropriately, flag error in log
            elif sub_conf.get("no_extract"):
                if num_records != 0:
                    logger.error(
                        "{}: Records extracted with no_extract flag ({} records)"
                        .format(source_id, num_records))
                utils.update_status(source_id,
                                    "extracting",
                                    "N",
                                    except_on_fail=True)
            else:
                utils.update_status(
                    source_id,
                    "extracting",
                    "M",
                    text=("{} metadata records extracted out of {} file groups"
                          .format(num_records, num_groups)),
                    except_on_fail=True)
            logger.debug("{}: {} entries extracted".format(
                source_id, num_records + 1))

        # NOTE: Cancellation point
        if utils.read_table("status", source_id).get("status",
                                                     {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        ###################
        #  Curation step  #
        ###################
        # Trigger curation if required
        if sub_conf.get("curation"):
            utils.update_status(source_id,
                                "curation",
                                "P",
                                except_on_fail=True)
            # Create curation task in curation table
            with open(feedstock_file) as f:
                # Discard dataset entry
                f.readline()
                # Save first few records
                # Append the json-loaded form of records
                # The number of records should be at most the default number,
                # and fewer if fewer are present
                curation_records = []
                for _ in range(
                        min(CONFIG["NUM_CURATION_RECORDS"], num_records)):
                    curation_records.append(json.loads(f.readline()))
            curation_dataset = deepcopy(dataset)
            # Numbers can be extracted into Decimal by DynamoDB, which causes JSON errors
            curation_dataset["mdf"].pop("scroll_id", None)
            curation_dataset["mdf"].pop("version", None)
            curation_task = {
                "source_id": source_id,
                "allowed_curators": sub_conf.get("permission_groups",
                                                 sub_conf["acl"]),
                # Store the sanitized copy created above
                "dataset": json.dumps(curation_dataset),
                "sample_records": json.dumps(curation_records),
                "submission_info": sub_conf,
                "extraction_summary":
                    ("{} records were extracted out of {} groups from {} files"
                     .format(num_records, num_groups, num_files)),
                "curation_start_date": str(datetime.today())
            }
            # If no allowed curators or public allowed, set to public
            if (not curation_task["allowed_curators"]
                    or "public" in curation_task["allowed_curators"]):
                curation_task["allowed_curators"] = ["public"]

            # Create task in database
            create_res = utils.create_curation_task(curation_task)
            if not create_res["success"]:
                utils.update_status(source_id,
                                    "curation",
                                    "F",
                                    text=create_res.get(
                                        "error",
                                        "Unable to create curation task"),
                                    except_on_fail=True)
                return

            # Save state
            os.makedirs(CONFIG["CURATION_DATA"], exist_ok=True)
            with open(curation_state_file, 'w') as save_file:
                state_data = {
                    "source_id": source_id,
                    "sub_conf": sub_conf,
                    "dataset": dataset
                }
                json.dump(state_data, save_file)
                logger.debug("{}: Saved state for curation".format(source_id))

            # Trigger hibernation
            utils.modify_status_entry(source_id, {"hibernating": True},
                                      except_on_fail=True)
            return
        else:
            utils.update_status(source_id,
                                "curation",
                                "N",
                                except_on_fail=True)

    # Returning from curation
    # Submission accepted
    elif sub_conf["curation"].startswith("Accept"):
        # Save curation message
        curation_message = sub_conf["curation"]
        # Load state
        with open(curation_state_file) as save_file:
            state_data = json.load(save_file)
            # Verify source_ids match
            if state_data["source_id"] != source_id:
                logger.error("State data incorrect: '{}' is not '{}'".format(
                    state_data["source_id"], source_id))
                utils.update_status(source_id,
                                    "curation",
                                    "F",
                                    text="Submission corrupted",
                                    except_on_fail=True)
                return
            # Load state variables back
            sub_conf = state_data["sub_conf"]
            dataset = state_data["dataset"]
        logger.debug("{}: Loaded state from curation".format(source_id))
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            utils.update_status(
                source_id,
                "curation",
                "F",
                text="Unable to cleanly load curation information",
                except_on_fail=True)
            return

        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            utils.update_status(source_id,
                                "curation",
                                "F",
                                text=delete_res.get("error",
                                                    "Curation cleanup failed"),
                                except_on_fail=True)
            return
        utils.update_status(source_id,
                            "curation",
                            "M",
                            text=curation_message,
                            except_on_fail=True)
    # Submission rejected
    elif sub_conf["curation"].startswith("Reject"):
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            logger.error(
                "{}: Unable to delete curation state file '{}'".format(
                    source_id, curation_state_file))
        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            logger.error(
                "{}: Unable to delete rejected curation from database: {}".
                format(source_id, delete_res.get("error")))

        utils.update_status(source_id,
                            "curation",
                            "F",
                            text=sub_conf["curation"],
                            except_on_fail=True)
        return
    # Curation invalid
    else:
        utils.update_status(source_id,
                            "curation",
                            "F",
                            text="Unknown curation state: '{}'".format(
                                sub_conf["curation"]),
                            except_on_fail=True)
        return

    ###################
    #  Post-curation  #
    ###################
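    # Note on the single-letter status codes used below (meanings inferred
    # from this example, not from official documentation): "P" = step in
    # progress, "S" = success, "N" = step skipped/not requested, "M" =
    # success with a message, "L" = success with a link, "R" = non-fatal
    # error (processing continues), "T" = error on one target while the step
    # keeps going, and "F" = fatal failure that aborts the submission.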

    # Integrations
    service_res = {}

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status",
                                                 {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    # MDF Search (mandatory)
    utils.update_status(source_id, "ingest_search", "P", except_on_fail=True)
    search_config = sub_conf["services"].get("mdf_search", {})
    try:
        search_args = {
            "feedstock_file": feedstock_file,
            "source_id": source_id,
            "index": search_config.get("index", CONFIG["INGEST_INDEX"]),
            "delete_existing": True,
            "batch_size": CONFIG["SEARCH_BATCH_SIZE"]
        }
        search_res = utils.search_ingest(**search_args)
        if not search_res["success"]:
            utils.update_status(source_id,
                                "ingest_search",
                                "F",
                                text="; ".join(search_res["errors"]),
                                except_on_fail=True)
            return
    except Exception as e:
        utils.update_status(source_id,
                            "ingest_search",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return
    else:
        # Handle errors
        if len(search_res["errors"]) > 0:
            utils.update_status(
                source_id,
                "ingest_search",
                "F",
                text=(
                    "{} batches of records failed to ingest (up to {} records "
                    "total)").format(len(search_res["errors"]),
                                     (len(search_res["errors"]) *
                                      CONFIG["SEARCH_BATCH_SIZE"])),
                except_on_fail=True)
            utils.complete_submission(source_id)
            return

        utils.update_status(source_id,
                            "ingest_search",
                            "S",
                            except_on_fail=True)
        os.remove(feedstock_file)
        service_res["mdf_search"] = "This dataset was ingested to MDF Search."

    # Move files to data_destinations
    if sub_conf.get("data_destinations"):
        utils.update_status(source_id,
                            "ingest_backup",
                            "P",
                            except_on_fail=True)
        try:
            for backup_res in utils.backup_data(
                    mdf_transfer_client,
                    storage_loc=sub_conf["canon_destination"],
                    backup_locs=sub_conf["data_destinations"],
                    acl=sub_conf["storage_acl"]):
                if not backup_res["success"]:
                    msg = "During data backup: " + backup_res.get(
                        "error", "Unknown error")
                    utils.update_status(source_id,
                                        "ingest_backup",
                                        "T",
                                        text=msg,
                                        except_on_fail=True)
            if not backup_res["success"]:
                raise ValueError(backup_res.get("error"))
        except Exception as e:
            err_msg = "Destination backup failed: {}".format(str(e))
            utils.update_status(source_id,
                                "ingest_backup",
                                "F",
                                text=err_msg,
                                except_on_fail=True)
            return
        # On any complete failure, fail submission
        if not all([val["success"] is True for val in backup_res.values()]):
            err_msg = "; ".join([
                "'{}' failed: {}".format(k, v["error"])
                for k, v in backup_res.items() if v["success"] is not True
            ])
            utils.update_status(source_id,
                                "ingest_backup",
                                "F",
                                text=err_msg,
                                except_on_fail=True)
            return
        # On an error with a successful Transfer, notify user but continue
        elif not all([val["error"] == "" for val in backup_res.values()]):
            err_msg = "; ".join([
                "on '{}': {}".format(k, v["error"])
                for k, v in backup_res.items() if v["error"]
            ])
            utils.update_status(source_id,
                                "ingest_backup",
                                "R",
                                text=err_msg,
                                except_on_fail=True)
        else:
            utils.update_status(source_id,
                                "ingest_backup",
                                "S",
                                except_on_fail=True)
    else:
        utils.update_status(source_id,
                            "ingest_backup",
                            "N",
                            except_on_fail=True)

    # MDF Publish
    if sub_conf["services"].get("mdf_publish"):
        publish_conf = sub_conf["services"]["mdf_publish"]

        # Data already moved to canon dest as a requirement of success so far

        # Mint DOI
        try:
            # Create DOI and add to dataset DC
            dataset["dc"]["identifier"] = {
                "identifier": utils.make_dc_doi(test=publish_conf["doi_test"]),
                "identifierType": "DOI"
            }
            # Add publication dates and publisher
            dataset["dc"]["publisher"] = "Materials Data Facility"
            dataset["dc"]["publicationYear"] = datetime.now().year
            if not dataset["dc"].get("dates"):
                dataset["dc"]["dates"] = []
            dataset["dc"]["dates"].append({
                "date": str(datetime.now().date()),
                "dateType": "Accepted"
            })
            landing_page = CONFIG["DATASET_LANDING_PAGE"].format(source_id)
            mdf_publish_res = utils.datacite_mint_doi(
                dataset["dc"], test=publish_conf["doi_test"], url=landing_page)
        except Exception as e:
            logger.error("DOI minting exception: {}".format(repr(e)))
            utils.update_status(source_id,
                                "ingest_publish",
                                "F",
                                text="DOI minting failed",
                                except_on_fail=True)
            return
        else:
            if not mdf_publish_res["success"]:
                logger.error("DOI minting failed: {}".format(
                    mdf_publish_res["error"]))
                utils.update_status(source_id,
                                    "ingest_publish",
                                    "F",
                                    text="Unable to mint DOI for publication",
                                    except_on_fail=True)
                return

        utils.update_status(
            source_id,
            "ingest_publish",
            "L",
            text=("Dataset published though MDF Publish with DOI '{}'".format(
                dataset["dc"]["identifier"]["identifier"])),
            link=landing_page,
            except_on_fail=True)
        service_res["mdf_publish"] = landing_page

    else:
        utils.update_status(source_id,
                            "ingest_publish",
                            "N",
                            except_on_fail=True)

    # Citrine (skip if not extracted)
    if sub_conf["services"].get("citrine") and not sub_conf.get("no_extract"):
        utils.update_status(source_id,
                            "ingest_citrine",
                            "P",
                            except_on_fail=True)

        # Get old Citrine dataset version, if exists
        scan_res = utils.scan_table(table_name="status",
                                    fields=["source_id", "citrine_id"],
                                    filters=[("source_name", "==",
                                              source_info["source_name"]),
                                             ("citrine_id", "!=", None)])
        if not scan_res["success"]:
            logger.error("Status scan failed: {}".format(scan_res["error"]))
        old_cit_subs = scan_res.get("results", [])
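        # If several prior submissions carry a citrine_id, the highest
        # (presumably most recent) one is selected below and passed to
        # citrine_upload, apparently so the existing Citrine dataset is
        # updated rather than duplicated.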
        if len(old_cit_subs) == 0:
            old_citrine_id = None
        elif len(old_cit_subs) == 1:
            old_citrine_id = old_cit_subs[0]["citrine_id"]
        else:
            old_citrine_id = max([sub["citrine_id"] for sub in old_cit_subs])

        try:
            # Check for PIFs to ingest
            cit_path = os.path.join(service_data, "citrine")
            if len(os.listdir(cit_path)) > 0:
                cit_res = utils.citrine_upload(
                    cit_path,
                    CONFIG["CITRINATION_API_KEY"],
                    dataset,
                    old_citrine_id,
                    public=sub_conf["services"]["citrine"].get("public", True))
            else:
                cit_res = {
                    "success": False,
                    "error": "No PIFs were generated from this dataset",
                    "success_count": 0,
                    "failure_count": 0
                }
        except Exception as e:
            utils.update_status(source_id,
                                "ingest_citrine",
                                "R",
                                text=str(e),
                                except_on_fail=True)
        else:
            if not cit_res["success"]:
                if cit_res.get("error"):
                    text = cit_res["error"]
                elif cit_res.get("failure_count"):
                    text = "All {} PIFs failed to upload".format(
                        cit_res["failure_count"])
                elif cit_res.get("failure_count") == 0:
                    text = "No PIFs were found"
                    logger.warning("{}: PIFs not found!".format(source_id))
                else:
                    text = "An error prevented PIF uploading"
                utils.update_status(source_id,
                                    "ingest_citrine",
                                    "R",
                                    text=text,
                                    except_on_fail=True)
            else:
                text = "{}/{} PIFs uploaded successfully".format(
                    cit_res["success_count"],
                    cit_res["success_count"] + cit_res["failure_count"])
                link = CONFIG["CITRINATION_LINK"].format(
                    cit_ds_id=cit_res["cit_ds_id"])
                utils.update_status(source_id,
                                    "ingest_citrine",
                                    "L",
                                    text=text,
                                    link=link,
                                    except_on_fail=True)
                stat_res_2 = utils.modify_status_entry(
                    source_id, {"citrine_id": cit_res["cit_ds_id"]})
                if not stat_res_2["success"]:
                    raise ValueError(str(stat_res_2))
                service_res["citrine"] = link
    else:
        utils.update_status(source_id,
                            "ingest_citrine",
                            "N",
                            except_on_fail=True)

    # MRR
    if sub_conf["services"].get("mrr"):
        utils.update_status(source_id, "ingest_mrr", "P", except_on_fail=True)
        try:
            if (isinstance(sub_conf["services"]["mrr"], dict)
                    and sub_conf["services"]["mrr"].get("test")):
                mrr_title = "TEST_" + dataset["dc"]["titles"][0]["title"]
            else:
                mrr_title = dataset["dc"]["titles"][0]["title"]
            mrr_contributors = ""
            for author in dataset["dc"]["creators"]:
                mrr_contributors += CONFIG["MRR_CONTRIBUTOR"].format(
                    name=(author.get("givenName", "") + " " +
                          author.get("familyName", "")),
                    affiliation=author.get("affiliation", ""))
            mrr_description = ""
            for desc in dataset["dc"].get("descriptions", []):
                mrr_description += desc["description"] + " "
            # Must add at least one subject to MRR entry
            mrr_subjects = "<subject>MDF Dataset</subject>"
            for subj in dataset["dc"].get("subjects", []):
                mrr_subjects += "<subject>" + subj["subject"] + "</subject>"
            mrr_entry = {
                "title": dataset["dc"]["titles"][0]["title"],
                "template": CONFIG["MRR_SCHEMA"],
                "xml_content": CONFIG["MRR_TEMPLATE"].format(
                    title=mrr_title,
                    publisher=dataset["dc"]["publisher"],
                    contributors=mrr_contributors,
                    contact_name=dataset["dc"]["creators"][0]["creatorName"],
                    description=mrr_description,
                    subjects=mrr_subjects,
                    landing_page=CONFIG["DATASET_LANDING_PAGE"].format(
                        source_id))
            }
        except Exception as e:
            utils.update_status(source_id,
                                "ingest_mrr",
                                "R",
                                text="Unable to create MRR metadata:" +
                                repr(e),
                                except_on_fail=True)
        else:
            try:
                mrr_res_raw = requests.post(CONFIG["MRR_URL"],
                                            auth=(CONFIG["MRR_USERNAME"],
                                                  CONFIG["MRR_PASSWORD"]),
                                            data=mrr_entry)
                try:
                    mrr_res = mrr_res_raw.json()
                except json.JSONDecodeError:
                    raise ValueError("Invalid MRR response: {}".format(
                        mrr_res_raw.content))

                if mrr_res_raw.status_code not in [201, 202]:
                    raise ValueError(
                        "MRR ingest failed with error code {}: '{}'".format(
                            mrr_res_raw.status_code, mrr_res))
            except Exception as e:
                utils.update_status(source_id,
                                    "ingest_mrr",
                                    "R",
                                    text="Unable to submit MRR entry: " +
                                    repr(e),
                                    except_on_fail=True)
            else:
                try:
                    mrr_id = mrr_res.get("id")
                    if not mrr_id:
                        raise ValueError("MRR entry has no ID")
                except Exception:
                    utils.update_status(source_id,
                                        "ingest_mrr",
                                        "R",
                                        text=mrr_res.get(
                                            "message", "Unknown MRR failure"),
                                        except_on_fail=True)
                else:
                    text = "Dataset successfully registered with the MRR"
                    mrr_link = CONFIG["MRR_LINK"].format(mrr_id)
                    utils.update_status(source_id,
                                        "ingest_mrr",
                                        "L",
                                        text=text,
                                        link=mrr_link,
                                        except_on_fail=True)
                    service_res["mrr"] = mrr_link
    else:
        utils.update_status(source_id, "ingest_mrr", "N", except_on_fail=True)

    # Dataset update, start cleanup
    utils.update_status(source_id, "ingest_cleanup", "P", except_on_fail=True)

    dataset["services"] = service_res
    ds_update = utils.update_search_entries(
        search_config.get("index", CONFIG["INGEST_INDEX"]),
        entries=[dataset],
        overwrite=False)
    if not ds_update["success"]:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=ds_update.get("error",
                                               "Unable to update dataset"),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cleanup
    try:
        fin_res = utils.complete_submission(source_id,
                                            cleanup=CONFIG["FINAL_CLEANUP"])
    except Exception as e:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=repr(e),
                            except_on_fail=True)
        return
    if not fin_res["success"]:
        utils.update_status(source_id,
                            "ingest_cleanup",
                            "F",
                            text=fin_res["error"],
                            except_on_fail=True)
        return
    utils.update_status(source_id, "ingest_cleanup", "S", except_on_fail=True)

    logger.debug("{}: Ingest complete".format(source_id))
    return {"success": True, "source_id": source_id}
Example No. 15
def getTokens():

    tokens = None
    try:
        # if we already have tokens, load and use them
        tokens = load_tokens_from_file(p.opt["globusTokenFile"])
    except Exception:
        # no saved tokens (or unreadable file); fall through to a fresh login
        pass

    if not tokens:
        # if we need to get tokens, start the Native App authentication process
        tokens = do_native_app_authentication(CLIENT_ID, REDIRECT_URI, SCOPES)

        try:
            save_tokens_to_file(p.opt["globusTokenFile"], tokens)
        except Exception:
            # saving tokens is best-effort; failure just means re-login next run
            pass

    transfer_tokens = tokens['transfer.api.globus.org']

    auth_client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)

    authorizer = globus_sdk.RefreshTokenAuthorizer(
        transfer_tokens['refresh_token'],
        auth_client,
        access_token=transfer_tokens['access_token'],
        expires_at=transfer_tokens['expires_at_seconds'],
        on_refresh=update_tokens_file_on_refresh)

    transfer = globus_sdk.TransferClient(authorizer=authorizer)

    myproxy_lifetime = 720  # in hours.  What's the maximum?
    try:
        r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                           if_expires_in=3600)
        while (r["code"] == "AutoActivationFailed"):
            print(
                "Endpoint requires manual activation, please use your UCAS name/password for this activation. "
                "You can activate via the command line or via web browser:\n"
                "WEB BROWSER -- Open the following URL in a browser to activate the "
                "endpoint:")
            print(
                f"https://app.globus.org/file-manager?origin_id={p.opt['archiveEndPoint']}"
            )
            print("CMD LINE -- run this from your shell: ")
            print(
                f"globus endpoint activate --myproxy --myproxy-lifetime {myproxy_lifetime} {p.opt['archiveEndPoint']}"
            )
            input("Press ENTER after activating the endpoint:")
            r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                               if_expires_in=3600)

    except globus_sdk.exc.GlobusAPIError as ex:
        print("endpoint_autoactivation failed.")
        print(ex)
        if ex.http_status == 401:
            sys.exit('Refresh token has expired. '
                     'Please delete refresh-tokens.json and try again.')
        else:
            raise ex

    # print out a directory listing from an endpoint
    #print("Looking at archive end point")
    #for entry in transfer.operation_ls(p.opt["archiveEndPoint"], path='/~/'):
    #    print(entry['name'] + ('/' if entry['type'] == 'dir' else ''))

    # revoke the access token that was just used to make requests against
    # the Transfer API to demonstrate that the RefreshTokenAuthorizer will
    # automatically get a new one
    #auth_client.oauth2_revoke_token(authorizer.access_token)
    # Allow a little bit of time for the token revocation to settle
    #time.sleep(1)
    # Verify that the access token is no longer valid
    #token_status = auth_client.oauth2_validate_token(
    #    transfer_tokens['access_token'])
    #assert token_status['active'] is False, 'Token was expected to be invalid.'

    #print('\nDoing a second directory listing with a new access token:')
    #for entry in transfer.operation_ls(p.opt["archiveEndPoint"], path='/~/'):
    #    print(entry['name'] + ('/' if entry['type'] == 'dir' else ''))

    local_ep = globus_sdk.LocalGlobusConnectPersonal()
    local_ep_id = local_ep.endpoint_id
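    # LocalGlobusConnectPersonal looks up the locally installed Globus
    # Connect Personal endpoint; endpoint_id is None if no local GCP
    # endpoint is configured on this machine.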

    #print("Looking at local end point")
    #for entry in transfer.operation_ls(local_ep_id):
    #    print(f"Local file: {entry['name']}")

    logging.info("BEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Transferring {item}")
        if not item_info["source"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['source']} must be absolute.  SKIPPING!"
            )
            continue
        if not item_info["destination"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['destination']} must be absolute.  SKIPPING!"
            )
            continue
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=item_info["destination"])
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Destination path ({item_info['destination']}) does not exist on archiveEndPoint."
            )
            logging.fatal(e)
            sys.exit(1)

        # get leaf dir from source, and add it to destination
        dirname, leaf = os.path.split(item_info['source'])
        if leaf == '':
            _, leaf = os.path.split(dirname)
        destination_directory = os.path.join(item_info['destination'],
                                             leaf) + '/'

        # Check if destination_dir already exists, and skip if so
        # TODO: add support to overwrite?
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=destination_directory)
            logging.error(
                f"Destination {destination_directory} already exists on archiveEndPoint.  SKIPPING!"
            )
            continue
        except globus_sdk.exc.TransferAPIError as e:
            if e.code != 'ClientError.NotFound':
                logging.fatal(
                    f"Can't ls {p.opt['archiveEndPoint']} : {destination_directory}"
                )
                logging.fatal(e)
                sys.exit(1)

        # create destination directory
        try:
            logging.info(
                f"Creating destination directory {destination_directory}")
            transfer.operation_mkdir(p.opt["archiveEndPoint"],
                                     destination_directory)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Can't mkdir {p.opt['archiveEndPoint']} : {destination_directory}"
            )
            logging.fatal(e)
            sys.exit(1)

        # TODO: set permissions for users to read dir
        #       look at https://github.com/globus/automation-examples/blob/master/share_data.py

        #tdata = globus_sdk.TransferData(transfer, local_ep_id, p.opt["archiveEndPoint"], label=item_info["transfer-label"])
        tdata = globus_sdk.TransferData(transfer, local_ep_id,
                                        p.opt["archiveEndPoint"])
        tdata.add_item(item_info["source"],
                       destination_directory,
                       recursive=True)
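        # (TransferData also accepts options such as label, sync_level, and
        # verify_checksum; only the defaults are used here.)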
        try:
            logging.info(
                f"Submitting transfer task - {item_info['transfer-label']}")
            task = transfer.submit_transfer(tdata)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal("Transfer task submission failed")
            logging.fatal(e)
            sys.exit(1)
        logging.info(f"Task ID: {task['task_id']}")
        logging.info(
            f"This task can be monitored via the Web UI: https://app.globus.org/activity/{task['task_id']}"
        )
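
# Optional follow-up sketch (not in the original script): block until a
# submitted task finishes using TransferClient.task_wait, then report the
# outcome. Pass in a TransferClient such as `transfer` above.
import logging


def wait_for_task(transfer_client, task_id, timeout=3600, polling_interval=30):
    """Poll a Globus transfer task until it completes or the timeout expires."""
    done = transfer_client.task_wait(task_id,
                                     timeout=timeout,
                                     polling_interval=polling_interval)
    if done:
        logging.info(f"Task {task_id} completed")
    else:
        logging.warning(f"Task {task_id} still active after {timeout} seconds")
    return done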