def _try_inject_s3_credentials(url):
    """
    Inject aws credentials into s3 url as
    s3://[aws_id]:[aws_key]:[bucket/][objectkey]

    If s3 url already contains secret key/id pairs, just return as is.
    """
    assert url.startswith('s3://')
    path = url[5:]
    # Check if the path already contains credentials
    tokens = path.split(':')
    # If there are two ':', it is possible that we have already injected credentials
    if len(tokens) == 3:
        # Edge case: there are exactly two ':'s in the object key which is a false alarm.
        # We prevent this by checking that '/' is not in the assumed key and id.
        if ('/' not in tokens[0]) and ('/' not in tokens[1]):
            return url
    # S3 url does not contain secret key/id pair, query the environment variables
    (k, v) = _get_aws_credentials()
    return 's3://' + k + ':' + v + ':' + path
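
# Illustrative sketch (not part of the module's API): assuming _get_aws_credentials()
# returns the (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) pair from the environment,
# the helper behaves roughly as follows:
#
#   _try_inject_s3_credentials('s3://mybucket/foo')
#   # -> 's3://<AWS_ACCESS_KEY_ID>:<AWS_SECRET_ACCESS_KEY>:mybucket/foo'
#
#   _try_inject_s3_credentials('s3://ID:KEY:mybucket/foo')
#   # -> 's3://ID:KEY:mybucket/foo'  (already credentialed, returned unchanged)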

def make_internal_url(url):
    """
    Takes a user-input url string and translates it into a url relative to the
    server process.

    - URL to a local location begins with "local://" or has no "*://" modifier.
      If the server is local, returns the absolute path of the url.
      For example: "local:///tmp/foo" -> "/tmp/foo" and "./foo" -> os.path.abspath("./foo").
      If the server is not local, raises ValueError.
    - URL to a server location begins with "remote://". Returns the absolute path
      after the "remote://" modifier. For example: "remote:///tmp/foo" -> "/tmp/foo".
    - URL to an s3 location begins with "s3://": Returns the s3 URL with credentials
      filled in using graphlab.aws.get_aws_credential(). For example:
      "s3://mybucket/foo" -> "s3://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY:mybucket/foo".
    - URL to other remote locations, e.g. http://, will remain as is.
    - Expands ~ to $HOME.

    Parameters
    ----------
    url : string
        A URL (as described above).

    Raises
    ------
    ValueError
        If a bad url is provided.
    """
    if not url:
        raise ValueError('Invalid url: %s' % url)

    # The final file path on server.
    path_on_server = None

    # Try to split the url into (protocol, path).
    urlsplit = url.split("://")
    if len(urlsplit) == 2:
        protocol, path = urlsplit
        if not path:
            raise ValueError('Invalid url: %s' % url)
        if protocol in ['http', 'https']:
            # protocol is a remote url not on server, just return
            return url
        elif protocol == 'hdfs':
            if isinstance(_glconnect.get_server(), _server.LocalServer) and not _server._get_hadoop_class_path():
                raise ValueError("HDFS URL is not supported because Hadoop was not found. Please make hadoop available from PATH or set the environment variable HADOOP_HOME and try again.")
            else:
                return url
        elif protocol == 's3':
            # Fill in AWS credentials from the environment unless the url already
            # contains a secret key/id pair (see _try_inject_s3_credentials).
            return _try_inject_s3_credentials(url)
        elif protocol == 'remote':
            # url for files on the server
            path_on_server = path
        elif protocol == 'local':
            # url for files on local client, check if we are connecting to local server
            if isinstance(_glconnect.get_server(), _server.LocalServer):
                path_on_server = path
            else:
                raise ValueError('Cannot use local URL when connecting to a remote server.')
        else:
            raise ValueError('Invalid url protocol %s. Supported url protocols are: remote://, local://, s3://, https:// and hdfs://' % protocol)
    elif len(urlsplit) == 1:
        # Expand ~ to $HOME.
        url = _os.path.expanduser(url)
        # url for files on local client, check if we are connecting to local server
        if isinstance(_glconnect.get_server(), _server.LocalServer):
            path_on_server = url
        else:
            raise ValueError('Cannot use local URL when connecting to a remote server.')
    else:
        raise ValueError('Invalid url: %s' % url)

    if path_on_server:
        return _os.path.abspath(_os.path.expanduser(path_on_server))
    else:
        raise ValueError('Invalid url: %s' % url)
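
# Illustrative sketch (assumes a connection to a local server and that the AWS
# credential environment variables are set); make_internal_url then translates
# urls roughly as follows:
#
#   make_internal_url('local:///tmp/foo')   # -> '/tmp/foo'
#   make_internal_url('./foo')              # -> os.path.abspath('./foo')
#   make_internal_url('remote:///tmp/foo')  # -> '/tmp/foo'
#   make_internal_url('s3://mybucket/foo')
#   # -> 's3://<AWS_ACCESS_KEY_ID>:<AWS_SECRET_ACCESS_KEY>:mybucket/foo'
#   make_internal_url('http://example.com/foo')  # returned as is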