def _copy_predictive_object_files(source_path, target_path, is_dir,
                                  src_credentials, tgt_credentials):
    '''
    Copy either a file or a folder from the source location to the target
    location.
    '''
    # Clean up the existing target path locally if it exists (file or folder)
    if _file_util.is_local_path(target_path) and _os.path.exists(target_path):
        if _os.path.isdir(target_path):
            _shutil.rmtree(target_path)
        else:
            _os.remove(target_path)

    if _file_util.is_s3_path(source_path) and _file_util.is_s3_path(target_path):
        # Compare credentials
        _check_aws_credentials(src_credentials, tgt_credentials, source_path)

        # Intra-S3 copy of the model
        _file_util.intra_s3_copy_model(source_path, target_path, is_dir,
                                       tgt_credentials)

    elif _file_util.is_local_path(source_path):
        _file_util.copy_from_local(source_path, target_path, is_dir=is_dir)

    else:
        tmp_dir = _tempfile.mkdtemp(prefix='copy_predictive_object')
        try:
            # Download to local first
            local_path = _os.path.join(tmp_dir, 'temp_po_file')
            if _file_util.is_s3_path(source_path):
                _file_util.download_from_s3(
                    source_path, local_path,
                    is_dir=is_dir,
                    aws_credentials=src_credentials,
                    silent=False)
            elif _file_util.is_hdfs_path(source_path):
                _file_util.download_from_hdfs(source_path, local_path,
                                              is_dir=False)
            else:
                raise RuntimeError('Unsupported file system type: %s' % source_path)

            # Upload from local to remote
            if _file_util.is_s3_path(target_path):
                _file_util.upload_to_s3(local_path, target_path,
                                        is_dir=is_dir,
                                        aws_credentials=tgt_credentials,
                                        silent=False)
            elif _file_util.is_hdfs_path(target_path):
                _file_util.hdfs_mkdir(target_path)
                _file_util.upload_to_hdfs(local_path, target_path,
                                          force=True, silent=False)
            else:
                _file_util.upload_to_local(local_path, target_path,
                                           is_dir=is_dir, silent=False)
        finally:
            _shutil.rmtree(tmp_dir)

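# Hypothetical usage sketch (not part of the original source): copying a saved
# predictive object folder from S3 to HDFS with _copy_predictive_object_files.
# The paths and credential values below are placeholders.
_example_src_credentials = {'aws_access_key_id': '<access-key>',
                            'aws_secret_access_key': '<secret-key>'}
_copy_predictive_object_files(
    source_path='s3://example-bucket/predictive_objects/my_model',
    target_path='hdfs://namenode:8020/user/me/predictive_objects/my_model',
    is_dir=True,
    src_credentials=_example_src_credentials,
    tgt_credentials=None)
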
def _test_url(self, file_path):
    if _file_util.is_hdfs_path(file_path):
        return _file_util.hdfs_test_url(file_path, 'e',
                                        self.environment.hadoop_conf_dir)
    elif _file_util.is_s3_path(file_path):
        return _file_util.s3_test_url(file_path,
                                      self.environment.ec2_config.get_credentials())
    else:
        return _os.path.exists(file_path)

def _deserialize_output(self, task):
    """
    Deserialize the output from a task.

    Parameters
    ----------
    task
        Task definition of interest.

    Returns
    -------
    The output of the run-time task associated with the task definition.
    """
    filepath = self._task_output_paths[task]

    non_hdfs_file_path = filepath

    # The unpickler has no support for passing in an additional
    # HADOOP_CONF_DIR, so we download the HDFS folder first before calling
    # the unpickler
    if _file_util.is_hdfs_path(filepath):
        non_hdfs_file_path = _make_temp_directory("job_output_")
        _file_util.download_from_hdfs(
            filepath, non_hdfs_file_path,
            hadoop_conf_dir=self.environment.hadoop_conf_dir,
            is_dir=True)

    unpickler = gl_pickle.GLUnpickler(non_hdfs_file_path)

    # We cannot delete this temporary file path because SFrames lazily load
    # their content from disk.  The temporary folder will be removed
    # eventually when the Python session goes away.
    return unpickler.load()

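# A minimal sketch (hypothetical, not part of the original source) of the
# writer side that _deserialize_output pairs with: the task output is written
# with gl_pickle.GLPickler into the path recorded in self._task_output_paths,
# assuming a local output path.
def _serialize_output(self, task, output):
    pickler = gl_pickle.GLPickler(self._task_output_paths[task])
    pickler.dump(output)
    pickler.close()
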
def _read_commander_init_status_file(self):
    commander_file_path = self._get_commander_file_path()
    local_file_name = _tempfile.mktemp(prefix='dml_file_')
    try:
        if _file_util.is_hdfs_path(commander_file_path):
            _file_util.download_from_hdfs(
                commander_file_path, local_file_name,
                hadoop_conf_dir=self.environment.hadoop_conf_dir)
        elif _file_util.is_s3_path(commander_file_path):
            _file_util.download_from_s3(
                commander_file_path, local_file_name,
                aws_credentials=self.environment.get_credentials(),
                silent=True)

        with open(local_file_name, 'r') as f:
            status_json = _json.load(f)
            port = status_json['port']
            host_name = status_json['host_name']
            if port > 0:
                return 'http://%s:%s' % (host_name, port)
            else:
                return None
    except:
        # Ignore the exception; we will fail after a few retries
        pass
    finally:
        if _os.path.exists(local_file_name):
            _os.remove(local_file_name)

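# Hypothetical illustration (not part of the original source) of the commander
# init status file consumed by _read_commander_init_status_file and the URL it
# derives; the host name and port below are made up.
_example_status = {'host_name': 'ip-10-0-0-12.ec2.internal', 'port': 9004}
if _example_status['port'] > 0:
    _commander_url = 'http://%s:%s' % (_example_status['host_name'],
                                       _example_status['port'])
    # _commander_url == 'http://ip-10-0-0-12.ec2.internal:9004'
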
def __get_log_file_content(self, url, handler):
    """
    Get and return the log file content
    """
    log_file_path = str(handler.get_argument("log", None))
    job = self.__load_job()
    content = ""
    max_size = long(1048576)  # max size is 1 MB
    status_code = 200

    if log_file_path:
        try:
            if _file_util.is_local_path(log_file_path):
                if _os.path.getsize(log_file_path) > max_size:
                    raise RuntimeError(
                        "Cannot read file greater than max size.")
                else:
                    content = self.__load_local_log_file(log_file_path)
            elif _file_util.is_s3_path(log_file_path):
                content = _file_util.read_file_to_string_s3(
                    log_file_path, max_size, job.environment.get_credentials())
            elif _file_util.is_hdfs_path(log_file_path):
                content = _file_util.read_file_to_string_hdfs(
                    log_file_path, max_size, job.environment.hadoop_conf_dir)
            else:
                status_code = 404
                content = "Log file path (%s) is not valid." % log_file_path
        except RuntimeError:
            status_code = 413
            content = "File size too large. Please load log file manually at %s." % log_file_path

    handler.set_status(status_code)
    handler.set_header("Content-Type", "text/plain")
    handler.write(content)

def __get_log_file_path_list(self, url, handler):
    """
    Returns a list of log file paths for this job
    """
    job = self.__load_job()

    # Get the directory that contains all the logs
    log_file_path = job.get_log_file_path()
    path_list = []

    # List the directory to get the full path to each log
    if _file_util.is_s3_path(log_file_path):
        ec2_log_list = _file_util.list_s3(
            log_file_path, job.environment.get_credentials())
        if ec2_log_list and len(ec2_log_list) > 0:
            path_list.extend([log['path'] for log in ec2_log_list])
    elif _file_util.is_hdfs_path(log_file_path):
        hdfs_log_list = _file_util.list_hdfs(
            log_file_path, job.environment.hadoop_conf_dir)
        if hdfs_log_list and len(hdfs_log_list) > 0:
            path_list.extend([log['path'] for log in hdfs_log_list])
    else:
        path_list.append(log_file_path)

    handler.write({'log_file_list': path_list})

def save(self, path, aws_credentials={}):
    """
    Save the predictive object to the given path.

    Parameters
    ----------
    path : str
        The location to save the predictive object to.

    aws_credentials : dict, optional
        AWS credentials, only used when saving to an S3 path.
    """
    # only support saving to local, S3 or HDFS path for now
    if not (fu.is_s3_path(path) or fu.is_local_path(path) or
            fu.is_hdfs_path(path)):
        raise RuntimeError("Only local, S3 and HDFS paths are supported, cannot "
                           "save predictive object to path %s." % path)

    if fu.is_local_path(path) and os.path.exists(path):
        logging.warning("Overwriting existing file '%s' when saving predictive object" % path)
        rm_fn = os.remove if os.path.isfile(path) else shutil.rmtree
        rm_fn(path)

    if fu.is_local_path(path):
        self._save_local(path)
    else:
        self._save_remote(path, aws_credentials)

    tracker = _mt._get_metric_tracker()
    tracker.track('deploy.predictive_service.predictive_object',
                  value=1,
                  properties={'type': self.__class__.__name__})

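# Hypothetical usage sketch (not part of the original source): saving a
# predictive object instance to S3.  The variable name, bucket and credential
# values are placeholders.
my_predictive_object.save(
    's3://example-bucket/predictive_objects/my_model',
    aws_credentials={'aws_access_key_id': '<access-key>',
                     'aws_secret_access_key': '<secret-key>'})
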
def _load_remote(cls, path, schema_version, aws_credentials={}):
    temp_dir = _tempfile.mkdtemp(prefix='predictive_object_')
    if fu.is_s3_path(path):
        fu.download_from_s3(path, temp_dir, is_dir=(schema_version > 2),
                            aws_credentials=aws_credentials)
    elif fu.is_hdfs_path(path):
        fu.download_from_hdfs(path, temp_dir, is_dir=(schema_version > 2))
    else:
        raise RuntimeError('Only S3 and HDFS paths are supported as the '
                           'Predictive Object saving location!')

    return cls._load_local(temp_dir)

def _load_remote(cls, path, schema_version, aws_credentials={}):
    temp_dir = _gl.util._make_temp_filename(prefix='predictive_policy_')
    if _file_util.is_s3_path(path):
        _file_util.download_from_s3(path, temp_dir, is_dir=True,
                                    aws_credentials=aws_credentials,
                                    silent=True)
    elif _file_util.is_hdfs_path(path):
        _file_util.download_from_hdfs(path, temp_dir, is_dir=True)
    else:
        raise RuntimeError('Only S3 and HDFS paths are supported as the '
                           'Predictive Object saving location!')

    return cls._load_local(temp_dir)

def _upload_folder_to_remote(self, local, remote):
    if _file_util.is_s3_path(remote):
        _file_util.upload_to_s3(
            local, remote, is_dir=True,
            aws_credentials=self.environment.get_credentials(),
            silent=True)
    elif _file_util.is_hdfs_path(remote):
        _file_util.upload_folder_to_hdfs(
            local, remote, self.environment.hadoop_conf_dir)

def _save_remote(self, path, aws_credentials):
    '''
    Save the current predictive object to S3 or HDFS.
    '''
    tempdir = _tempfile.mkdtemp(prefix='predictive_object_')
    try:
        self._save_local(tempdir)
        if fu.is_s3_path(path):
            fu.upload_to_s3(tempdir, path, is_dir=True,
                            aws_credentials=aws_credentials)
        elif fu.is_hdfs_path(path):
            fu.hdfs_mkdir(path)
            fu.upload_to_hdfs(tempdir + '/*', path)
    finally:
        shutil.rmtree(tempdir)

def _save_remote(self, path, aws_credentials):
    tempdir = _gl.util._make_temp_filename(prefix='predictive_policy_')
    try:
        self._save_local(tempdir)
        if _file_util.is_s3_path(path):
            _file_util.upload_to_s3(tempdir, path, is_dir=True,
                                    aws_credentials=aws_credentials)
        elif _file_util.is_hdfs_path(path):
            _file_util.hdfs_mkdir(path)
            _file_util.upload_to_hdfs(tempdir + '/*', path)
    finally:
        _shutil.rmtree(tempdir)

def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
    '''
    Read a remote file to a local temporary file, use parser_func to parse the
    content, and return the parsed result.

    This function is used for parsing state and progress files from either
    local, S3 or HDFS paths.  If any exception occurs, returns None.
    '''
    file_is_local = _file_util.is_local_path(file_name)
    local_file_name = file_name if file_is_local else _tempfile.mktemp(prefix='job-status-')

    try:
        try:
            if test_url and not self._test_url(file_name):
                if not silent:
                    __LOGGER__.info("File %s is not available yet." % file_name)
                return None

            if _file_util.is_hdfs_path(file_name):
                _file_util.download_from_hdfs(
                    hdfs_path=file_name,
                    local_path=local_file_name,
                    hadoop_conf_dir=self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(file_name):
                _file_util.download_from_s3(
                    s3_path=file_name,
                    local_path=local_file_name,
                    is_dir=False,
                    aws_credentials=self.environment.ec2_config.get_credentials(),
                    silent=silent)
        except Exception as e:
            # It is OK if the status file is not ready yet while the job is
            # being prepared
            if not silent:
                __LOGGER__.warning("Exception encountered when trying to download"
                                   " file from %s, error: %s" % (file_name, e))
            return None

        try:
            # Parse the local file
            return parser_func(local_file_name)
        except Exception as e:
            __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, e))
            return None
    finally:
        if (not file_is_local) and _os.path.exists(local_file_name):
            _os.remove(local_file_name)

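# Hypothetical sketch (not part of the original source): a sibling method could
# reuse _load_file_and_parse to read a JSON status file.  The attribute name
# self._status_file_path is an assumption made for illustration.
def _read_status(self):
    def _parse_json(local_path):
        with open(local_path, 'r') as f:
            return _json.load(f)
    return self._load_file_and_parse(self._status_file_path, _parse_json,
                                     silent=True)
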
def _download_remote_folder_to_local(self, remote_path, silent=False):
    '''
    Download all files from a remote path to local.  The caller is responsible
    for cleaning up the local folder after use.

    Returns the local temporary folder.
    '''
    local_path = _tempfile.mkdtemp(prefix='job-results')
    try:
        if _file_util.is_hdfs_path(remote_path):
            _file_util.download_from_hdfs(
                hdfs_path=remote_path,
                local_path=local_path,
                is_dir=True,
                hadoop_conf_dir=self.environment.hadoop_conf_dir)
        elif _file_util.is_s3_path(remote_path):
            _file_util.download_from_s3(
                s3_path=remote_path,
                local_path=local_path,
                is_dir=True,
                aws_credentials=self.environment.ec2_config.get_credentials(),
                silent=silent)
        else:
            raise RuntimeError("'%s' is not a supported remote path. Only S3 and"
                               " HDFS remote paths are supported." % remote_path)
    except:
        # Make sure we clean up local files if we cannot successfully
        # download them
        if _os.path.isdir(local_path):
            _shutil.rmtree(local_path)
        raise

    return local_path

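# Hypothetical sketch (not part of the original source) of how a caller might
# honor the cleanup contract of _download_remote_folder_to_local.
def _list_result_files(self, remote_path):
    local_folder = self._download_remote_folder_to_local(remote_path, silent=True)
    try:
        return sorted(_os.listdir(local_folder))
    finally:
        _shutil.rmtree(local_folder)
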
def _load_imp(state_path, aws_access_key_id, aws_secret_access_key):
    '''
    Internal implementation of load, used both by the external facing load and
    by the internal facing load (gl.deploy.predictive_services[name])
    '''
    aws_credentials = None

    if _file_util.is_s3_path(state_path):
        # Save the credentials.
        if bool(aws_access_key_id) != bool(aws_secret_access_key):
            raise IOError('Either both aws_access_key_id and aws_secret_access_key '
                          'should be specified or neither should be specified.')
        if not aws_access_key_id and not aws_secret_access_key:
            try:
                aws_access_key_id, aws_secret_access_key = _get_credentials()
            except:
                raise IOError('No AWS credentials set. Credentials must either be '
                              'passed in, or set globally using '
                              'graphlab.aws.set_credentials(...).')
        aws_credentials = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key
        }
    elif (not _file_util.is_hdfs_path(state_path)) and (not _file_util.is_local_path(state_path)):
        raise ValueError("Invalid state path. Predictive Service only supports "
                         "loading the state path from S3, HDFS or a local file path.")

    config = _PredictiveServiceEnvironment._get_state_from_file(state_path, aws_credentials)
    name = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Name')
    description = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Description')
    api_key = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'API Key')
    admin_key = config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'admin_key')

    # For backwards compatibility. Port used to be hard-coded as 9005 and does
    # not exist in the config.
    if config.has_option(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port'):
        port = int(config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port'))
    else:
        port = _PORT_DEFAULT_NUM

    global_cache_state = 'enabled'
    if _CACHE_STATE_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        global_cache_state = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME,
                                        _CACHE_STATE_SECTION_NAME_)

    cors_origin = ''
    if _CORS_ORIGIN_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        cors_origin = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME,
                                 _CORS_ORIGIN_SECTION_NAME_)

    system_config = _SystemConfig.from_config_parser(
        config, _PredictiveService._SYSTEM_SECTION_NAME)

    result = _PredictiveService(name, state_path, description, api_key, admin_key,
                                aws_credentials, _new_service=False,
                                cors_origin=cors_origin,
                                global_cache_state=global_cache_state,
                                system_config=system_config,
                                port=port)

    # Create environment
    environment_info = dict(config.items(_PredictiveService._ENVIRONMENT_SECTION_NAME))
    if aws_credentials:
        environment_info['aws_credentials'] = aws_credentials
    result._environment = _predictive_service_environment_factory(environment_info)

    # Get latest state
    result._get_latest_state()

    return result

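# Hypothetical usage sketch (not part of the original source): loading a
# predictive service state from S3 through _load_imp.  The state path and keys
# are placeholders; for HDFS or local state paths both keys would be None.
ps = _load_imp('s3://example-bucket/my-predictive-service',
               aws_access_key_id='<access-key>',
               aws_secret_access_key='<secret-key>')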