Example #1
def _load_hparams(hparams_file):
    """Loads the HParams configuration from a hparams file of a trial.
    """
    hparams_file_contents = hopshdfs.load(hparams_file)
    hparams = json.loads(hparams_file_contents)

    return hparams
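A minimal usage sketch, assuming the hops HDFS client is imported as hopshdfs and a hypothetical trial directory whose hparams file follows the .hparams.json naming seen in the later examples:

from hops import hdfs as hopshdfs  # assumed import backing hopshdfs.load above
import json

# Hypothetical trial directory on HopsFS
trial_dir = "Experiments/application_123_0001_1/lr=0.01&dropout=0.5"
hparams = _load_hparams(trial_dir + "/.hparams.json")
print(hparams)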
Example #2
    def read_featureframe(self, spark):
        """
        Reads a training dataset in hdf5 format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
              :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
              :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                          external training dataset in the .hdf5 format.
        """
        if not hasattr(self, 'training_dataset') or \
                        self.training_dataset.training_dataset_type \
                        == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
                "The .hdf5 dataset format is not "
                "supported for external training datasets.")
        if not hdfs.exists(
                self.path +
                constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
            raise TrainingDatasetNotFound(
                "Could not find a training dataset in file {}".format(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))
        tf = TemporaryFile()
        data = hdfs.load(self.path +
                         constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
        tf.write(data)
        tf.seek(0)
        hdf5_file = h5py.File(tf, 'r')
        np_array = hdf5_file[self.training_dataset.name][()]
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
            return np_array
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
            return np_array.tolist()
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
                or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
            if np_array.ndim != 2:
                raise CouldNotConvertDataframe(
                    "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                    "The number of dimensions is: {}".format(np_array.ndim))
            num_cols = np_array.shape[1]
            dataframe_dict = {}
            for n_col in range(num_cols):
                col_name = "col_" + str(n_col)
                dataframe_dict[col_name] = np_array[:, n_col]
            pandas_df = pd.DataFrame(dataframe_dict)
            sc = spark.sparkContext
            sql_context = SQLContext(sc)
            return fs_utils._return_dataframe_type(
                sql_context.createDataFrame(pandas_df), self.dataframe_type)
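The numpy-to-dataframe conversion at the end of read_featureframe can be exercised on its own; a minimal sketch, assuming only numpy, pandas and a local SparkSession, with the feature-store wrapper and constants omitted:

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Stand-in for the 2-D array read from the .hdf5 training dataset
np_array = np.arange(6, dtype=float).reshape(3, 2)

# Same column-naming convention as read_featureframe: col_0, col_1, ...
pandas_df = pd.DataFrame(
    {"col_" + str(i): np_array[:, i] for i in range(np_array.shape[1])})

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark_df = spark.createDataFrame(pandas_df)
spark_df.show()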
Example #3
def parse_input_json(hdfs_path):
    """
    Parse input JSON command line arguments for the util job

    Args:
        :hdfs_path: path to the JSON input on HDFS

    Returns:
        The parsed JSON (dict)
    """
    return json.loads(hdfs.load(hdfs_path))
Example #4
def _convert_return_file_to_arr(return_file_path):
    """Converts the contents of a return file into a metric structure.

    A plain integer is wrapped as [{'metric': value}]; otherwise the contents
    are parsed as a JSON object of metric name -> value.
    """
    return_file_contents = hdfs.load(return_file_path)

    # Could be a number
    try:
        metric = int(return_file_contents)
        return [{'metric': metric}]
    except (ValueError, TypeError):
        pass

    return_json = json.loads(return_file_contents)
    metric_dict = {}
    for metric_key in return_json:
        metric_dict[metric_key] = return_json[metric_key]
    return metric_dict
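A quick check of the two return-file shapes handled above, sketched on plain strings so no HDFS read is needed (the helper name is illustrative):

import json

def _parse_return_contents(contents):
    # Same fallback as _convert_return_file_to_arr, minus the HDFS load
    try:
        return [{'metric': int(contents)}]
    except (ValueError, TypeError):
        return dict(json.loads(contents))

print(_parse_return_contents("42"))                          # [{'metric': 42}]
print(_parse_return_contents('{"loss": 0.1, "acc": 0.9}'))   # {'loss': 0.1, 'acc': 0.9}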
Example #5
def _get_params_dict(best_dir):
    """
    Utiliy method for converting best_param string to dict

    Args:
        :best_param: the best_param string

    Returns:
        a dict with param->value

    """

    params_json = hdfs.load(best_dir + '/.hparams.json')
    params_dict = json.loads(params_json)
    return params_dict
Example #6
def _get_best(root_logdir, direction):
    """
    Finds the trial with the best metric under root_logdir, scanning the
    .metric file of every individual in every generation.
    """

    min_val = sys.float_info.max
    min_logdir = None

    # sys.float_info.min is the smallest positive float, not the most negative;
    # seed the maximum with -sys.float_info.max so negative metrics are handled
    max_val = -sys.float_info.max
    max_logdir = None

    generation_folders = hdfs.ls(root_logdir)
    generation_folders.sort()

    for generation in generation_folders:
        for individual in hdfs.ls(generation):
            individual_files = hdfs.ls(individual, recursive=True)
            for file in individual_files:
                if file.endswith("/.metric"):
                    val = hdfs.load(file)
                    val = float(val)

                    if val > max_val:
                        max_val = val
                        max_logdir = file[:-8]  # strip the trailing '/.metric'

                    if val < min_val:
                        min_val = val
                        min_logdir = file[:-8]  # strip the trailing '/.metric'

    if direction.upper() == Direction.MAX:
        with hdfs.open_file(max_logdir + '/.outputs.json', flags="r") as fi:
            return_dict = json.loads(fi.read())
        return max_logdir, return_dict
    else:
        with hdfs.open_file(min_logdir + '/.outputs.json', flags="r") as fi:
            return_dict = json.loads(fi.read())
        return min_logdir, return_dict
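A small check of the seeding fixed above; sys.float_info.min is the smallest positive float, so it cannot act as a "minus infinity" starting value when metrics are negative:

import sys

metrics = [-3.2, -1.5, -0.7]        # e.g. negative losses being maximized

bad_seed = sys.float_info.min       # tiny positive number
good_seed = -sys.float_info.max     # effectively minus infinity

print(max(metrics) > bad_seed)      # False: no metric ever beats the bad seed
print(max(metrics) > good_seed)     # True: the best metric is found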
Example #7
def _get_return_file(param_string, app_id, generation_id, run_id):
    """

    Args:
        :param_string:
        :app_id:
        :generation_id:
        :run_id:

    Returns:

    """
    handle = hdfs.get()
    for i in range(generation_id):
        possible_result_path = experiment_utils._get_experiments_dir() + '/' + app_id + '_' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/.outputs.json'
        if handle.exists(possible_result_path):
            return_file_contents = hdfs.load(possible_result_path)
            return return_file_contents

    return None
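A sketch of the candidate paths this probes, using stand-in values for the experiments directory, application id and run id (only the string construction from the function is shown):

experiments_dir = "Experiments"    # stand-in for experiment_utils._get_experiments_dir()
app_id, run_id, generation_id = "application_123_0001", 1, 3
param_string = "lr=0.01&dropout=0.5"

for i in range(generation_id):
    print(experiments_dir + '/' + app_id + '_' + str(run_id)
          + '/generation.' + str(i) + '/' + param_string + '/.outputs.json')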
Example #8
def _convert_param_to_arr(params_file):
    """Loads a params file from HDFS and parses it as a JSON dict."""
    params = hdfs.load(params_file)
    params_dict = json.loads(params)
    return params_dict
Example #9
    def load(self, hparams_file):
        """Loads the contents of the given hparams file from HopsFS."""
        return hopshdfs.load(hparams_file)
Example #10
def _run(sc,
         map_fun,
         run_id,
         args_dict=None,
         local_logdir=False,
         name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """

    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    sc.setJobGroup(os.environ['ML_ID'],
                   "{} | Launcher running experiment".format(name))
    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    # For a single run, return the contents of .outputs.json if it exists
    if args_dict is None:
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    elif num_executions == 1:
        arg_count = six.get_function_code(map_fun).co_argcount
        arg_names = six.get_function_code(map_fun).co_varnames
        argIndex = 0
        param_string = ''
        while arg_count > 0:
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][0]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            arg_count -= 1
            argIndex += 1
        param_string = param_string[:-1]
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/' + param_string + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    else:
        return experiment_utils._get_logdir(app_id, run_id), None
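The launch pattern used above (one RDD partition per execution, with the work forced onto executors via foreachPartition) can be sketched standalone; a minimal example with a stand-in work function in place of _prepare_func:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

num_executions = 4

def _work(iterator):
    # Stand-in for the wrapper returned by _prepare_func; each partition
    # corresponds to exactly one execution of the experiment function
    for i in iterator:
        print("running execution", i)

# One element per partition, so each execution lands in its own task
nodeRDD = sc.parallelize(range(num_executions), num_executions)
nodeRDD.foreachPartition(_work)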