Example #1
# NOTE: the snippet as published omits its imports. numpy is certain from
# usage; the remaining names (Result, PipelineResult, PipelineTaskType,
# JSONUtil, GPR, bin_by_decile) are OtterTune internals assumed to be
# importable from the project.
import numpy as np


def map_workload(target_data):
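    # Look up the target's newest result to identify its DBMS and hardware,
    # then fetch the latest precomputed workload-mapping data for that
    # combination.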
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    dbms = newest_result.dbms.pk
    hardware = newest_result.application.hardware.pk
    workload_data = PipelineResult.get_latest(
        dbms, hardware, PipelineTaskType.WORKLOAD_MAPPING_DATA)
    if workload_data is None:
        target_data['scores'] = None
        return target_data

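    # Deserialize the mapping payload: the saved X/y scalers, the per-column
    # decile boundaries for y, and the knob/metric column labels.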
    data_values = JSONUtil.loads(workload_data.value)
    X_scaler = np.load(data_values['X_scaler'])
    y_scaler = np.load(data_values['y_scaler'])
    y_deciles = np.load(data_values['y_deciles'])['deciles']
    X_columnlabels = data_values['X_columnlabels']
    y_columnlabels = data_values['y_columnlabels']

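    # Keep only the target columns that also appear in the reference data so
    # the matrices line up column-for-column.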
    X_idxs = [
        i for i in range(target_data['X_matrix'].shape[1])
        if target_data['X_columnlabels'][i] in X_columnlabels
    ]
    y_idxs = [
        i for i in range(target_data['y_matrix'].shape[1])
        if target_data['y_columnlabels'][i] in y_columnlabels
    ]
    X_target = target_data['X_matrix'][:, X_idxs]
    y_target = target_data['y_matrix'][:, y_idxs]
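    # Standardize the target's data with the reference scalers, then bin each
    # metric column by the reference deciles.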
    X_target = (X_target - X_scaler['mean']) / X_scaler['scale']
    y_target = (y_target - y_scaler['mean']) / y_scaler['scale']
    y_binned = np.empty_like(y_target)
    for i in range(y_target.shape[1]):
        y_binned[:, i] = bin_by_decile(y_target[:, i], y_deciles[i])

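    # For each known workload, train one Gaussian process model per metric and
    # predict how that workload would behave at the knob configurations the
    # target has tried so far.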
    scores = {}
    for wkld_id, wkld_entry_path in data_values['data'].items():
        wkld_entry = np.load(wkld_entry_path)
        preds = np.empty_like(y_target)
        X_wkld = wkld_entry['X_matrix']
        for j in range(y_target.shape[1]):
            y_col = wkld_entry['y_matrix'][:, j].reshape(X_wkld.shape[0], 1)
            model = GPR()
            model.fit(X_wkld, y_col, ridge=0.01)
            preds[:, j] = bin_by_decile(
                model.predict(X_target).ypreds.ravel(), y_deciles[j])
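        # The score for this workload is the mean Euclidean distance between
        # the binned predictions and the binned target metrics.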
        dists = np.sqrt(np.sum(np.square(np.subtract(preds, y_target)),
                               axis=1))
        scores[wkld_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_wkld_id = None
    for wkld_id, similarity_score in scores.items():
        if similarity_score < best_score:
            best_score = similarity_score
            best_wkld_id = wkld_id
    target_data['mapped_workload'] = (best_wkld_id, best_score)
    target_data['scores'] = scores
    return target_data
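
A note on the helpers this page does not show: `bin_by_decile` is referenced by Example #1 but not defined here. Below is a minimal sketch of its likely behavior, inferred from the call sites (a 1-D array of standardized values plus that column's precomputed decile boundaries); the np.digitize approach and the 1..10 labels are assumptions, not OtterTune's published implementation.

import numpy as np

def bin_by_decile(values, deciles):
    # Assumed sketch: `deciles` holds the 9 inner decile boundaries computed
    # from the reference data. np.digitize maps each value to the bin it
    # falls into (0..9), and +1 shifts the labels to 1..10, consistent with
    # Bin(bin_start=1) in Example #2.
    return np.digitize(values, deciles) + 1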
Example #2
# NOTE: the snippet as published omits its imports. numpy and scikit-learn's
# StandardScaler are certain from usage; the remaining names (Result,
# PipelineRun, PipelineData, PipelineTaskType, DataUtil, Bin, GPR,
# load_data_helper) are OtterTune internals assumed to be importable from
# the project.
import numpy as np
from sklearn.preprocessing import StandardScaler


def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload',
                                                 flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:10]  # FIXME (dva)
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [
                i for i in range(X_matrix.shape[1])
                if X_columnlabels[i] in global_ranked_knobs
            ]
            pruned_metric_idxs = [
                i for i in range(y_matrix.shape[1])
                if y_columnlabels[i] in global_pruned_metrics
            ]

            # Filter X & y columnlabels by top 10 ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top 10 ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in workload_data.values()])
    ys = np.vstack([entry['y_matrix'] for entry in workload_data.values()])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in workload_data.items():
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        y_workload = workload_entry['y_matrix']
        for j, y_col in enumerate(y_workload.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPR()
            model.fit(X_workload, y_col, ridge=0.01)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(
            np.sum(np.square(np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    for workload_id, similarity_score in scores.items():
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
    target_data['mapped_workload'] = (best_workload_id, best_score)
    target_data['scores'] = scores
    return target_data
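
`load_data_helper` is referenced by Example #2 but not defined on this page either. Below is a minimal sketch, assuming each PipelineData row stores its payload as a JSON string in a `data` field (the field name and the JSONUtil usage are assumptions).

from website.utils import JSONUtil

def load_data_helper(pipeline_data, workload, task_type):
    # Assumed sketch: narrow the prefiltered queryset down to the single row
    # for this workload and task type, then deserialize its JSON payload.
    entry = pipeline_data.get(workload=workload, task_type=task_type)
    return JSONUtil.loads(entry.data)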