Example No. 1
def explain(shap_exp: Explanation, training_df, test_df, explanation_target):
    job = shap_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    shap.initjs()

    explainer = shap.TreeExplainer(model)
    merged_df = pd.concat([training_df, test_df])
    shap_values = explainer.shap_values(merged_df.drop(['trace_id', 'label'], axis=1))

    encoder = retrieve_proper_encoder(job)
    encoder.decode(merged_df, job.encoding)
    encoder.decode(test_df, job.encoding)

    explanation_target_int = merged_df[merged_df['trace_id'] == explanation_target].index.item() + \
                             training_df.drop(['trace_id', 'label'], axis=1).shape[0]

    explanation_target_vector = test_df[test_df['trace_id'] == explanation_target].drop(['trace_id', 'label'], axis=1)
    expected_value = explainer.expected_value[0] if explainer.expected_value.size > 1 else explainer.expected_value
    # shap returns a single array for regressors and a per-class list for
    # classifiers; pick the right slice in either case.
    shap_value = shap_values[explanation_target_int, :] if hasattr(shap_values, "size") \
        else shap_values[0][explanation_target_int, :]
    shap.force_plot(expected_value, shap_value, explanation_target_vector,
                    show=False, matplotlib=True).savefig("temporal_shap.svg")
    with open("temporal_shap.svg", "r") as f:
        response = f.read()
    os.remove("temporal_shap.svg")
    return response
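
A minimal, self-contained sketch of the same TreeExplainer force-plot pattern on a toy scikit-learn model (the Job/Explanation plumbing above is project-specific and omitted; this assumes a shap release where classifier shap_values come back as a per-class list, which is what the hasattr branch above distinguishes):

import shap
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X = pd.DataFrame({'f1': [0, 1, 2, 3, 4, 5], 'f2': [5, 4, 3, 2, 1, 0]})
y = [0, 0, 0, 1, 1, 1]
model = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Older shap releases return a list of per-class arrays for classifiers.
values = shap_values[1] if isinstance(shap_values, list) else shap_values
# expected_value is per-class for classifiers; take class 1 to match.
fig = shap.force_plot(explainer.expected_value[1], values[0, :], X.iloc[0, :],
                      show=False, matplotlib=True)
fig.savefig('toy_force_plot.svg')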
Example No. 2
def explain(shap_exp: Explanation, training_df, test_df, explanation_target,
            prefix_target):
    job = shap_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    prefix_int = int(prefix_target.strip('/').split('_')[1]) - 1

    explainer = _init_explainer(model)
    target_df = test_df[test_df['trace_id'] ==
                        explanation_target].iloc[prefix_int]

    # target_df is a single row (a Series) here, so drop the labels directly.
    shap_values = _get_explanation(explainer,
                                   target_df.drop(['trace_id', 'label']))

    encoder = retrieve_proper_encoder(job)
    encoder.decode(test_df, job.encoding)
    target_df = test_df[test_df['trace_id'] ==
                        explanation_target].iloc[prefix_int]
    # Offset by 1 to skip the leading 'trace_id' column of the decoded row.
    response = {
        explanation_target:
        [(target_df.keys()[index + 1] + ' = ' +
          str(target_df[target_df.keys()[index + 1]]), shap_values[1][index])
         for index in range(len(shap_values[1]))]
    }

    return response
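
The response shape built above pairs each decoded feature with its SHAP score. A toy sketch of that pairing (all values hypothetical):

import pandas as pd

row = pd.Series({'trace_id': 't42', 'prefix_1': 'register', 'prefix_2': 'validate'})
shap_scores = [0.12, -0.05]  # stand-in for shap_values[1] above

# Offset by 1 to skip 'trace_id', mirroring keys()[index + 1] above.
response = {
    't42': [(row.keys()[i + 1] + ' = ' + str(row[row.keys()[i + 1]]), shap_scores[i])
            for i in range(len(shap_scores))]
}
print(response)  # {'t42': [('prefix_1 = register', 0.12), ('prefix_2 = validate', -0.05)]}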
Example No. 3
    def handle(self, *args, **kwargs):

        TARGET_MODEL = 68
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]
        training_df, test_df = get_encoded_logs(job)

        EXPLANATION_TARGET = 2_3300  # underscore-grouped int literal: 23300
        FEATURE_TARGET = 1
        shap.initjs()

        explainer = shap.TreeExplainer(model)
        training_df = training_df.drop(['trace_id', 'label'], axis=1)

        shap_values = explainer.shap_values(training_df)

        encoder = retrieve_proper_encoder(job)
        encoder.decode(training_df, job.encoding)

        shap.force_plot(explainer.expected_value,
                        shap_values[EXPLANATION_TARGET, :],
                        training_df.iloc[EXPLANATION_TARGET, :],
                        show=False,
                        matplotlib=True).savefig('shap_plot_train_1_3.png')
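
TreeExplainer output differs by model type (a single matrix for regressors, a per-class list for classifiers), which is why indexing like shap_values[EXPLANATION_TARGET, :] can break. A hedged helper sketch (the name is hypothetical, not part of the project) that normalises the shape first:

import numpy as np

def single_output_shap(shap_values, class_index=1):
    """Return one SHAP matrix whether shap produced a list or an array."""
    if isinstance(shap_values, list):
        return np.asarray(shap_values[class_index])
    return np.asarray(shap_values)

# usage: row = single_output_shap(shap_values)[EXPLANATION_TARGET, :]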
Example No. 4
def _multi_trace_temporal_stability(temporal_stability_exp: Explanation, training_df, test_df):
    if temporal_stability_exp.job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
        raise NotImplementedError('Models with cluster-based approach are not yet supported')

    test_df['predicted'] = MODEL[PredictiveModels.CLASSIFICATION.value][ModelActions.PREDICT.value](temporal_stability_exp.job, test_df)

    encoder = retrieve_proper_encoder(temporal_stability_exp.job)

    encoder.decode(df=test_df, encoding=temporal_stability_exp.job.encoding)

    temp_df = DataFrame()
    temp_df['label'] = test_df['predicted']
    encoder.decode(df=temp_df, encoding=temporal_stability_exp.job.encoding)
    test_df['predicted'] = temp_df['label']

    exp_list = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(['trace_id', 'label'], axis=1)
        exp = list(df['predicted'])
        last_row = df.tail(1)
        # Pair each column of the final prefix row with the prediction made at
        # the matching prefix length.
        exp_list_1 = [(feat, str(last_row[feat].values[0])) for feat in last_row]
        exp_list[trace_id] = {
            exp_list_1[index][0]: {'value': exp_list_1[index][1], 'predicted': exp[index]}
            for index in range(len(exp))
        }

    return exp_list
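
A toy run of the per-trace dictionary built above, showing how each column of the final prefix row is paired with the prediction at the matching prefix length (data is made up):

import pandas as pd

test_df = pd.DataFrame({'trace_id': ['t1', 't1'],
                        'prefix_1': ['a', 'a'],
                        'prefix_2': ['', 'b'],
                        'predicted': ['true', 'false']})
exp_list = {}
for trace_id in set(test_df['trace_id']):
    df = test_df[test_df['trace_id'] == trace_id].drop(['trace_id'], axis=1)
    exp = list(df['predicted'])
    last_row = df.tail(1)
    pairs = [(feat, str(last_row[feat].values[0])) for feat in last_row]
    exp_list[trace_id] = {
        pairs[i][0]: {'value': pairs[i][1], 'predicted': exp[i]}
        for i in range(len(exp))
    }
print(exp_list)
# {'t1': {'prefix_1': {'value': 'a', 'predicted': 'true'},
#         'prefix_2': {'value': 'b', 'predicted': 'false'}}}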
Example No. 5
def get_decoded_df(request, pk):
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    training_df = training_df.drop(['trace_id'], axis=1)
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    # A DataFrame is not JSON-serialisable as-is; convert before returning.
    return Response(training_df.to_dict(orient='records'), status=200)
Example No. 6
def lime_temporal_stability(lime_exp: Explanation, training_df, test_df,
                            explanation_target):
    if explanation_target is None:
        return _multi_trace_lime_temporal_stability(lime_exp, training_df,
                                                    test_df)
    else:
        model = joblib.load(lime_exp.predictive_model.model_path)
        if len(model) > 1:
            raise NotImplementedError(
                'Models with cluster-based approach are not yet supported')

        features = list(
            training_df.drop(['trace_id', 'label'], axis=1).columns.values)
        # .as_matrix() was removed in pandas 1.0; .values is the replacement.
        explainer = _init_explainer(df=training_df.drop(['trace_id', 'label'],
                                                        axis=1).values,
                                    features=features,
                                    columns=features,
                                    mode=getModeType(model[0]))

        explanation_target_df = test_df[test_df['trace_id'] ==
                                        explanation_target].drop(
                                            ['trace_id', 'label'], axis=1)

        explanation_target_df = explanation_target_df.reset_index(drop=True)

        # Key each row by its last non-empty prefix column, i.e. the prefix
        # length that row represents.
        exp = {
            row.index[max(
                feat for feat in range(len(features))
                if row.index[feat].startswith('prefix') and row[feat] != 0
            )]: _get_explanation(explainer,
                                 explanation_target_vector=row,
                                 model=model,
                                 features=features).as_list()
            for position, row in explanation_target_df.iterrows()
        }

        encoder = retrieve_proper_encoder(lime_exp.job)

        encoder.decode(df=explanation_target_df,
                       encoding=lime_exp.job.encoding)

        return {
            explanation_target: {
                index: {
                    el[0].split('=')[0]: {
                        'value':
                        explanation_target_df.tail(1)[el[0].split('=')[0]].
                        values[0] if el[0].split('=')[1] != '0' else '',
                        'importance':
                        el[1]
                    }
                    for el in exp[index]
                }
                for index in exp
            }
        }
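
The key-building expression above picks, for each row, the last prefix_ column that is still non-zero, i.e. the prefix length that row encodes. In isolation (column names follow the prefix_N convention used above):

import pandas as pd

row = pd.Series({'prefix_1': 'register', 'prefix_2': 'validate', 'prefix_3': 0})
last_filled = row.index[max(
    i for i in range(len(row))
    if row.index[i].startswith('prefix') and row[i] != 0
)]
print(last_filled)  # prefix_2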
Example No. 7
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 59
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)
        training_df['label'] = training_df['label'].astype(bool).astype(int)
        columns = list(training_df.columns.values)
        features = list(
            training_df.drop(['trace_id', 'label'], 1).columns.values)
        feature = 'Age_1'
        feature_grids, percentile_info = _get_grids(
            feature_values=training_df[feature].values,
            num_grid_points=10,
            grid_type='equal',  # evenly-spaced grid over the feature range
            percentile_range=None,
            grid_range=None)
        custom_grids = []
        indexs = []
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
            custom_grids.append(x)
        print(features)
        fig, axes, summary_df = info_plots.target_plot(
            df=training_df,
            feature=feature,
            feature_name='feature value',
            cust_grid_points=custom_grids,
            target='label',
            show_percentile=False)
        fig.savefig('ice_plot_train_1_3_CType.png')

        lists = list(training_df[feature].values)
        # Assumes every grid value occurs in the column; list.index raises
        # ValueError otherwise.
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
            indexs.append(lists.index(x))
        encoder = retrieve_proper_encoder(job)
        encoder.decode(training_df, job.encoding)
        values = training_df[feature].values
        lst = []
        print(summary_df)
        if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
            for x in range(len(indexs) - 1):
                lst.append({
                    'value': values[indexs[x]],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                })
        else:
            for x in range(summary_df.shape[0]):
                lst.append({
                    'value': summary_df['display_column'][x],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                })
        print(lst)
Example No. 8
def explain(ice_exp: Explanation, training_df, test_df, explanation_target,
            prefix_target):
    job = ice_exp.job
    training_df = training_df.drop(['trace_id'], axis=1)
    if job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        training_df['label'] = training_df['label'].astype(bool).astype(
            int) + 1

    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[explanation_target].values,
        num_grid_points=10,
        grid_type='equal',  # evenly-spaced grid over the feature range
        percentile_range=None,
        grid_range=None)
    custom_grids = [
        x
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1))
    ]

    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=explanation_target,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)
    lists = list(training_df[explanation_target].values)
    # Assumes every grid value occurs in the column; list.index raises
    # ValueError otherwise.
    indexs = [
        lists.index(x)
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1))
    ]
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[explanation_target].values
    lst = []
    if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
        for x in range(len(indexs) - 1):
            lst.append({
                'value': values[indexs[x]],
                'label': summary_df['label'][x],
                'count': int(summary_df['count'][x]),
            })
    else:
        for x in range(summary_df.shape[0]):
            lst.append({
                'value': summary_df['display_column'][x],
                'label': summary_df['label'][x],
                'count': int(summary_df['count'][x]),
            })
    return lst
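
A small, self-contained sketch of the target_plot call above on made-up data (pdpbox 0.2-style API; grid values are illustrative):

import pandas as pd
from pdpbox import info_plots

df = pd.DataFrame({'Age_1': [20, 25, 30, 35, 40, 45, 50, 55],
                   'label': [0, 0, 1, 1, 1, 0, 1, 1]})
fig, axes, summary_df = info_plots.target_plot(
    df=df,
    feature='Age_1',
    feature_name='feature value',
    cust_grid_points=[20, 30, 40, 50],
    target='label',
    show_percentile=False)
# summary_df holds one row per bucket with its display range, count and
# mean 'label' value, which is what the lst dictionaries above are built from.
print(summary_df[['display_column', 'count', 'label']])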
Example No. 9
def explain(lime_exp: Explanation,
            training_df,
            test_df,
            explanation_target=1,
            prefix_target=None):
    model = joblib.load(lime_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')

    # get the actual explanation
    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    # .as_matrix() was removed in pandas 1.0; .values is the replacement.
    explainer = _init_explainer(df=training_df.drop(['trace_id', 'label'],
                                                    axis=1).values,
                                features=features,
                                columns=features,
                                mode=getModeType(model[0]))

    explanation_target_vector = test_df[
        test_df['trace_id'] == explanation_target].drop(['trace_id', 'label'],
                                                        axis=1).tail(1).squeeze()
    exp = _get_explanation(explainer=explainer,
                           explanation_target_vector=explanation_target_vector,
                           model=model,
                           features=features)

    # show plot
    # exp.show_in_notebook(show_table=True)
    # exp.as_pyplot_figure().show()
    # exp.save_to_file('/tmp/oi.html')

    # alternative visualisation
    # exp.as_map()

    encoder = retrieve_proper_encoder(lime_exp.job)

    exp_list = exp.as_list()

    explanation_target_df = explanation_target_vector.to_frame().T
    encoder.decode(df=explanation_target_df, encoding=lime_exp.job.encoding)

    return {
        e[0].split('=')[0]:
        (str(explanation_target_df[e[0].split('=')[0]].values[0]), e[1])
        for e in exp_list
    }
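
For reference, a minimal end-to-end LIME run with the public lime API on a toy model, mirroring what _init_explainer/_get_explanation presumably wrap (this is a sketch with standard lime calls, not the project's helpers):

import numpy as np
from lime.lime_tabular import LimeTabularExplainer
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = (X[:, 0] > 0.5).astype(int)
model = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

explainer = LimeTabularExplainer(X,
                                 feature_names=['f1', 'f2', 'f3'],
                                 mode='classification')
exp = explainer.explain_instance(X[0], model.predict_proba, num_features=3)
# as_list() yields ('feature condition', weight) pairs, the same structure
# the return dictionary above splits on '='.
print(exp.as_list())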
Example No. 10
def shap_temporal_stability(shap_exp: Explanation, training_df, test_df,
                            explanation_target):
    if explanation_target is None:
        return _multi_trace_shap_temporal_stability(shap_exp, training_df,
                                                    test_df)
    else:
        model = joblib.load(shap_exp.predictive_model.model_path)[0]

        features = list(
            training_df.drop(['trace_id', 'label'], axis=1).columns.values)
        explainer = _init_explainer(model)

        explanation_target_df = test_df[test_df['trace_id'] ==
                                        explanation_target].drop(
                                            ['trace_id', 'label'], axis=1)

        explanation_target_df = explanation_target_df.reset_index(drop=True)

        # Key each row by its last non-empty prefix column (the prefix length).
        exp = {
            row.index[max(
                feat for feat in range(len(features))
                if row.index[feat].startswith('prefix') and row[feat] != 0
            )]: _get_explanation(explainer, row)
            for position, row in explanation_target_df.iterrows()
        }

        encoder = retrieve_proper_encoder(shap_exp.job)

        encoder.decode(df=explanation_target_df,
                       encoding=shap_exp.job.encoding)

        return {
            explanation_target: {
                index: {
                    explanation_target_df.keys()[idx]: {
                        'value':
                        explanation_target_df.iloc[list(
                            explanation_target_df.keys()).index('prefix_1')][
                                explanation_target_df.keys()[idx]],
                        'importance':
                        exp[index][1][idx]
                    }
                    for idx in range(len(exp[index][1]))
                }
                for index in exp
            }
        }
Example No. 11
def compute_confusion_matrix(ts, gold, job_obj):
    encoder = retrieve_proper_encoder(job_obj)
    encoder.decode(df=gold, encoding=job_obj.encoding)

    confusion_matrix = {'tp': [], 'tn': [], 'fp': [], 'fn': []}
    for trace_id in set(gold['trace_id']):
        tid = str(trace_id)
        if tid not in ts:
            continue
        # Prediction at the longest prefix recorded for this trace.
        predicted = ts[tid][PREFIX_ + str(len(ts[tid]))]['predicted']
        actual = 'true' if boolean(
            gold[gold['trace_id'] == trace_id]['label'].values[0]) else 'false'
        if predicted == 'true':
            confusion_matrix['tp' if predicted == actual else 'fp'].append(tid)
        else:
            confusion_matrix['tn' if predicted == actual else 'fn'].append(tid)
    return confusion_matrix
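
A toy invocation of the bucketing logic above (ts maps trace ids to per-prefix predictions; PREFIX_ and boolean come from the surrounding module, so this sketch inlines stand-ins):

# Stand-ins for the module-level helpers assumed above.
PREFIX_ = 'prefix_'
boolean = lambda v: str(v).lower() in ('true', '1')

ts = {'t1': {'prefix_1': {'predicted': 'true'}},
      't2': {'prefix_1': {'predicted': 'false'}}}
gold = {'t1': 'true', 't2': 'true'}  # decoded labels, keyed by trace id

buckets = {'tp': [], 'tn': [], 'fp': [], 'fn': []}
for tid, label in gold.items():
    predicted = ts[tid][PREFIX_ + str(len(ts[tid]))]['predicted']
    actual = 'true' if boolean(label) else 'false'
    if predicted == 'true':
        buckets['tp' if predicted == actual else 'fp'].append(tid)
    else:
        buckets['tn' if predicted == actual else 'fn'].append(tid)
print(buckets)  # {'tp': ['t1'], 'tn': [], 'fp': [], 'fn': ['t2']}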
Example No. 12
def get_unique_values(request, pk):
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    decoded_training_df = training_df.copy()
    decoded_testing_df = test_df.copy()
    training_df = training_df.drop(['trace_id', 'label'], axis=1)

    encoder = retrieve_proper_encoder(job)
    encoder.decode(df=decoded_training_df, encoding=job.encoding)
    encoder.decode(df=decoded_testing_df, encoding=job.encoding)

    result_df = {}
    for key in training_df.keys():
        encoded_values = list(
            set(list(training_df[key]) + list(test_df[key])))
        decoded_values = list(
            set(
                list(decoded_training_df[key]) +
                list(decoded_testing_df[key])))

        # NOTE: pairing two independently-built sets by position assumes both
        # preserve the same element order, which Python sets do not guarantee.
        result_df[key] = {}
        for k in range(len(encoded_values)):
            result_df[key][decoded_values[k]] = encoded_values[k]
    return Response(result_df, status=200)
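
Because two independently-built sets carry no positional correspondence, a safer way to build the decoded-to-encoded mapping is to zip the two columns row-wise before deduplicating, so the pairing travels with the rows. A hedged sketch:

import pandas as pd

encoded = pd.Series([0, 1, 0, 2])
decoded = pd.Series(['a', 'b', 'a', 'c'])

# Pair each encoded cell with its decoded counterpart, then deduplicate the
# pairs; the correspondence is carried by the rows rather than by set order.
mapping = dict(sorted(set(zip(decoded, encoded))))
print(mapping)  # {'a': 0, 'b': 1, 'c': 2}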
Example No. 13
def _multi_trace_shap_temporal_stability(shap_exp: Explanation, training_df,
                                         test_df):
    #TODO: FIX FROM LIME_WRAPPER TO SHAP_WRAPPER
    model = joblib.load(shap_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')
    model = model[0]

    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    explainer = _init_explainer(model)

    #TODO: FILTER TO BE REMOVED BEFORE DEPLOY
    # test_df = test_df.head(100)

    exp = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(
            ['trace_id', 'label'], axis=1)
        df = df.reset_index(drop=True)

        if not any(feat.startswith('prefix_')
                   for feat in features) and len(df) == 1:
            exp[trace_id] = {
                'prefix_': _get_explanation(explainer, row)
                for position, row in df.iterrows()
            }
        else:
            # Key each row by its last non-empty prefix column.
            exp[trace_id] = {
                row.index[max(
                    feat for feat in range(len(features))
                    if row.index[feat].startswith('prefix') and row[feat] != 0
                )]: _get_explanation(explainer, row)
                for position, row in df.iterrows()
            }

    encoder = retrieve_proper_encoder(shap_exp.job)

    encoder.decode(df=test_df, encoding=shap_exp.job.encoding)

    if shap_exp.job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        for col in test_df:
            test_df[col] = test_df[col].apply(lambda x: 'False'
                                              if x == '0' else x)

    return {
        trace_id: {
            index: {
                el[0].split('=')[0]: {
                    'value':
                    str(test_df[test_df['trace_id'] == trace_id].tail(1)[
                        el[0].split('=')[0]].values[0])
                    if el[0].split('=')[1] != '0' else '',
                    'importance':
                    el[1]
                }
                for el in exp[trace_id][index]
            }
            for index in exp[trace_id]
        }
        for trace_id in set(test_df['trace_id'])
    }
Example No. 14
def _multi_trace_lime_temporal_stability(lime_exp: Explanation, training_df,
                                         test_df):
    model = joblib.load(lime_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')

    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    # .as_matrix() was removed in pandas 1.0; .values is the replacement.
    explainer = _init_explainer(df=training_df.drop(['trace_id', 'label'],
                                                    axis=1).values,
                                features=features,
                                columns=features,
                                mode=getModeType(model[0]))

    #TODO: FILTER TO BE REMOVED BEFORE DEPLOY
    # test_df = test_df.head(100)

    exp = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(
            ['trace_id', 'label'], axis=1)
        df = df.reset_index(drop=True)

        if not any(feat.startswith('prefix_')
                   for feat in features) and len(df) == 1:
            exp[trace_id] = {
                'prefix_': _get_explanation(explainer,
                                            explanation_target_vector=row,
                                            model=model,
                                            features=features).as_list()
                for position, row in df.iterrows()
            }
        else:
            # Key each row by its last non-empty prefix column.
            exp[trace_id] = {
                row.index[max(
                    feat for feat in range(len(features))
                    if row.index[feat].startswith('prefix') and row[feat] != 0
                )]: _get_explanation(explainer,
                                     explanation_target_vector=row,
                                     model=model,
                                     features=features).as_list()
                for position, row in df.iterrows()
            }

    encoder = retrieve_proper_encoder(lime_exp.job)

    encoder.decode(df=test_df, encoding=lime_exp.job.encoding)

    if lime_exp.job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        for col in test_df:
            test_df[col] = test_df[col].apply(lambda x: 'False'
                                              if x == '0' else x)

    return {
        trace_id: {
            index: {
                el[0].split('=')[0]: {
                    'value':
                    str(test_df[test_df['trace_id'] == trace_id].tail(1)[
                        el[0].split('=')[0]].values[0])
                    if el[0].split('=')[1] != '0' else '',
                    'importance':
                    el[1]
                }
                for el in exp[trace_id][index]
            }
            for index in exp[trace_id]
        }
        for trace_id in set(test_df['trace_id'])
    }
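
The boolean-encoding clean-up pass above maps leftover '0' cells back to the literal 'False' before values are reported. In isolation (toy frame):

import pandas as pd

test_df = pd.DataFrame({'prefix_1': ['0', 'True'], 'prefix_2': ['True', '0']})
for col in test_df:
    test_df[col] = test_df[col].apply(lambda x: 'False' if x == '0' else x)
print(test_df)
#   prefix_1 prefix_2
# 0    False     True
# 1     True    False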