示例#1
0
def stability(filtered_data, features_list, iteration):
    """
    Repeatedly resample the data and compute, per anomaly segment, the average
    selected-feature-set size and a stability score.

    :param filtered_data: list of per-anomaly data frames
    :param features_list: list of candidate feature names
    :param iteration: number of resampling iterations (must be >= 1)
    :return: tuple ``(stability_matrix, feature_list_result)`` where
        ``stability_matrix`` is a ``2 x n_segments`` array whose rows are the
        average feature-set size and the stability score per segment, and
        ``feature_list_result`` holds the selected feature names for every
        (iteration, segment) pair in iteration-major order.
    :raises ValueError: if ``iteration`` is less than 1 (the original code
        failed later with an opaque NameError on ``distance`` in that case).
    """
    if iteration < 1:
        raise ValueError("iteration must be >= 1")

    feature_list_result = []
    for _ in range(iteration):
        new_data = random_data(filtered_data)
        index_data = calculate_class_entropy(new_data, "stability")
        # BUG FIX: the original called select_segment(data, index_data) with
        # `data`, which is undefined in this scope (NameError); the freshly
        # resampled `new_data` is the intended argument.
        new_data = select_segment(new_data, index_data)
        data_segment_entropy = calculate_segment_entropy(new_data)
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        for j in range(len(distance)):
            correlated_feature_index = remove_monotonic_feature(filtered_data[j], features_list)
            Exstream_feature, Exstream_data = drop_features(distance[j, :], filtered_data[j], features_list,
                                                            correlated_feature_index)
            if len(Exstream_feature) == 1:
                # Only one surviving feature: no correlation clustering needed.
                feature_list_result.append(Exstream_data.columns[:-1].values)
            else:
                Exstream_cluster = remove_correlated_features(Exstream_data, Exstream_feature, features_list,
                                                              distance[j, :])
                feature_list_result.append(Exstream_cluster.columns[:-1].values)

    # `distance` comes from the final iteration; its length is the number of
    # anomaly segments, identical across iterations.
    n_segments = len(distance)
    stability_matrix = np.zeros((2, n_segments))
    # Renamed from `list` in the original, which shadowed the builtin.
    results = np.array(feature_list_result)
    for i in range(n_segments):
        # Entries i, i + n_segments, i + 2*n_segments, ... belong to segment i
        # (results are appended iteration-major above).
        index = np.array(range(i, len(results), n_segments))
        per_segment = results[index]
        # Renamed from `stability`, which shadowed this function's own name.
        avg_size, stability_score = stats(per_segment)
        stability_matrix[:, i] = avg_size, stability_score

    return stability_matrix, feature_list_result
示例#2
0
        index_data_class_entropy = calculate_class_entropy(index_data_mapped)
        filtered_data = select_segment(data, index_data_class_entropy)
        aggregated_data = combine_data(filtered_data)
        index_data = calculate_class_entropy(aggregated_data, "aggregate")
        data_segment_entropy = pd.read_csv(
            os.path.join(path_segment, file_segment))
        #data_segment_entropy = calculate_segment_entropy(aggregated_data, "aggregate")
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        features_list = data_segment_entropy.columns
        correlated_feature_index = remove_monotonic_feature(
            aggregated_data, features_list)
        Exstream_feature, Exstream_data = drop_features(
            distance[0], aggregated_data, features_list,
            correlated_feature_index)
        Exstream_cluster = remove_correlated_features(Exstream_data,
                                                      Exstream_feature,
                                                      features_list,
                                                      distance[0])
        Exstream_list.append(len(Exstream_feature))
        Exstream_cluster_list.append(len(Exstream_cluster.columns) - 1)
        # print(Exstream_cluster.columns)

    # Set up plot data
    plotdata = pd.DataFrame(
        {
            'Exstream': Exstream_list,
            'Exstream_cluster': Exstream_cluster_list
        },
        index=['batch146_17', 'batch146_19', 'batch146_20'])
    # %%
    # Plot figure 15
    plotdata.reset_index().plot(x='index',
    # numpy array len(feature) x 1
    aggregated_distance = aggreate_distance(distance)
    # convert the list of data frames to one data
    aggregated_data = combine_data(filtered_data)
    #list of all the features
    features_list = data_segment_entropy.columns
    correlated_feature_index = remove_monotonic_feature(
        aggregated_data, features_list)
    Exstream_feature, Exstream_data = drop_features(aggregated_distance,
                                                    aggregated_data,
                                                    features_list,
                                                    correlated_feature_index)

    # after removing correlated features (via clustering) we will have Exstream_cluster
    Exstream_cluster = remove_correlated_features(Exstream_data,
                                                  Exstream_feature,
                                                  features_list,
                                                  aggregated_distance)
    print(Exstream_cluster.columns)

    ### Prediction
    ### Get a dictionary of prediction ranges for each feature
    prediction_range_dict = get_prediction_range(Exstream_cluster)

    ### For training performance:
    test_data = aggregated_data
    test_data = test_data.reset_index()

    ### Get predicted data with "label_predict" column
    predicted_data = predict(test_data, prediction_range_dict, 4)

    ### Only for training data: