Example #1
    def agglo(data):
        # NOTE: the snippet reuses the name `agglo` for the function, the estimator
        # and the result; the resulting DataFrame is published via the global binding.
        global agglo
        import pandas as pd
        from sklearn import cluster
        agglo = cluster.FeatureAgglomeration(n_clusters=32)

        agglo.fit(data)
        agglo = agglo.transform(data)  # (n_samples, 32): each column is the mean of one feature cluster
        agglo = pd.DataFrame(data=agglo)
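A minimal, self-contained variant of the snippet above, without globals. It is a sketch only: it assumes data is a numeric pandas DataFrame or NumPy array with at least n_clusters columns, and the name reduce_features is illustrative, not part of the original code.

import pandas as pd
from sklearn import cluster

def reduce_features(data, n_clusters=32):
    # Merge correlated columns into n_clusters agglomerated features;
    # each output column is the mean of the original features in one cluster.
    agglo = cluster.FeatureAgglomeration(n_clusters=n_clusters)
    reduced = agglo.fit_transform(data)
    return pd.DataFrame(reduced)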
def FeatureAgglomeration(array, percent_samples):
    # Times FeatureAgglomeration for several target feature counts on a fraction of
    # the training data. `pct_features_list` is assumed to be defined at module level.
    from time import time
    from sklearn import cluster

    print("Feature Agglomeration", percent_samples * 100, "% of training data.")
    print("Features\tTime")

    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        Y = cluster.FeatureAgglomeration(
            n_clusters=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
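A hedged usage sketch for the timing helper above. pct_features_list is not defined in the snippet, so it is assumed here to be a module-level list of feature fractions, and the input array is a synthetic stand-in.

import numpy as np

pct_features_list = [0.25, 0.5, 0.75]  # assumed module-level configuration
X = np.random.rand(200, 40)            # synthetic stand-in for the training data
FeatureAgglomeration(X, percent_samples=0.5)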
def agglomerate(dataset, features_number, clusters_number):
    app_logger.info(
        'STARTED [Feature Agglomeration] on {0} with features number = {1}'.
        format(dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on disk
    current_dir = os.getcwd().split('\\')[-1]
    projet_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    if current_dir == projet_dir:
        all_features_train = pd.read_pickle(
            'Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            'Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))
    else:
        all_features_train = pd.read_pickle(
            '../Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            '../Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving the independent columns of both sets and the known labels of the test set
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    agglomeration = cluster.FeatureAgglomeration(n_clusters=features_number)
    agglomeration.fit(indipendent_columns_train)
    reduced_train = agglomeration.transform(indipendent_columns_train)
    reduced_test = agglomeration.transform(indipendent_columns_test)
    app_logger.info('Reduced train set: {0}'.format(reduced_train),
                    extra=LOGGER_EXTRA_OBJECT)
    app_logger.info('Reduced test set: {0}'.format(reduced_test),
                    extra=LOGGER_EXTRA_OBJECT)

    # Running k-means according to selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'AGGLOMERATION', features_number, dataset, reduced_train, reduced_test,
        clusters_number, known_labels_test)

    app_logger.info('ENDED [Feature Agglomeration] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)
Example #4
def varclus_agglo(df_x):

    import numpy as np
    import pandas as pd
    from sklearn import cluster

    obs = len(df_x.columns)

    # Number of clusters defaults to the square root of the number of features.
    agglo = cluster.FeatureAgglomeration(n_clusters=int(np.sqrt(obs)))
    agglo.fit(df_x)

    varclus_agglo = pd.DataFrame(df_x.columns, columns=["feature_name"])
    varclus_agglo["cluster"] = agglo.labels_

    return varclus_agglo.sort_values(by=["cluster"], ascending=True).reset_index(drop=True)
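A small usage sketch with synthetic data showing the cluster-assignment table returned by varclus_agglo; the column names are made up for the example.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_x = pd.DataFrame(rng.normal(size=(100, 16)),
                    columns=["f{}".format(i) for i in range(16)])
print(varclus_agglo(df_x).head())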
Example #5
def feature_agg(df, drop=None, components=4):

    if drop:
        keep = df[drop]
        df = df.drop(drop, axis=1)

    components = min(df.shape[1] - 1, components)
    agglo = cluster.FeatureAgglomeration(n_clusters=components)
    agglo.fit(df)
    df = pd.DataFrame(agglo.transform(df), index=df.index)
    df = df.add_prefix('feagg_')

    if drop:
        return pd.concat((keep, df), axis=1)
    else:
        return df
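A usage sketch for feature_agg, assuming pandas and sklearn.cluster are imported at module level (as pd and cluster) like in the other snippets; the drop argument keeps a hypothetical target column out of the agglomeration.

import numpy as np
import pandas as pd
from sklearn import cluster

df = pd.DataFrame(np.random.rand(50, 10),
                  columns=["c{}".format(i) for i in range(10)])
df["target"] = np.random.randint(0, 2, size=50)
out = feature_agg(df, drop=["target"], components=4)
print(out.columns.tolist())  # ['target', 'feagg_0', 'feagg_1', 'feagg_2', 'feagg_3']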
def feature_agglomeration(data):
    # Cache the tree computation under cache/ and merge the features into 50 clusters.
    feature_agglomeration_program = cluster.FeatureAgglomeration(
        n_clusters=50, memory="cache/")
    feature_agglomeration_program.fit(data)
    print(feature_agglomeration_program.labels_)
    print(feature_agglomeration_program.children_)
    reduced_model = feature_agglomeration_program.transform(data)
    np.save("feature_agglo_model", reduced_model)
    feature_groups = collections.defaultdict(list)
    for index, value in enumerate(feature_agglomeration_program.labels_):
        feature_groups[int(value)].append(index)
    with open('feature_agglo_feature_clusters.json', 'w') as fp:
        json.dump(dict(feature_groups), fp)
    for key in feature_groups.keys():
        print("{}: {}".format(key, len(feature_groups[key])))
Example #7
def components(K):
    Sum_of_squared_distances = []
    k=[]
    accuracy_train=[]
    accuracy_test=[]
    score=[]
    for i in range(1,K):
        print(i)
        agglo=cluster.FeatureAgglomeration(n_clusters=i,affinity="precomputed",linkage='complete')
        #X_new_train,y_new_train=transformer.fit(X_train,y_train) 
        #X_new_test,y_new_test = transformer.transform(X_test,y_test)
        agglo.fit(X)
        X_reduced=agglo.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.20)
        km = MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=[7, 7, 7, 7, 7, 7, 7], random_state=1)
        km.fit(X_train, y_train)  # fit on the training split only
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        label_train = km.predict(X_train)
        label_test = km.predict(X_test)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
        #score.append(score_train1)
        #print(accuracy)
    k=np.array(k)
    Sum_of_squared_distances=np.array(Sum_of_squared_distances)
    score=np.array(score)
    accuracy_train=np.array(accuracy_train)
    accuracy_test=np.asarray(accuracy_test)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3,=plt.plot(k,accuracy_train,color='r',marker='o',label='train_accuracy')
    line4,=plt.plot(k,accuracy_test,color='g',marker='o',label='test_accuracy')
    #plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    #plt.ylim(0,1)
    plt.show()
    return None
def main():
    digits = datasets.load_digits()
    images = digits.images
    X = np.reshape(images, (len(images), -1))
    connectivity = image.grid_to_graph(*images[0].shape)

    agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
                                         n_clusters=32)

    agglo.fit(X)
    X_reduced = agglo.transform(X)

    X_restored = agglo.inverse_transform(X_reduced)
    images_restored = np.reshape(X_restored, images.shape)
    plt.figure(1, figsize=(4, 3.5))
    plt.clf()
    plt.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
    for i in range(4):
        plt.subplot(3, 4, i + 1)
        plt.imshow(images[i],
                   cmap=plt.cm.gray,
                   vmax=16,
                   interpolation='nearest')
        plt.xticks(())
        plt.yticks(())
        if i == 1:
            plt.title('Original data')
        plt.subplot(3, 4, 4 + i + 1)
        plt.imshow(images_restored[i],
                   cmap=plt.cm.gray,
                   vmax=16,
                   interpolation='nearest')
        if i == 1:
            plt.title("Agglomerated data")

        plt.xticks(())
        plt.yticks(())

    plt.subplot(3, 4, 10)
    plt.imshow(np.reshape(agglo.labels_, images[0].shape),
               interpolation='nearest',
               cmap=plt.cm.nipy_spectral)  # 'spectral' was removed from matplotlib; nipy_spectral replaces it
    plt.xticks(())
    plt.yticks(())
    plt.title('Labels')
    plt.show()
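A short follow-up sketch, not part of the original example, that quantifies how much of the digits data the 32 agglomerated clusters retain by measuring the reconstruction error through inverse_transform.

import numpy as np
from sklearn import cluster, datasets

X = datasets.load_digits().data                    # shape (1797, 64)
agglo = cluster.FeatureAgglomeration(n_clusters=32).fit(X)
X_restored = agglo.inverse_transform(agglo.transform(X))
print("MSE of the 32-cluster reconstruction:", np.mean((X - X_restored) ** 2))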
Example #9
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
            get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim( \
            n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration( \
            n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler), \
            ('feat_agg', feat_agg), ('scaler2', scaler2)])

    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)
    # train the xtc
    xtc.fit(train_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel( \
            xtc, prefit=True, threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline(\
            [('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

    return drfs_pipe
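A hedged usage sketch for train_drfs. get_counts_tt is a project helper that is not shown, so a minimal stand-in is defined here, and synthetic data replaces the real training set; the sklearn submodules the function expects at module level are imported explicitly.

import numpy as np
from sklearn import (cluster, ensemble, feature_selection, pipeline,
                     preprocessing, random_projection)

def get_counts_tt(train_x, train_y):
    # Minimal stand-in for the project helper assumed by train_drfs.
    return train_x.shape[0], train_x.shape[1], len(np.unique(train_y))

train_x = np.random.rand(1200, 50)
train_y = np.random.randint(0, 3, size=1200)
drfs_pipe = train_drfs(train_x, train_y)
print(drfs_pipe.transform(train_x).shape)  # reduced and selected feature matrix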
Example #10
def performance_1(data):
    dataReduced = {
        "queries": transform_to_1(data["queries"]),
        "docs": transform_to_1(data["docs"])
    }

    print("Preparing model")
    model = cluster.FeatureAgglomeration(n_clusters=384)
    print(dataReduced["docs"][0][:10])

    print("Fitting model")
    model.fit(data["docs"])
    dataNew = {
        "docs": model.transform(dataReduced["docs"]),
        "queries": model.transform(dataReduced["queries"]),
    }
    # is not 1bit
    print(dataNew["docs"][0][:10])

    return summary_performance(dataNew)
Example #11
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data
from misc.retrieval_utils import rprec_a_ip, rprec_a_l2
import argparse
from sklearn import cluster

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c.embd_cn")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

print("Preparing model")
model = cluster.FeatureAgglomeration(n_clusters=128)

print("Fitting model")
model.fit(data["docs"])
dataNew = {
    "docs": model.transform(data["docs"]),
    "queries": model.transform(data["queries"]),
}

val_ip_pca = rprec_a_ip(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
val_l2_pca = rprec_a_l2(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
Example #12
import numpy as np
from sklearn import datasets, cluster
import IPython

digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
agglo = cluster.FeatureAgglomeration(n_clusters=32)
agglo.fit(X)

X_reduced = agglo.transform(X)
X_reduced.shape
IPython.embed()
Example #13
def FeatureAgglomeration_cluster():
    global tfidf_matrix
    agglo = cluster.FeatureAgglomeration(n_clusters=32)
    agglo.fit(tfidf_matrix)
    return agglo.labels_
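FeatureAgglomeration works on dense arrays, so if tfidf_matrix comes straight from TfidfVectorizer it is presumably sparse and needs densifying first. A self-contained sketch of that step, with a tiny made-up corpus and a small n_clusters so it fits the handful of terms:

from sklearn import cluster
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats",
        "feature agglomeration of tfidf terms"]
tfidf_matrix = TfidfVectorizer().fit_transform(docs).toarray()  # densify before clustering
agglo = cluster.FeatureAgglomeration(n_clusters=2)
agglo.fit(tfidf_matrix)
print(agglo.labels_)  # cluster id per vocabulary term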
Example #14
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
Example #15
def preprocess(data, features):

	import pandas as pd
	import numpy as np
	np.random.seed(10)

	N = 3
	print(data.shape)
	row_reduce = data
	for i in range (0, N):
		remove_n = row_reduce.shape[0] // 2
		drop_indices = np.random.choice(row_reduce.index, remove_n, replace=False)
		row_reduce = row_reduce.drop(drop_indices)

	df = row_reduce

	df = df.astype({'DATA[0]': 'str',
					'DATA[1]': 'str',
					'DATA[2]': 'str',
					'DATA[3]': 'str',
					'DATA[4]': 'str',
					'DATA[5]': 'str',
					'DATA[6]': 'str',
					'DATA[7]': 'str',
					'Flag': 'str'})

	print("Preprocessing Y...")
	Y = df.iloc[:, 8]
	le = preprocessing.LabelEncoder()
	Y = le.fit_transform(Y)

	# pd.DataFrame(Y).to_csv("./attack_labels.csv")

	print("Preprocessing X...")
	X = df.iloc[:, 0: 8]
	print(X.shape)


	# LabelEncoder object and fit it to each feature
	print("Encoding X...")
	le = preprocessing.LabelEncoder()
	X = X.apply(le.fit_transform)
	print(X.shape)

	row_reduce = None
	data = None
	df = None

	# OneHotEncoder object, and fit it to all data
	print("One-Hot Encoding X...")
	enc = preprocessing.OneHotEncoder()
	enc.fit(X)
	X = enc.transform(X).toarray()
	print(X.shape)


	from sklearn import datasets, cluster

	print("Performing Feature Agglomeration...")
	agglo = cluster.FeatureAgglomeration(n_clusters = features)
	agglo.fit(X)
	X_reduced = agglo.transform(X)
	print(X_reduced.shape)

	X = None

	return Y, X_reduced
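A hedged usage sketch for preprocess with a synthetic frame that mimics the DATA[0]..DATA[7] and Flag layout expected by the astype call above; preprocessing is assumed to be imported from sklearn at module level, as the function body requires.

import numpy as np
import pandas as pd
from sklearn import preprocessing

cols = ['DATA[{}]'.format(i) for i in range(8)] + ['Flag']
df = pd.DataFrame(np.random.randint(0, 4, size=(800, 9)), columns=cols)
Y, X_reduced = preprocess(df, features=5)
print(X_reduced.shape)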
#     print(reducedDataSet.shape)
#
#     labels = model.labels_
#
#     print('labels')
#     print(labels)
#
#     # print('labels')
#     # print(labels)
#
#     # sil = metrics.silhouette_score(X, labels, metric='euclidian', sample_size=5000)

from sklearn.decomposition import PCA
import numpy as np


pca = cluster.FeatureAgglomeration(n_clusters=2)
pca.fit(X)

U, S, VT = np.linalg.svd(X - X.mean(0))

X_train_pca = pca.transform(X)

# NOTE: mean_ and components_ are PCA attributes; FeatureAgglomeration does not
# expose them, so the manual reconstructions below only run with a fitted PCA.
X_train_pca2 = (X - pca.mean_).dot(pca.components_.T)

X_projected = pca.inverse_transform(X_train_pca)
X_projected2 = X_train_pca.dot(pca.components_) + pca.mean_

loss = ((X - X_projected) ** 2).mean()

print(loss)
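The manual reconstruction in the snippet above only holds for an actual PCA; a hedged sketch with decomposition.PCA on stand-in data (X is not defined in the snippet), where the attribute-based reconstruction and inverse_transform agree:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(200, 10)                 # stand-in data
pca = PCA(n_components=2).fit(X)
X_train_pca = pca.transform(X)
X_projected = pca.inverse_transform(X_train_pca)
X_projected2 = X_train_pca.dot(pca.components_) + pca.mean_
print(np.allclose(X_projected, X_projected2))   # True
print(((X - X_projected) ** 2).mean())          # reconstruction loss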
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
Example #18
def dim_reduction(X, n_components=2, mode="MDS"):
    
    """Reduces the number of dimensions in which a dataset is defined.
    
    Arguments

    X       -   NumPy array with shape (N,M), where N is the number of
                observations, and M the number of features.
    
    Keyword Arguments
    
    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2
    
    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and embedding
                        - "RANDOMPROJECTION"
                        - "FEATUREAGGLOMERATION"
                        - "ISOMAP"
                        - "LLE" local linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified local linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (also known as "LSE")
                        Default = "MDS"
    
    Returns
    
    X       -   NumPy array with shape (N, n_components), where N is the number
                of observations and n_components the number of features after
                dimensionality reduction.
    """
    
    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()
    
    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")
        
    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)
    
    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)
    
    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)
    
    # T-Distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbouring embedding
            #  (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_
    
    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)
    
    # Gaussian Random Projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)
    
    # Feature Agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)
    
    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)
    
    # Locally Linear Embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)
    
    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)
    
    # MLLE.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)
    
    # LTSA.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)
    
    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)
    
    # Dictionary Learning
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance.
        dictlearn = decomposition.DictionaryLearning( \
            n_components=n_components, \
            fit_algorithm='cd', \
            # The 'omp' algorithm orthogonalises the whole thing, whereas
            # a lasso solution with a low alpha leaves a slightly more
            # scattered solution.
            transform_algorithm='lasso_cd', \
            transform_alpha=0.1, \
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X)
    
    # Truncated SVD (also known as latent semantic analysis, 'LSE')
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X)
    
    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode))
    
    return X_
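A usage sketch for the FEATUREAGGLOMERATION branch of dim_reduction, with synthetic data; only numpy and sklearn's cluster module need to resolve for this branch, and they are assumed to be the module-level names the function body uses.

import numpy
from sklearn import cluster

X = numpy.random.rand(100, 20)
X_2d = dim_reduction(X, n_components=2, mode="FEATUREAGGLOMERATION")
print(X_2d.shape)  # expected: (100, 2)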
Example #19
def detect_anomaly(in_data,
                   N_clusters,
                   eng_id,
                   threshold,
                   N_features,
                   n_min=60,
                   steps=80):
    """
	N_features: The number of features to extract from the PCA vector
	n_min = 60 # Minimum place to start the line fit
	steps = 80 # How many steps to take in fitting the line
	"""

    # Some fixed parameters
    savgol_window_size = 81
    out_data = 'savgol_eng_' + str(eng_id) + "/"
    #n_min = 60 # Minimum place to start the line fit
    #threshold = 0.5 # In units of sigma

    try:
        # Create target Directory
        os.mkdir(out_data)
        #print("Directory " , out_data ,  " Created ")
    except FileExistsError:
        print("Directory ", out_data, " already exists")

    # Read in the data
    data = pd.read_csv('data/' + in_data, header=None, delim_whitespace=True)

    # Now we label the columns
    settings = [
        'operational_setting_1', 'operational_setting_2',
        'operational_setting_3'
    ]
    sensors = [
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
        'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11',
        'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16',
        'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21'
    ]

    cols = ['engine_num', 'time_cycles'] + settings + sensors
    data.columns = cols

    sensor_data = data.drop(settings, axis=1)
    sensor_data = sensor_data[sensor_data['engine_num'] == eng_id]
    sensor_data = sensor_data[sensors]

    # Now we examine the correlations
    eng1_data = sensor_data

    # These three sensors are flat lines
    eng1_data = eng1_data.drop(["sensor_1"], axis=1)
    eng1_data = eng1_data.drop(["sensor_18"], axis=1)
    eng1_data = eng1_data.drop(["sensor_19"], axis=1)

    corr = eng1_data.corr()
    corr = np.abs(corr)

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.title("Engine Number: " + str(eng_id))
    plt.plot()
    plt.savefig(out_data + "corr_data_full_" + in_data + "_sensor_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    # Now we examine the correlations
    eng1_data = sensor_data

    # These three sensors are flat lines
    eng1_data = eng1_data.drop(["sensor_1"], axis=1)
    eng1_data = eng1_data.drop(["sensor_18"], axis=1)
    eng1_data = eng1_data.drop(["sensor_19"], axis=1)

    # Drop these correlated sensors
    eng1_data = eng1_data.drop(["sensor_5"], axis=1)
    eng1_data = eng1_data.drop(["sensor_6"], axis=1)
    eng1_data = eng1_data.drop(["sensor_10"], axis=1)
    eng1_data = eng1_data.drop(["sensor_16"], axis=1)
    corr = np.abs(eng1_data.corr())

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.plot()
    plt.title("Engine Number: " + str(eng_id))
    plt.savefig(out_data + "corr_data_sub_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    #================================================================
    # Choose the N clusters
    #================================================================

    N_ind_sensors_label = []
    N_ind_sensors_name = []

    corr = np.abs(eng1_data.corr())
    M = np.asarray(corr.iloc[:, :])
    Z = linkage(M, 'single')
    plt.figure(figsize=(25, 10))
    labelsize = 20
    ticksize = 15
    plt.title('Hierarchical Clustering Dendrogram for Sensor Data',
              fontsize=labelsize)
    plt.xlabel('sensor', fontsize=labelsize)
    plt.ylabel('distance', fontsize=labelsize)
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        labels=corr.columns)
    plt.yticks(fontsize=ticksize)
    plt.xticks(rotation=-90, fontsize=ticksize)
    plt.savefig(out_data + "sensor_dendrogram_sub_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    # Let's generate N_clusters clusters based on the data
    agglo = cluster.FeatureAgglomeration(n_clusters=N_clusters)
    agglo.fit(M)
    M_reduced = agglo.transform(M)

    cluster_label = agglo.labels_
    data_col = corr.columns

    # Now we find representatives of the N clusters

    # Initialize our array
    N_ind_sensors_label.append(cluster_label[0])
    N_ind_sensors_name.append(data_col[0])

    for k in range(1, len(cluster_label)):

        if (cluster_label[k] not in N_ind_sensors_label):
            N_ind_sensors_label.append(cluster_label[k])
            N_ind_sensors_name.append(data_col[k])

    # Now we examine the correlations
    eng1_data_ind = sensor_data[N_ind_sensors_name]
    corr = np.abs(eng1_data_ind.corr())

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.title("Engine Number: " + str(eng_id))
    plt.plot()
    plt.savefig(out_data + "_corr_data_N_sensors_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    def rms(y):

        s = np.dot(y, y)
        s = s / float(len(y))
        s = np.sqrt(s)

        return s

    def max_peak(y):

        s = np.max(y)

        return s

    def line_int(y):

        s = 0.0

        for i in range(1, len(y)):
            s += np.abs(y[i] - y[i - 1])

        return s

    def energy(y):

        y = y - np.mean(y)
        s = np.dot(y, y)

        return s

    def std(y):

        s = np.std(y)

        return s

    def compute_property(func, y_vec):

        N = len(y_vec)
        y_func = np.zeros(N)

        for i in range(1, N + 1):
            yi = y_vec[0:i]
            fi = func(yi)
            y_func[i - 1] = fi

        return y_func

    # # Next, we compute all features for all of the sensors and combine all of the data into a large feature matrix
    # Generate a new data frame with all of these new features
    X = pd.DataFrame()

    energy_set = ['energy_' + str(int(k)) for k in range(0, N_clusters)]
    rms_set = ['rms_' + str(int(k)) for k in range(0, N_clusters)]
    line_set = ['line_' + str(int(k)) for k in range(0, N_clusters)]
    max_set = ['max_' + str(int(k)) for k in range(0, N_clusters)]
    std_set = ['std_' + str(int(k)) for k in range(0, N_clusters)]

    for k in range(len(energy_set)):
        feature_name_1 = energy_set[k]
        feature_name_2 = rms_set[k]
        feature_name_3 = line_set[k]
        feature_name_4 = max_set[k]
        feature_name_5 = std_set[k]
        X[feature_name_1] = compute_property(energy,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_2] = compute_property(rms, eng1_data_ind.iloc[0:,
                                                                     k].values)
        X[feature_name_3] = compute_property(line_int,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_4] = compute_property(max_peak,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_5] = compute_property(std, eng1_data_ind.iloc[0:,
                                                                     k].values)

    all_features = energy_set + rms_set + line_set + max_set + std_set

    # # The feature matrix has high dimensionality, use PCA to find the first two principal components of the feature matrix


    #====================================================================================
    # PCA Analysis
    #====================================================================================
    pca = PCA(n_components=2)

    # Scale all of the features
    X = X.loc[:, all_features].values
    X = StandardScaler().fit_transform(X)

    principalComponents = pca.fit_transform(X)

    principalDf = pd.DataFrame(data=principalComponents,
                               columns=['pc1', 'pc2'])

    print('Explained Variance: ', pca.explained_variance_ratio_)

    V1 = principalDf['pc1']
    V2 = principalDf['pc2']

    plt.clf()
    plt.title("PCA 1")
    plt.plot(V1)
    plt.xlabel("Cycles", size=20)
    plt.ylabel("PCA [unitless]", size=20)
    plt.savefig(out_data + "PCA1_" + in_data + "_eng_" + str(int(eng_id)) +
                '.pdf',
                bbox_inches='tight')

    #====================================================================================
    # Bayesian Fit
    #====================================================================================

    #savgol_filter(y, 11, 3) # window size 51, polynomial order 3
    x = range(len(V1))
    y1 = savgol_filter(V1, savgol_window_size, 3)
    y2 = savgol_filter(V2, savgol_window_size, 3)

    plt.clf()
    plt.title("PCA 1 Savgol Filter")
    plt.plot(x, y1)
    plt.xlabel("Cycles", size=20)
    plt.ylabel("PCA [unitless]", size=20)
    plt.savefig(out_data + "PCA1_filter_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    def bayesian_fit(x, y):
        def lnlike(theta, x, y):
            a1, b, sigma = theta
            model = a1 * x + b
            inv_sigma2 = 1.0 / sigma**2
            return -0.5 * (np.sum((y - model)**2 * inv_sigma2 -
                                  np.log(inv_sigma2)))

        def lnprob(theta, x, y):
            #lp = lnprior(theta)

            #return lp + lnlike(theta, x, y, yerr)
            return lnlike(theta, x, y)

        nll = lambda *args: -lnlike(*args)
        result = op.minimize(nll, [1.0, 1.0, 1.0], args=(x, y))
        a1_ml, b_ml, sigma_ml = result["x"]

        ndim, nwalkers = 3, 8
        pos = [
            result["x"] + 1e-4 * np.random.randn(ndim) for i in range(nwalkers)
        ]

        sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y))

        sampler.run_mcmc(pos, 100)

        samples = sampler.chain[:, 50:, :].reshape((-1, ndim))

        return samples

    Nk = []
    a1_k = []
    a2_k = []
    a2_error_k = []
    a1_error_k = []
    sse_k = []

    n_max = len(y1)

    dh = int(float(n_max - n_min) / float(steps))

    for N in range(n_min, n_max, dh):

        x_N = x[0:N]
        y_N = y1[0:N]

        samples = bayesian_fit(x_N, y_N)

        a1_N = np.mean(samples[:, 0])
        b_N = np.mean(samples[:, 1])

        a1_k.append(a1_N)

        model_k = b_N + a1_N * np.asarray(x_N)
        sse = np.dot(y_N - model_k, y_N - model_k)

        a1_error_k.append(np.std(samples[:, 0]))
        sse_k.append(sse)
        Nk.append(N)

    plt.clf()
    plt.title('Linear coefficient')
    plt.plot(Nk, a1_k)
    plt.ylabel('$a_1$', size=20)
    plt.xlabel('Cycles', size=20)
    plt.savefig(out_data + "linear_coeff_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    plt.clf()
    plt.plot(Nk, sse_k, '-o', color='blue')
    plt.xlabel("Cycles", size=20)
    plt.ylabel('Residuals', size=20)
    plt.savefig(out_data + "residuals_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    plt.clf()
    sse2_k = np.gradient(sse_k, 2)
    plt.title("Residual Acc. vs Cycle", size=20)
    plt.plot(Nk, sse2_k, '-o', color='green')
    plt.xlabel("Cycles", size=20)
    plt.ylabel("Residual Acceleration", size=20)
    plt.savefig(out_data + "residual_acc_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf',
                bbox_inches='tight')

    #============================================================================================
    # Now we determine the range of the anomalies
    # Apply the standard Scaler to the SSE-acceleration data
    sse2_k_median = np.median(sse2_k)
    sse2_k_std = np.std(sse2_k)
    sse2_k_scaled = np.abs(sse2_k - sse2_k_median) / sse2_k_std

    # Now normalize from zero to one
    sse2_k_max = np.max(sse2_k_scaled)
    sse2_k_min = np.min(sse2_k_scaled)
    sse2_k_scaled = (sse2_k_scaled - sse2_k_min) / (sse2_k_max - sse2_k_min)

    # Define a possible threshold for the failure region
    Nk_anom = []

    for i in range(len(sse2_k)):

        # Find all points that have an anomaly
        if (sse2_k_scaled[i] >= threshold):
            Nk_anom.append(Nk[i])

    if (len(Nk_anom) != 0):
        plt.clf()
        [
            plt.axvline(Ni, alpha=1.0, color='red', linewidth=2.0)
            for Ni in Nk_anom
        ]
        plt.axvspan(Nk_anom[0], Nk_anom[-1], alpha=0.2, color='purple')
        plt.plot(Nk, sse2_k_scaled, '-o', color='green')
        plt.title("Residual Acc. Scaled vs Cycle", size=20)
        plt.xlabel("Cycles", size=20)
        plt.ylabel("Residual Acceleration", size=20)
        plt.savefig(out_data + "_scaled_residual_acc_" + in_data + "_eng_" +
                    str(int(eng_id)) + '.pdf',
                    bbox_inches='tight')

    for k in range(0, len(N_ind_sensors_name)):
        plt.clf()
        plt.title(N_ind_sensors_name[k], size='20')
        plt.plot(eng1_data_ind.iloc[:, k].values)

        if (len(Nk_anom) != 0):
            [
                plt.axvline(Ni, alpha=0.5, color='red', linewidth=3.0)
                for Ni in Nk_anom
            ]
        #	plt.axvspan(Nk_anom[0],Nk_anom[-1],alpha=0.2, color='purple')

        plt.xlabel("Cycles", size=20)
        plt.legend()
        plt.savefig(out_data + "sensor_" + str(N_ind_sensors_name[k]) +
                    "_anomaly_" + in_data + "_eng_" + str(int(eng_id)) +
                    '.pdf',
                    bbox_inches='tight')

    plt.clf()
    plt.plot(x, y1)
    if (len(Nk_anom) != 0):
        [
            plt.axvline(Ni, alpha=0.3, color='red', linewidth=3.0)
            for Ni in Nk_anom
        ]
        #plt.axvspan(Nk_anom[0],Nk_anom[-1],alpha=0.2, color='purple')
    plt.ylabel("PCA1 filter", size=20)
    plt.xlabel("Cycles", size=20)
    plt.savefig(out_data + "PCA_" + in_data + "_eng_" + str(int(eng_id)) +
                '.pdf',
                bbox_inches='tight')

    y_feature = x[-1] - x[N_features]
    x_feature = y1[0:N_features]
    return x_feature, y_feature
Example #20
    def transform(self, results_file='', short_texts_length=15):
        """
        Classify texts for each provider and save predictions

        :param results_file: path to previously computed predictions
        :param short_texts_length: length of short texts for different objects
        """
        if path.exists(results_file):
            self.load_results(results_file)
            return
        file_names = os.listdir(self.data_directory)
        paths = [self.data_directory + '/' + name for name in file_names]
        ids_vector = [name.split('-')[0] for name in file_names]
        categories_vector = [name.split('-')[1] for name in file_names]
        ratings_vector = [
            int(name.split('-')[2].split('.')[0]) for name in file_names
        ]
        #features = texts_to_vectors(paths)

        features, ratings_vector, categories_vector, ids_vector, paths = divide_texts(
            paths,
            ratings_vector,
            categories_vector,
            ids_vector,
            n=short_texts_length)

        # Feature Agglomeration
        if self.feature_agglomeration:
            agglomeration = cluster.FeatureAgglomeration(n_clusters=5)
            agglomeration.fit(features)
            features_reduced = agglomeration.transform(features)
            features = features_reduced

        self.unique_ratings = sorted(list(set(ratings_vector)))
        unique_ids = list(set(ids_vector))

        # Object selection
        if self.selection == 'none':
            selected_features = features
            selected_ids_vector = ids_vector
            selected_ratings_vector = ratings_vector
        elif self.selection == 'kmeans':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_kmeans(
                ids_vector, ratings_vector, features)
        elif self.selection == 'random':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_random(
                ids_vector, ratings_vector, features)
        elif self.selection == 'silhouette':
            selected_features, selected_ids_vector, selected_ratings_vector = self.selection_silhouette(
                ids_vector, ratings_vector, features, categories_vector)
        true_ratings_object = {}
        predicted_ratings_object = {}
        predicted_ratings_vector = []
        true_ratings_vector = []
        paths_object = {}
        ids_object = {}

        if self.algorithm == 'knn':
            model = KNeighborsClassifier(n_neighbors=3)
        elif self.algorithm == 'lr':
            model = linear_model.Lasso(alpha=0.1)
        else:
            model = RandomForestClassifier()

        for current_id in unique_ids:
            # Images for current_id to test set and other images to train set
            test_indexes = []
            train_indexes = []
            for index, img_id in enumerate(ids_vector):
                if img_id == current_id:
                    test_indexes.append(index)

            for index, img_id in enumerate(selected_ids_vector):
                if img_id != current_id:
                    train_indexes.append(index)
            train_X = selected_features[train_indexes, :]
            test_X = features[test_indexes, :]

            train_y = [selected_ratings_vector[j] for j in train_indexes]
            test_y = [ratings_vector[j] for j in test_indexes]

            if len(test_y) == 0:
                continue

            model.fit(train_X, train_y)
            predictions = model.predict(test_X)

            # Save to object
            predicted_ratings_object[current_id] = predictions
            true_ratings_object[current_id] = test_y
            paths_object[current_id] = [
                paths[test_index] for test_index in test_indexes
            ]
            ids_object[current_id] = [
                ids_vector[test_index] for test_index in test_indexes
            ]

            # Save to vector
            predicted_ratings_vector.extend(predictions)
            true_ratings_vector.extend(test_y)

        # Save to class properties
        self.predicted_ratings_object = predicted_ratings_object
        self.true_ratings_object = true_ratings_object
        self.predicted_ratings_vector = predicted_ratings_vector
        self.true_ratings_vector = true_ratings_vector
        self.paths_object = paths_object
        self.ids_object = ids_object

        # Save predictions to a file
        self.save_results(results_file)
Example #21
# 4. Clustering with connectivity constraints
import matplotlib.pyplot as plt

from skimage.data import coins
from skimage.transform import rescale

from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering

orig_coins = coins()
# Smooth the coins image with a Gaussian filter, then downscale it to make processing easier
# smoothened_coins = gaussian_filter(orig_coins, sigma=2)   # this Gaussian filter call was problematic
# rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect")
# X = np.reshape(rescaled_coins, (-1, 1))
# connectivity = grid_to_graph(*rescaled_coins.shape)

# 5. Feature agglomeration (merging similar features together); somewhat confusing, so the code is copied here as-is for now
import numpy as np
from sklearn import datasets, cluster

digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
connectivity = grid_to_graph(*images[0].shape)

agglo = cluster.FeatureAgglomeration(connectivity=connectivity, n_clusters=32)
agglo.fit(X)

X_reduced = agglo.transform(X)

X_approx = agglo.inverse_transform(X_reduced)
images_approx = np.reshape(X_approx, images.shape)
preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])
house_train2 = preprocess_pipeline.fit_transform(house_train1)

#outlier detection pipeline
outlier_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('outlier_estimator', ensemble.IsolationForest(contamination=0.01))
])
labels = outlier_pipeline.fit_predict(house_train1)
house_train1[labels == -1]

#add clustering label as new feature
clustering = cluster.AgglomerativeClustering(n_clusters=5)
clustering.fit(house_train2)
house_train2['house_group'] = clustering.labels_

#feature reduction by clustering related features
cluster_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('cluster_features', cluster.FeatureAgglomeration(n_clusters=50))
])
cluster_data = cluster_pipeline.fit_transform(house_train1)

#feature reduction on top of correlation matrix
corr_matrix = np.corrcoef(house_train2, rowvar=False)
clustering = cluster.AgglomerativeClustering(n_clusters=10)
clustering.fit(corr_matrix)
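A hedged sketch of how the feature groupings produced by cluster_pipeline above could be inspected once it has been fitted; named_steps refers to the 'cluster_features' step defined in the pipeline.

import collections

# Map each preprocessed column index to its agglomerated cluster.
feature_labels = cluster_pipeline.named_steps['cluster_features'].labels_
groups = collections.defaultdict(list)
for idx, lab in enumerate(feature_labels):
    groups[int(lab)].append(idx)
print({k: len(v) for k, v in groups.items()})  # cluster id -> number of merged columns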