Example #1
def project(file_name, dimensions):
    data = load_svmlight_file(file_name)
    projector = SparseRandomProjection(dimensions, 1/3.0,
                                       dense_output=True)
    projected = projector.fit_transform(data[0])
    
    new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat'
    # use a context manager so the output file is closed after writing
    with open(new_file_name, 'wb') as new_file:
        dump_svmlight_file(projected, data[1], new_file)
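The snippet above omits its imports; a minimal sketch of what it appears to rely on (scikit-learn's svmlight helpers return an (X, y) pair, and 1/3.0 fills the density parameter positionally, which older scikit-learn versions accept) plus a hypothetical call:

from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from sklearn.random_projection import SparseRandomProjection

# Hypothetical usage; 'features.svm' and 500 are placeholders, not from the original code.
# project('features.svm', 500)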
def plotProjection(data, n_samples, n_features): 

    n_components_range = np.array([300, 1000, 10000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical sample pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:

        t0 = time()

        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)

        print("Projected %d samples from %d to %d in %.3fs" \
                % (n_samples, \
                   n_features, \
                   n_components, \
                   time() - t0))

        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %.3fMB" % (n_bytes / 1e6))


        projected_dists = euclidean_distances(projected_data, squared=True)
        projected_dists = projected_dists.ravel()[nonzero]

        rates = projected_dists / dists
        print("Mean distances rate: %.2f (%.2f)" \
                % (np.mean(rates), \
                   np.std(rates)))

        plotHexbin(dists, projected_dists, n_components)
        plotHist(rates, n_components)
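plotHexbin and plotHist are not defined in this snippet; a minimal sketch of what such helpers might look like, assuming matplotlib is available (axis labels and titles are guesses, not from the original project):

import matplotlib.pyplot as plt

def plotHexbin(dists, projected_dists, n_components):
    # 2-D density of original vs. projected pairwise squared distances
    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances with n_components=%d" % n_components)
    plt.colorbar()

def plotHist(rates, n_components):
    # distribution of the projected/original distance ratios
    plt.figure()
    plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', density=True)
    plt.xlabel("Squared distance rate: projected / original")
    plt.ylabel("Distribution of sample pairs")
    plt.title("Distance rate histogram with n_components=%d" % n_components)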
Example #4
File: util.py Project: xiaohan2012/mynlp
def create_sector_subset(sample_n, X_output_path, Y_output_path):
    X_path = "/cs/puls/Experiments/hxiao-test/feature-data.mat"
    Y_path = "/cs/puls/Experiments/hxiao-test/label-data.mat"

    X = loadmat(X_path)["featureData"]
    Y = loadmat(Y_path)["labelData"]

    print "Applying random projection to reduce dimension"
    print "Shape before: %r" % (X.shape,)

    transformer = SparseRandomProjection(random_state=0)
    X = transformer.fit_transform(X)
    print "Shape after: %r" % (X.shape,)
    print "Random projection: OFF"

    rng = np.random.RandomState(0)
    print "Sample size: %d" % sample_n
    rows = rng.permutation(X.shape[0])[:sample_n]
    X = X[rows, :]
    Y = Y[rows, :]

    dump(X, open(X_output_path, "w"))
    dump(Y, open(Y_output_path, "w"))
def main():
    global global_gen_data
    global total_length
    with open('feature_select_list.pkl', 'r') as f:
        feature_select_list = pickle.load(f)
    #pdb.set_trace()
    cores = multiprocessing.cpu_count()
    #21
    for file_number in xrange(1):
        with open('../order_100_data/order_data_chunk_' + str(file_number),
                  'r') as f:
            file_list = f.readlines()
            print('read done:' + str(file_number))
            get_all_label(file_list)
#    cores = multiprocessing.cpu_count()
#    pool = multiprocessing.Pool(processes=(cores-2))

#pdb.set_trace()
#print('length: ',len(all_label_result['usercategories']))
    cut_num = 2000
    control_feature_length(cut_num)
    #save_pickle(all_label_result,'all_label.pkl')
    #pdb.set_trace()
    for feature in total_list:
        enc, one_hot = get_all_onehot(feature, list(all_label_result[feature]))
        all_label_encoder[feature].extend([enc, one_hot])
# rewards = []
# items_id = []
# uin = []
# for file_number in range(2,16):
#     with open('../order_100_event_data/order_data_id_label_chunk_' + str(file_number), 'r') as f:
#         file_list = f.readlines()
#         #pdb.set_trace()
#         for line in file_list:
#             line_list = line.split('\t')
#             #if len(line_list) < 3:
#                 #print(line_list)
#             rewards.append(line_list[1])
#             items_id.append(line_list[0])
#             uin.append(line_list[2].strip('\n'))

    for line in cross_lines:
        cross_feat = line.strip().split()
        feat_a = cross_feat[0]
        feat_b = cross_feat[1]
        total_length += (feature_length_result[feat_a] *
                         feature_length_result[feat_b])

    srp = SparseRandomProjection(n_components=1000)
    print('total_d_length', total_length)
    for file_number in xrange(0, 4):
        rewards = []
        items_id = []
        uin = []
        with open(
                '../order_new_pool_data/order_data_id_label_chunk_' +
                str(file_number), 'r') as f:
            file_list = f.readlines()
            #pdb.set_trace()
            for line in file_list:
                line_list = line.split('\t')
                #if len(line_list) < 3:
                #print(line_list)
                rewards.append(line_list[1])
                items_id.append(line_list[0])
                uin.append(line_list[2].strip('\n'))
        with open(
                '../order_new_pool_data/order_data_chunk_' + str(file_number),
                'r') as f:
            file_list = f.readlines()
            #pdb.set_trace()
            gen_data = generate_key_value_data(file_list)
        with open('../order_new_pool_data/length_chunk_' + str(file_number),
                  'r') as f:
            cut_pool_list = pickle.load(f)
        #gen_data = gen_data[0:100]
        print('start file: ' + str(file_number))
        print('number chunk', len(cut_pool_list) / 4000)
        chunk_file_number = len(cut_pool_list) / 4000
        pdb.set_trace()
        cut_start_flag = 0
        for block_num in range(chunk_file_number):
            print('-------------------------------')
            print('start block: ' + str(block_num + 1))
            cut_pool = cut_pool_list[block_num * 4000:(block_num + 1) * 4000]
            cut_end = sum(cut_pool)
            print('chunk_range: ', cut_start_flag, cut_end + cut_start_flag)
            data_todeal = gen_data[cut_start_flag:(cut_end + cut_start_flag)]
            rewards_todeal = rewards[cut_start_flag:(cut_end + cut_start_flag)]
            items_todeal = items_id[cut_start_flag:(cut_end + cut_start_flag)]
            uin_todeal = uin[cut_start_flag:(cut_end + cut_start_flag)]
            cut_start_flag += cut_end
            pdb.set_trace()
Example #6
def DecomposedFeatures(train,
                       test,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_pls=0.0):
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]
    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # list.append mutates in place and returns None, so don't re-assign its result
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)

    if use_pls > 0.0:
        print("PLS")
        #N_COMP = int(use_pls  * train.shape[1]) +1
        #pls = PLSCanonical(n_components = N_COMP)
        #pls_results = pls.fit(total)
        #pls_results_train = pls.transform(train)
        #pls_results_test = pls.transform(test)
        #train_decomposed = np.concatenate([pls_results_train,train_decomposed], axis=1)
        #test_decomposed = np.concatenate([pls_results_test, test_decomposed], axis=1)

    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # This concatenation is redundant (its result is never used) and it would
    # raise NameError unless every use_* flag is enabled, so it is left disabled:
    # np.concatenate([
    #     srp_results_train, grp_results_train, ica_results_train,
    #     pca_results_train, tsvd_results_train
    # ], axis=1)
    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
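A hedged illustration of how this function might be called: each use_* argument is a fraction of the original feature count (for example, use_pca=0.2 keeps int(0.2 * train.shape[1]) + 1 PCA components). The variable names below are placeholders, not from the original project.

# train_df, test_df, total_df: aligned feature matrices; add_train / add_test:
# already-built feature blocks that are concatenated with the new components.
# train_feats, test_feats = DecomposedFeatures(train_df, test_df, total_df,
#                                              add_train, add_test,
#                                              use_pca=0.2, use_srp=0.2)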
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10,
                                    dense_output=True,
                                    random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left to a sparse matrix instead
        rp = SparseRandomProjection(n_components=10,
                                    dense_output=False,
                                    random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)

        # output for sparse output will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
Example #8
                                   n_features=256,
                                   sep=',',
                                   header=None)

wineX = StandardScaler().fit_transform(wineX)
digitX = StandardScaler().fit_transform(digitX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
dims_wine = [i for i in range(2, 12)]

# data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digit scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
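pairwiseDistCorr is not defined in this snippet; a minimal sketch consistent with the helper defined in Example #24 below (the Pearson correlation between the two flattened pairwise-distance matrices):

import numpy as np
from sklearn.metrics import pairwise_distances

def pairwiseDistCorr(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    # correlate distances in the projected space with distances in the original space
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]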
Example #9
def main():
    runit = 1
    if runit:
        run = assignment4()
        run.read_data_voice('voice.csv')
        run.dataSetName = 'Voice'
        run.split_data_to_train_test(testSize=0.3)
        dataX = StandardScaler().fit_transform(run.allFeatures)
        ''' 
    run.PCA()
    run.ICA()
    run.RP()
    '''
        run.TSVD()
        run.k_mean_cluster()
        run.expectation_maximization()
        pcaCom = 15
        icaCom = 15
        rpCom = 15
        tsvdCom = 15
        k = 2
        reducedDataPCA = PCA(n_components=pcaCom,
                             random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA')
        run.expectation_maximization_reduced(k, reducedDataPCA, 'PCA')

        reducedDataICA = FastICA(n_components=icaCom,
                                 random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataICA, 'ICA')
        run.expectation_maximization_reduced(k, reducedDataICA, 'ICA')

        reducedDataRP = SparseRandomProjection(
            n_components=rpCom, random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataRP, 'RP')
        run.expectation_maximization_reduced(k, reducedDataRP, 'RP')

        reducedDataTSVD = TruncatedSVD(
            random_state=5, n_components=tsvdCom).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD')
        run.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD')

    run_hapt = assignment4()
    run_hapt.read_data_haptX('HAPT_X.csv')
    run_hapt.read_data_haptY('HAPT_Y.csv')
    run_hapt.dataSetName = 'HAPT'
    dataX = StandardScaler().fit_transform(run_hapt.allFeatures)

    run_hapt.kNum = range(1, 20, 5)
    run_hapt.pcaDims = range(1, 561, 25)
    run_hapt.icaDims = range(1, 561, 25)
    run_hapt.rpDims = range(1, 561, 25)
    run_hapt.tvsdDims = range(1, 561, 25)

    #run_hapt.k_mean_cluster()
    run_hapt.expectation_maximization()

    run_hapt.PCA()
    run_hapt.ICA()
    run_hapt.RP()
    run_hapt.TSVD()

    pcaCom = 15
    icaCom = 15
    rpCom = 15
    tsvdCom = 15
    k = 2
    reducedDataPCA = PCA(n_components=pcaCom,
                         random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA')
    run_hapt.expectation_maximization_reduced(k, reducedDataPCA, 'PCA')

    reducedDataICA = FastICA(n_components=icaCom,
                             random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataICA, 'ICA')
    run_hapt.expectation_maximization_reduced(k, reducedDataICA, 'ICA')

    reducedDataRP = SparseRandomProjection(n_components=rpCom,
                                           random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataRP, 'RP')
    run_hapt.expectation_maximization_reduced(k, reducedDataRP, 'RP')

    reducedDataTSVD = TruncatedSVD(random_state=5,
                                   n_components=tsvdCom).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD')
    run_hapt.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD')

    print("All done")
    plt.show()
Example #10
def sparseRP(data):
    rp = SparseRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)
def _get_projection(n_samples, n_features, density='auto', eps=0.1):
    p = SparseRandomProjection(density=density, eps=eps)
    mat = csr_matrix((n_samples, n_features))
    return p.fit(mat)
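The helper above fits the projection on an all-zeros csr_matrix, which is enough to materialize the random components_ matrix because fitting only uses the input's shape. A hedged usage sketch, assuming scipy.sparse and the projection class are imported as the snippet implies:

from scipy.sparse import csr_matrix
from sklearn.random_projection import SparseRandomProjection

proj = _get_projection(n_samples=10000, n_features=100000, eps=0.1)
# With n_components='auto', the component count is derived from n_samples and eps
# via the Johnson-Lindenstrauss bound, so components_ has shape (n_components, 100000).
print(proj.components_.shape)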
Example #12
                          random_state=42,
                          max_iter=1000,
                          tol=.008)
            ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
            ica2_results_test = ica.transform(test)

            # GRP
            grp = GaussianRandomProjection(n_components=n_grp,
                                           eps=0.1,
                                           random_state=42)
            grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
            grp_results_test = grp.transform(test)

            # SRP
            srp = SparseRandomProjection(n_components=n_srp,
                                         dense_output=True,
                                         random_state=42)
            srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
            srp_results_test = srp.transform(test)

            # save columns list before adding the decomposition components
            usable_columns = list(set(train.columns) - set(['y']))

            # Append decomposition components to datasets
            print("Append PCA components to datasets...")
            for i in range(1, n_pca + 1):
                train['pca_' + str(i)] = pca2_results_train[:, i - 1]
                test['pca_' + str(i)] = pca2_results_test[:, i - 1]

            print("Append ICA components to datasets...")
            for i in range(1, n_ica + 1):
Example #13
File: feature.py Project: zgcgreat/WSDM
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val = tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
Example #14
File: feature.py Project: zgcgreat/WSDM
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
Example #15
def _get_projection(n_samples, n_features, density="auto", eps=0.1):
    p = SparseRandomProjection()
    mat = lil_matrix((n_samples, n_features))
    return p.fit(mat)
Example #16
# Perform Randomized Principal Components Analysis (PCA)
from sklearn.decomposition import RandomizedPCA as RPCA
rpca = RPCA(n_components=num_components)
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
if tsvd_max < 101:
    spacing = super_fine_spacing
    plot_subset = []
    for j in arange(1, spacing - 1):
        plot_subset.append(j)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

breast_cancer = pd.read_csv("./breast-cancer-wisconsin.csv")
li = list(breast_cancer)
breast_cancer = pd.DataFrame(breast_cancer.values, columns=li)

Class = li[-1]

arr = breast_cancer.values
y = arr[:, -1]
X = arr[:, 0:-1]
clusters = range(2, 15)

sp = SparseRandomProjection(n_components=4)
output = sp.fit_transform(X)

tester = em.ExpectationMaximizationTestCluster(output,
                                               y,
                                               clusters=range(2, 15),
                                               plot=False,
                                               stats=True)
silhouette_EM, vmeasure_scores = tester.run()

tester = kmtc.KMeansTestCluster(output,
                                y,
                                clusters=range(2, 15),
                                plot=False,
                                stats=True)
silhouette_kmeans, V_measure = tester.run()
Example #18
File: RP.py Project: wmoo3/CS7641-A3
#tmp.to_csv(out+'diamonds scree3.csv')

#tmp = defaultdict(dict)
#for i,dim in product(range(10),dims1):
#    rp = GaussianRandomProjection(random_state=i, n_components=dim)
#    rp.fit(diamondsX)
#    tmp[dim][i] = reconstructionError(rp, diamondsX)
#    print (dim, "scree4")
#tmp =pd.DataFrame(tmp).T
#tmp.to_csv(out+'diamonds scree4.csv')

#%% task 2

tmp = defaultdict(dict)
for i, dim in product(range(10), dims1):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr_chunked(rp.fit_transform(diamondsX),
                                           diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims1):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
def select_features_SparseRandomProjections(train_X, train_y, test_X, k):
    selector = SparseRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
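A hedged usage sketch for the helper above, with random array shapes chosen only for illustration; note that despite the "select_features" name it applies a random linear projection rather than selecting a subset of the original columns:

import numpy as np

rng = np.random.RandomState(0)
train_X = rng.rand(100, 500)
test_X = rng.rand(20, 500)
train_y = rng.randint(0, 2, size=100)  # unused by the helper, but part of its signature

train_X_rp, test_X_rp = select_features_SparseRandomProjections(train_X, train_y,
                                                                test_X, k=50)
print(train_X_rp.shape, test_X_rp.shape)  # (100, 50) (20, 50)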
Example #20
# In[ ]:


def distance_correlation(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    return np.corrcoef(
        pairwise_distances(X1).ravel(),
        pairwise_distances(X2).ravel())[0, 1]


# In[ ]:

tmp = defaultdict(dict)
for i, dim in product(range(10), dimensions):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = distance_correlation(rp.fit_transform(X_train), X_train)

tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2/IncomeRP_DistanceCorrelation.csv')

# In[ ]:

# Run Neural Networks
rp = SparseRandomProjection(random_state=5)
nn_results, clf = run_ann(dimensions, rp, X_train, Y_train)
nn_results.to_csv('./P2/IncomeRP_ANN.csv')

## test score
test_score = clf.score(X_test, Y_test)
print("Test Accuracy = ", test_score)
Example #21
X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat'
Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat'

X = loadmat(X_path)['featureData']
y = loadmat(Y_path)['labelData']

RANDOM_PROJECTION_FLAG = True

if RANDOM_PROJECTION_FLAG:
    from sklearn.random_projection import SparseRandomProjection

    print "Applying random projection to reduce dimension"
    print "Shape before: %r" % (X.shape, )

    transformer = SparseRandomProjection()
    X = transformer.fit_transform(X)
    print "Shape after: %r" % (X.shape, )


# sample subset of all the data
rng = np.random.RandomState(0)
sample_n = 10000
rows = rng.permutation(X.shape[0])[:sample_n]
X = X[rows, :]
y = y[rows, :]

# sample train and test
train_ratio = 0.8
train_n = int(sample_n*train_ratio)
Example #22
df_non_obj_feats['binDec'] = int10

all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1)

#%%
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
n_comp = 12

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(all_data_proc)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(all_data_proc)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(all_data_proc)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(all_data_proc)
for i in range(1, n_comp+1):
    all_data_proc['pca_' + str(i)] = pca_results[:,i-1]
    all_data_proc['ica_' + str(i)] = ica_results[:, i-1]
    all_data_proc['grp_' + str(i)] = grp_results[:,i-1]
    all_data_proc['srp_' + str(i)] = srp_results[:, i-1]
Example #23
# In[40]:

X_agg.head()

# In[41]:

from sklearn.decomposition import FactorAnalysis

fa = FactorAnalysis(n_components=50, random_state=42)
X_fa = fa.fit_transform(X)

# In[42]:

from sklearn.random_projection import SparseRandomProjection

srp = SparseRandomProjection(n_components=50, random_state=42)
X_srp = srp.fit_transform(X)

# In[43]:

from sklearn.random_projection import GaussianRandomProjection

grp = GaussianRandomProjection(n_components=50, random_state=42, eps=0.1)
X_grp = grp.fit_transform(X)

# In[60]:

from sklearn.decomposition import PCA

pca = PCA(n_components=100, random_state=42)
X_pca = pca.fit_transform(X)
Example #24
class assignment4:
    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'
        self.dataSetName = ""
        self.csv_delimiter = ','
        self.data = None
        self.allFeatures = []
        self.allTarget = []

        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None

        # k-mean clustering
        self.kNum = range(1, 21)
        self.kmean = None
        self.kmeanRD = None
        # expectation maximization
        self.em = None
        self.emRD = None
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)

        # ICA
        self.icaDims = range(1, 21)
        self.ica = None

        # RP
        self.rp = None
        self.rpDims = range(1, 21)

        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)

        print(len(self.data))
        for elim in self.data:
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))

        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        # in case the data set are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append((float(val)))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures,
            self.allTarget,
            test_size=testSize,
            random_state=42)
        print(
            'Total X train data -> {}%'.format(
                int((len(self.XTrain) / len(self.data)) * 100)), 'Size:',
            len(self.XTrain))
        print(
            'Total X test data -> {}%'.format(
                int((len(self.XTest) / len(self.data)) * 100)), 'Size:',
            len(self.XTest))
        print(
            'Total Y train data -> {}%'.format(
                int((len(self.YTrain) / len(self.data)) * 100)), 'Size:',
            len(self.YTrain))
        print(
            'Total Y test data -> {}%'.format(
                int((len(self.YTest) / len(self.data)) * 100)), 'Size',
            len(self.YTest))

    def get_max_idx(self, input):
        maxVal = input[0]
        maxIdx = 0
        for i in range(1, len(input)):
            if input[i] > maxVal:
                maxIdx = i
                maxVal = input[i]
        return maxIdx

    def pairwiseDistCorr(self, X1, X2):
        assert X1.shape[0] == X2.shape[0]

        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        print("-" * 50)
        print('{}: K-mean clustering'.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            scores.append(sm.accuracy_score(self.allTarget,
                                            self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-mean Cluster ({})'.format(self.dataSetName))

        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        print("-" * 50)
        print('{}: K-mean clustering {}'.format(self.dataSetName, name))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters,
                              random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))

        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName, name))

        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))

        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName))

        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        print("-" * 50)
        print('{}: Principal component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)

        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)

        print("Best number PCA components:", search.best_params_)

        self.pca.fit(dataX)
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.ylim([0, 1])
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.pcaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName))

        print("-" * 50)

    def ICA(self):
        print("-" * 50)
        print('{}: Independent component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.ica = FastICA(random_state=5, max_iter=6000)
        # kurtosis
        kurt = []
        for dim in self.icaDims:
            self.ica.set_params(n_components=dim)
            tmp = self.ica.fit_transform(dataX)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            kurt.append(tmp.abs().mean())

        # grid search
        grid = {'ica__n_components': self.icaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('ica', self.ica), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number ICA components:", search.best_params_)

        plt.figure()
        plt.ylabel('Kurtosis')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(kurt)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_kurtosis.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.icaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def RP(self):
        print("-" * 50)
        print('{}: Random Projection'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        disCorr = []
        self.rp = SparseRandomProjection(random_state=5)
        for dim in self.rpDims:
            self.rp.set_params(n_components=dim)
            disCorr.append(
                self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX))
        print(disCorr)

        # grid search
        grid = {'rp__n_components': self.rpDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('rp', self.rp), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number RP components:", search.best_params_)

        plt.figure()
        plt.ylabel('Distance')
        plt.xlabel('# of Features')
        plt.title('RP Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(disCorr)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_distance.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_RP_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def TSVD(self):
        print("-" * 50)
        print('{}: TruncatedSVD'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.tsvd = TruncatedSVD(random_state=5)

        # grid search
        grid = {'tsvd__n_components': self.tsvdDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number TSVD components:", search.best_params_)

        self.tsvd.fit(dataX)
        var = np.cumsum(
            np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName))
        print("-" * 50)
Example #25
def create_rca(k, r_state):
    return SparseRandomProjection(n_components=k, random_state=r_state)
from sklearn.neural_network import MLPClassifier
from dataTransformer import *
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from data import MNIST
from sklearn.metrics import accuracy_score
from time import time
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection


if __name__=="__main__":
    mnist = MNIST(10000)
    start = time()
    pipeline = Pipeline([('Scale', StandardScaler()), ('PCA', SparseRandomProjection(random_state=0, n_components=160)),
                         ('MLP', MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01, verbose=1))])

    pipeline.fit(mnist.X_train, mnist.y_train)
    y_pred = pipeline.predict(mnist.X_test)
    end = time()

    print ("time used: {}s".format(end - start))
    print (accuracy_score(y_pred, mnist.y_test))
# MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01)
Example #27
tmp1_ = pd.DataFrame(tmp).T
tmp1_.to_csv('Diamond_RF_error.csv')

tmp = defaultdict(dict)
dims = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23]
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(X_train2)
    tmp[dim][i] = reconstructionError(rp, X_test2)
tmp2_ = pd.DataFrame(tmp).T
tmp2_.to_csv('CreditCard_RF_error.csv')
'''
#3.RP transformation
#3.1Diamond data
dim = 9
rp = SparseRandomProjection(n_components=dim, random_state=6)
#3.1.1 Training data
DiamondX2_train = rp.fit_transform(X_train)
Diamond2_train = pd.DataFrame(
    np.hstack((DiamondX2_train, np.atleast_2d(Y_train).T)))
cols1 = list(range(Diamond2_train.shape[1]))

cols1[-1] = 'Class'
Diamond2_train.columns = cols1
Diamond2_train.to_csv('Diamond_RP_train.csv')

#3.1.2 test data (transform with the projector already fitted on the training data)
DiamondX2_test = rp.transform(X_test)
Diamond2_test = pd.DataFrame(
    np.hstack((DiamondX2_test, np.atleast_2d(Y_test).T)))
cols2 = list(range(Diamond2_test.shape[1]))
print("ICA")
ica = FastICA(n_components=N_COMP, random_state=random_state)
ica_results_train = ica.fit_transform(train[flist])
ica_results_test = ica.transform(test[flist])

print("GRP")
grp = GaussianRandomProjection(n_components=N_COMP,
                               eps=0.1,
                               random_state=random_state)
grp_results_train = grp.fit_transform(train[flist])
grp_results_test = grp.transform(test[flist])

print("SRP")
srp = SparseRandomProjection(n_components=N_COMP,
                             dense_output=True,
                             random_state=random_state)
srp_results_train = srp.fit_transform(train[flist])
srp_results_test = srp.transform(test[flist])

print("Append decomposition components to datasets...")
for i in range(1, N_COMP + 1):
    train['pca_' + str(i)] = pca_results_train[:, i - 1]
    test['pca_' + str(i)] = pca_results_test[:, i - 1]

    train['ica_' + str(i)] = ica_results_train[:, i - 1]
    test['ica_' + str(i)] = ica_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
Example #29
                                             random_state=random_state))

# Reduce dimension to 2 with LinearDiscriminantAnalysis
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=2))

# Reduce dimension to 2 with NeighborhoodComponentAnalysis
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(n_components=2,
                                                   random_state=random_state))

# Reduce dimension to 2 with Sparse Random Projection [SRP]
SRP = make_pipeline(StandardScaler(),
                    SparseRandomProjection(n_components=2,
                                           density='auto',
                                           eps=0.5,
                                           random_state=random_state,
                                           dense_output=False))

# Reduce dimension to 2 with MultiDimensional Scaling [MDS]                   
mds = make_pipeline(StandardScaler(),
                    MDS(n_components=2,
                        n_init=12,
                        max_iter=1200,
                        metric=True,
                        n_jobs=4,
                        random_state=random_state))

# Reduce dimension to 2 with Isomap                  
isomap = make_pipeline(StandardScaler(),
                       Isomap(n_components=2,
Example #30
def use_decomposed_features_as_new_df(train,
                                      test,
                                      total,
                                      n_components,
                                      use_pca=False,
                                      use_tsvd=False,
                                      use_ica=False,
                                      use_fa=False,
                                      use_grp=False,
                                      use_srp=False):
    N_COMP = n_components
    ntrain = len(train)

    print("\nStart decomposition process...")

    if use_pca:
        print("PCA")
        pca = PCA(n_components=N_COMP, random_state=42)
        pca_results = pca.fit_transform(total)
        pca_results_train = pca_results[:ntrain]
        pca_results_test = pca_results[ntrain:]

    if use_tsvd:
        print("tSVD")
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit_transform(total)
        tsvd_results_train = tsvd_results[:ntrain]
        tsvd_results_test = tsvd_results[ntrain:]

    if use_ica:
        print("ICA")
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit_transform(total)
        ica_results_train = ica_results[:ntrain]
        ica_results_test = ica_results[ntrain:]

    if use_fa:
        print("FA")
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit_transform(total)
        fa_results_train = fa_results[:ntrain]
        fa_results_test = fa_results[ntrain:]

    if use_grp:
        print("GRP")
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit_transform(total)
        grp_results_train = grp_results[:ntrain]
        grp_results_test = grp_results[ntrain:]

    if use_srp:
        print("SRP")
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit_transform(total)
        srp_results_train = srp_results[:ntrain]
        srp_results_test = srp_results[ntrain:]

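    # Note: the concatenation below assumes every use_* flag was enabled;
    # otherwise some *_results_train / *_results_test variables are undefined.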
    print("Append decomposition components together...")
    train_decomposed = np.concatenate([
        srp_results_train, grp_results_train, ica_results_train,
        pca_results_train, tsvd_results_train
    ],
                                      axis=1)
    test_decomposed = np.concatenate([
        srp_results_test, grp_results_test, ica_results_test, pca_results_test,
        tsvd_results_test
    ],
                                     axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    for agg_col in [
            'sum', 'var', 'mean', 'median', 'std', 'weight_count',
            'count_non_0', 'num_different', 'max', 'min'
    ]:
        train_with_only_decomposed_features[agg_col] = train[agg_col]
        test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
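
# A hedged usage sketch (not from the original source): synthetic frames stand in for
# the real train/test data, with the aggregate columns the loop above copies already
# precomputed on both frames.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_demo = pd.DataFrame(rng.rand(60, 40)).add_prefix('f')
test_demo = pd.DataFrame(rng.rand(20, 40)).add_prefix('f')
feats = list(train_demo.columns)

for df in (train_demo, test_demo):
    df['sum'] = df[feats].sum(axis=1)
    df['var'] = df[feats].var(axis=1)
    df['mean'] = df[feats].mean(axis=1)
    df['median'] = df[feats].median(axis=1)
    df['std'] = df[feats].std(axis=1)
    df['weight_count'] = df[feats].count(axis=1)       # placeholder aggregate
    df['count_non_0'] = (df[feats] != 0).sum(axis=1)
    df['num_different'] = df[feats].nunique(axis=1)
    df['max'] = df[feats].max(axis=1)
    df['min'] = df[feats].min(axis=1)

total_demo = pd.concat([train_demo[feats], test_demo[feats]],
                       axis=0, ignore_index=True)
train_dec, test_dec = use_decomposed_features_as_new_df(
    train_demo, test_demo, total_demo, n_components=12,
    use_pca=True, use_tsvd=True, use_ica=True, use_grp=True, use_srp=True)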
Example #31
for the evaluation of LDA as dimensionality reduction and SVM as
classifier"""
# Author: Ingo Guehring

# import numpy as np
# from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.multiclass import OneVsRestClassifier

import evaluation.shared as shared
import model

# pre_reduction = TruncatedSVD(n_components=500)
PRE_REDUCTION = SparseRandomProjection(n_components=500)
CLASSIFIER = OneVsRestClassifier(SVC(probability=True))

# grid
N_COMPONENTS_RANGE = [1, 2, 4, 6, 8, 10, 12, 13]
# kernels = ['linear', 'rbf']

# old range, that turned out to be too small
# GAMMA_RANGE = np.logspace(-3, 3, 7)
# C_RANGE = np.logspace(-3, 3, 7)

# new wider range
C_RANGE = shared.C_RANGE
GAMMA_RANGE = shared.GAMMA_RANGE

# this could also be used: classifier_kernel=kernels,
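
# A hedged sketch (an assumption, not the project's actual evaluation code, which
# lives in the `model` and `evaluation.shared` modules): one plausible way to wire
# the objects above into a grid search over the LDA dimensionality and the SVM
# kernel parameters. Note that LDA caps n_components at n_classes - 1.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('reduce', PRE_REDUCTION),   # sparse random projection down to 500 dims
    ('lda', LDA()),              # supervised reduction, tuned via the grid below
    ('clf', CLASSIFIER),         # one-vs-rest SVC with probability estimates
])

param_grid = {
    'lda__n_components': N_COMPONENTS_RANGE,
    'clf__estimator__C': C_RANGE,
    'clf__estimator__gamma': GAMMA_RANGE,
}

grid = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)
# grid.fit(X, y)  # X, y would come from the project's own data loading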
Example #32
# # 4. Decomposition Features
# So far I've only looked at PCA components, but most kernels use several decomposition methods, so it may be more interesting to run t-SNE on the 10-50 components from each method rather than on 1000 PCA components (a sketch of that step follows the code below). It is also interesting to see how well test and train can be distinguished in this reduced feature space.
#
#

# In[ ]:

COMPONENTS = 20

# List of decomposition methods to use
methods = [
    TruncatedSVD(n_components=COMPONENTS),
    PCA(n_components=COMPONENTS),
    FastICA(n_components=COMPONENTS),
    GaussianRandomProjection(n_components=COMPONENTS, eps=0.1),
    SparseRandomProjection(n_components=COMPONENTS, dense_output=True)
]

# Run all the methods
embeddings = []
for method in methods:
    name = method.__class__.__name__
    embeddings.append(
        pd.DataFrame(method.fit_transform(total_df),
                     columns=[f"{name}_{i}" for i in range(COMPONENTS)]))
    print(f">> Ran {name}")

# Put all components into one dataframe
components_df = pd.concat(embeddings, axis=1)

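
# A hedged sketch of the t-SNE step described in the markdown cell above (assumed,
# not taken from the original notebook): embed the stacked decomposition components
# in 2-D; colouring the points by a train/test flag then shows how separable the two
# sets are in this reduced feature space.
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
tsne_2d = tsne.fit_transform(components_df)

components_df['tsne_0'] = tsne_2d[:, 0]
components_df['tsne_1'] = tsne_2d[:, 1]
# components_df['is_test'] = ...  # 0/1 flag, depending on how total_df was built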
# Prepare plot
Example #33
pca2_results_train = pca.fit_transform(X_train)
pca2_results_test = pca.transform(X_test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(X_train)
ica2_results_test = ica.transform(X_test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(X_train)
grp_results_test = grp.transform(X_test)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(X_train)
srp_results_test = srp.transform(X_test)

# create empty dataframes to capture extra features
extra_features_train = pd.DataFrame()
extra_features_test = pd.DataFrame()

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    extra_features_train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    extra_features_test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    extra_features_train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    extra_features_test['ica_' + str(i)] = ica2_results_test[:, i - 1]
Example #34
data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections" %
      (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print("Projected %d samples from %d to %d in %0.3fs" %
          (n_samples, n_features, n_components, time() - t0))
    if hasattr(rp, 'components_'):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

    projected_dists = euclidean_distances(projected_data,
                                          squared=True).ravel()[nonzero]

    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
Example #35
 def build_impl(self):
     self.model = SparseRandomProjection(**self.config)
Example #36
def optimize_embedding(data_matrix,
                       known_targets=None,
                       min_feature_ratio=.1,
                       n_iter=30,
                       n_repetitions=1):
    # case for sparse data matrix: use random projection to transform to dense
    if sp.issparse(data_matrix):
        logger.info('Convert sparse to dense')
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))
        from sklearn.random_projection import SparseRandomProjection
        data_matrix = SparseRandomProjection().fit_transform(
            data_matrix).toarray()
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))

    if known_targets is not None:
        logger.info('Feature selection')
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))
        new_data_matrix = iterated_semi_supervised_feature_selection(
            data_matrix, known_targets, min_feature_ratio=min_feature_ratio)
        if new_data_matrix.shape[1] > 2:
            data_matrix = new_data_matrix
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))

    n_instances = data_matrix.shape[0]
    opts_list = make_opts_list(n_instances, n_iter)
    # iterate n_iter times to find best parameter configuration
    best_score = 0
    logger.debug('neqs = neighborhood embedding quality score')
    for i in range(n_iter):
        random.seed(i)
        # sample from the options
        embed_opts = make_embed_opts(opts_list, n_instances)
        basis_opts = make_basis_opts(opts_list, n_instances)
        general_opts = make_general_opts()
        try:
            # find options with max quality score
            score_list = []
            for it in range(n_repetitions):
                data_matrix_lowdim,\
                    link_ids,\
                    score,\
                    scores = embed_(data_matrix,
                                    embed_opts=embed_opts,
                                    basis_opts=basis_opts,
                                    change_of_basis=general_opts['change_of_basis'])
                score_list.append(score)
            mean_reduced_score = np.mean(score_list) - np.std(score_list)
            if best_score == 0 or mean_reduced_score > best_score:
                # best_embed_opts = embed_opts
                # best_basis_opts = basis_opts
                # best_change_of_basis = change_of_basis
                best_data_matrix_lowdim = data_matrix_lowdim
                best_link_ids = link_ids
                best_scores = scores
                best_score = mean_reduced_score
                mark = '*'
            else:
                mark = ''
            logger.debug('..%.2d/%d   neqs: %.3f (%.3f +- %.3f)  %s' %
                         (i + 1, n_iter, mean_reduced_score, np.mean(scores),
                          np.std(scores), mark))
        except Exception as e:
            logger.debug('Failed iteration: %s' % e)
    return best_data_matrix_lowdim, best_link_ids, best_score, best_scores
Example #37
from numpy.lib.function_base import _interp_dispatcher
# from skmultiflow.trees import HoeffdingTree as HT
from skmultiflow.lazy import SAMKNN
from sklearn.metrics import accuracy_score
import time, copy
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics import cohen_kappa_score
# from skmultiflow.bayes import NaiveBayes
from inc_pca import IncPCA
from rff_base import Base as RFF
from rrslvq import ReactiveRobustSoftLearningVectorQuantization as RRSLVQ
from rslvq import RSLVQ

from skmultiflow.meta import AdaptiveRandomForest as ARF

transformer = SparseRandomProjection(n_components=1000)
classes = np.arange(0, 15, 1)

res_file = 'res_pca_skipgram.txt'
f = open(res_file, 'a+')
f.write('SKIP-GRAM\n')
f.close()
data = np.load('../dataset/skip-gram-embed-w-label.npy')

# f = open('data/nasdaq_stream_wo_sentiment.csv')
# labels = []
# while 1:
#    line = f.readline()
#    if line == '': break
#    arr = np.array(line.split(','), dtype='float64')
#    labels.append(arr[1])
Example #38
class STPM(pl.LightningModule):
    def __init__(self, model: torchvision.models, embedding_dir_path: str,
                 sample_path: str, input_image_size: int,
                 coreset_sampling_ratio: int, n_neighbors: int,
                 anomal_threshold: float, normalization_mean: list,
                 normalization_std: list):
        super(STPM, self).__init__()

        self.save_hyperparameters()

        self.init_features()

        # MODEL HYPERPARAMETERS
        self.input_image_size = input_image_size
        self.coreset_sampling_ratio = coreset_sampling_ratio
        self.n_neighbors = n_neighbors
        self.anomal_threshold = anomal_threshold

        self.embedding_dir_path = embedding_dir_path
        self.sample_path = sample_path

        #self.source_code_save_path = source_code_save_path

        def hook_t(module, input, output):
            self.features.append(output)

        self.model = model
        #self.model = wide_resnet50_2(pretrained=True, progress=True)
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.layer2[-1].register_forward_hook(hook_t)
        self.model.layer3[-1].register_forward_hook(hook_t)

        #self.data_inv_transform= transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.255], std=[1/0.229, 1/0.224, 1/0.255])

        self.data_inv_transform = transforms.Normalize(
            mean=[
                -normalization_mean[0] / normalization_std[0],
                -normalization_mean[1] / normalization_std[1],
                -normalization_mean[2] / normalization_std[2]
            ],
            std=[
                1 / normalization_std[0], 1 / normalization_std[1],
                1 / normalization_std[2]
            ])

        # dummy loss. No Update parameters is performed
        self.criterion = torch.nn.MSELoss(reduction='sum')

        self.init_results_list()

    def init_results_list(self):
        self.img_path_list = []
        self.mean_score_norm = []
        self.all_scores = []
        self.all_scores_mean_norm = []
        self.image_batch_list = []
        self.x_type_list = []
        self.y_true = []

    def init_features(self):
        self.features = []

    def forward(self, x_t):
        self.init_features()
        _ = self.model(x_t)
        return self.features

    def save_anomaly_map(self, anomaly_map, input_img, gt_img, file_name,
                         x_type):
        if anomaly_map.shape != input_img.shape:
            anomaly_map = cv2.resize(anomaly_map,
                                     (input_img.shape[0], input_img.shape[1]))
        anomaly_map_norm = min_max_norm(anomaly_map)
        anomaly_map_norm_hm = cvt2heatmap(anomaly_map_norm * 255)

        # anomaly map on image
        heatmap = cvt2heatmap(anomaly_map_norm * 255)
        hm_on_img = heatmap_on_image(heatmap, input_img)

        # save images
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}.jpg'),
            input_img)
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}_amap.jpg'),
            anomaly_map_norm_hm)
        cv2.imwrite(
            os.path.join(self.sample_path,
                         f'{x_type}_{file_name}_amap_on_img.jpg'), hm_on_img)

    def configure_optimizers(self):
        return None

    def on_train_start(self):
        self.model.eval()  # to stop running_var move (maybe not critical)
        self.embedding_list = []

    def on_test_start(self):
        self.init_results_list()

        self.embedding_coreset = pickle.load(
            open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                 'rb'))
        embedded = torch.tensor(self.embedding_coreset)
        train_jit = TrainFeature(embedded)
        traced_model = torch.jit.script(train_jit)
        torch.jit.save(traced_model, "patchcore_features.pt")

    def training_step(self, batch,
                      batch_idx):  # save locally aware patch features
        x, _, file_name, _ = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding = embedding_concat(embeddings[0], embeddings[1])
        self.embedding_list.extend(reshape_embedding(np.array(embedding)))
        gc.collect()

    def training_epoch_end(self, outputs):
        total_embeddings = np.array(self.embedding_list)
        # Random projection
        self.randomprojector = SparseRandomProjection(
            n_components='auto',
            eps=0.9)  # 'auto' => Johnson-Lindenstrauss lemma
        self.randomprojector.fit(total_embeddings)
        # Coreset Subsampling
        selector = kCenterGreedy(total_embeddings, 0, 0)
        selected_idx = selector.select_batch(
            model=self.randomprojector,
            already_selected=[],
            N=int(total_embeddings.shape[0] *
                  float(self.coreset_sampling_ratio)))
        self.embedding_coreset = total_embeddings[selected_idx]

        print('initial embedding size : ', total_embeddings.shape)
        print('final embedding size : ', self.embedding_coreset.shape)
        with open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                  'wb') as f:
            pickle.dump(self.embedding_coreset, f)
        gc.collect()

    def test_step(self, batch, batch_idx):  # Nearest Neighbour Search

        x, label, file_name, x_type = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding_ = embedding_concat(embeddings[0], embeddings[1])
        embedding_test = np.array(reshape_embedding(np.array(embedding_)))

        # NN
        knn = KNN(torch.from_numpy(self.embedding_coreset).cuda(),
                  k=self.n_neighbors)
        score_patches = knn(
            torch.from_numpy(embedding_test).cuda())[0].cpu().detach().numpy()
        self.img_path_list.extend(file_name)
        # support multi input size

        block_size = int(np.sqrt(len(score_patches)))
        anomaly_map = score_patches[:, 0].reshape((block_size, block_size))
        self.all_scores.append(anomaly_map)
        self.image_batch_list.append(x)
        self.x_type_list.append(x_type)
        self.y_true.append(label.cpu().numpy()[0])

    def Find_Optimal_Cutoff(self, target, predicted):
        fpr, tpr, threshold = roc_curve(target, predicted, pos_label=1)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({
            'tf': pd.Series(tpr - (1 - fpr), index=i),
            'threshold': pd.Series(threshold, index=i)
        })
        roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
        return list(roc_t['threshold']), threshold
        '''
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], '--', color='black')  
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
        '''

    def analyze_data(self):
        score_patches = np.array(self.all_scores)
        for i, val in enumerate(score_patches):
            self.all_scores_mean_norm.append(np.mean(val))

        min_score = np.min(score_patches)
        max_score = np.max(score_patches)

        print("MIN SCORE {}".format(min_score))
        print("MAX SCORE {}".format(max_score))

        scores = (score_patches - min_score) / (max_score - min_score)
        for i, heatmap in enumerate(scores):
            anomaly_map_resized = cv2.resize(
                heatmap, (self.input_image_size, self.input_image_size))
            max_ = np.max(heatmap)
            min_ = np.min(heatmap)

            anomaly_map_resized_blur = gaussian_filter(anomaly_map_resized,
                                                       sigma=4)
            anomaly_map_resized_blur[0][0] = 1.

            # save images
            x = self.image_batch_list[i]
            x = self.data_inv_transform(x)
            input_x = cv2.cvtColor(
                x.permute(0, 2, 3, 1).cpu().numpy()[0] * 255,
                cv2.COLOR_BGR2RGB)
            if anomaly_map_resized_blur.shape != input_x.shape:
                anomaly_map_resized_blur = cv2.resize(
                    anomaly_map_resized_blur,
                    (input_x.shape[0], input_x.shape[1]))

            if self.anomal_threshold != 0:
                anomaly_threshold_index = anomaly_map_resized_blur[
                    anomaly_map_resized_blur > self.anomal_threshold]
                anomaly_map_resized_blur[
                    anomaly_map_resized_blur < self.anomal_threshold] = 0
                anomaly_threshold_area = anomaly_threshold_index.size
                anomaly_threshold_area = anomaly_threshold_area / \
                    float(anomaly_map_resized_blur.size) * 100.
                self.all_scores_mean_norm[i] = anomaly_threshold_area

            # anomaly map on image
            heatmap = cvt2heatmap(anomaly_map_resized_blur * 255)
            hm_on_img = heatmap_on_image(heatmap, input_x)

            # save images
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}.jpg'),
                input_x)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap.jpg'),
                heatmap)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap_on_img.jpg'
                ), hm_on_img)

    def test_epoch_end(self, outputs):
        self.analyze_data()

        best_th, threshold = self.Find_Optimal_Cutoff(
            self.y_true, self.all_scores_mean_norm)
        print(f'\nbest threshold={best_th}')
        ng_index = np.where(np.array(self.y_true) == 1)
        if len(ng_index[0]) == 0:
            ng_index = len(self.y_true)
        else:
            ng_index = ng_index[0][0]
        fig = plt.figure()
        sns.histplot(self.all_scores_mean_norm[:ng_index],
                     kde=True,
                     color="blue",
                     label="normal")
        sns.histplot(self.all_scores_mean_norm[ng_index:],
                     kde=True,
                     color="red",
                     label="abnormal")
        fig.legend(labels=['normal', 'abnormal'])
        plt.xlabel("Anomaly score")
        plt.ylabel("Count")
        plt.savefig('Anomaly_score_histplot.jpg')
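
# Side note on the n_components='auto' setting used in training_epoch_end above: a
# small, self-contained illustration (not part of this module) of how scikit-learn
# derives the target dimension from the Johnson-Lindenstrauss bound for a given
# number of samples and distortion eps. n_patches below is a hypothetical count.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

n_patches = 100_000
for eps in (0.1, 0.5, 0.9):
    print('eps=%.1f -> min components: %d' %
          (eps, johnson_lindenstrauss_min_dim(n_samples=n_patches, eps=eps)))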
Example #39
 processor.latext_start_figure()
 X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
 n_clusters = len(dataset.label_encoder.classes_)
 pca = PCA(n_components=0.95)
 pca.fit(X_train)
 n_components = pca.components_.shape[0]
 print(f"n_components: {n_components}")
 dr_models = [
     PCA(n_components=n_components, random_state=0),
     FastICA(n_components=n_components, random_state=0),
     MiniBatchDictionaryLearning(n_components=n_components,
                                 alpha=1,
                                 batch_size=200,
                                 n_iter=10,
                                 random_state=0),
     SparseRandomProjection(random_state=0, n_components=n_components)
 ]
 clustering_models = [
     KMeans(n_clusters=n_clusters,
            init='k-means++',
            n_init=10,
            max_iter=600,
            random_state=0,
            tol=0.0001),
     GaussianMixture(n_components=n_clusters,
                     n_init=10,
                     max_iter=600,
                     random_state=0,
                     tol=0.0001)
 ]
 for pca in dr_models:

def encode_target(df, target_column):
    # Reconstructed header for the helper called in __main__ below: encodes the
    # target column's class labels as integers.
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column].replace(map_to_int, inplace=True)
    return (df_mod, map_to_int)

if __name__ == "__main__":
    mushroom_data = pd.read_csv("mushroom_data.csv")
    dft, mapping = encode_target(mushroom_data, "class")
    dft.to_csv('mushroom_datanew.csv')
    X = dft.iloc[:, :-1]
    y = dft.iloc[:, -1]
    
    #randomized projection
    tmp = defaultdict(dict)
    dims = range(1, 22)
    for i, dim in product(range(20), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    tmp
    tmp.to_csv('rp_mushroom_iterations.csv')
    
    
    tmp_fit = defaultdict(dict)
    for i, dim in product(range(20), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        tmp_fit[dim][i] = reconstructionError(rp, X)
    tmp_fit = pd.DataFrame(tmp_fit).T
    tmp_fit
    tmp_fit.to_csv('rp_mushroom_new_data.csv')

    grid = {'rp__n_components': dims, 'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
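
# pairwiseDistCorr and reconstructionError are helper functions defined elsewhere in
# this assignment's code; the rough sketch below (an assumption, not the original
# implementation) shows what pairwiseDistCorr typically measures: the correlation
# between pairwise distances before and after the random projection, for numeric
# feature matrices.
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr

def pairwise_dist_corr_sketch(X_projected, X_original):
    # A correlation near 1 means the projection roughly preserves distance structure.
    return pearsonr(pdist(X_projected), pdist(X_original))[0]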
Example #41
##NEURAL NETWORK
from sklearn.neural_network import MLPClassifier
#LEARNIGN CURVE PLOT
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

###PCA
##X_train = PCA(n_components=3).fit_transform(X_train)
##X_test = PCA(n_components=3).fit_transform(X_test)

####RP
rp = SparseRandomProjection(n_components=3)
X_train = rp.fit_transform(X_train)
X_test = rp.transform(X_test)

mlp = MLPClassifier(activation='logistic', solver='adam', max_iter=260)
mlp.fit(X_train, y_train)  
nn_pred = mlp.predict(X_test)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
Example #42
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]

    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

    train['grp_' + str(i)] = grp_results_train[:,i-1]
tsvd = TruncatedSVD(n_components=n_components, random_state=420)
df_tsvd = pd.DataFrame(tsvd.fit_transform(train), columns=columns)
df_test_tsvd = pd.DataFrame(tsvd.transform(test), columns=columns)

# GRP
columns = ['GRP_{}'.format(i) for i in range(n_components)]
grp = GaussianRandomProjection(n_components=n_components,
                               eps=0.1,
                               random_state=420)
df_grp = pd.DataFrame(grp.fit_transform(train), columns=columns)
df_test_grp = pd.DataFrame(grp.transform(test), columns=columns)

# SRP
columns = ['SRP_{}'.format(i) for i in range(n_components)]
srp = SparseRandomProjection(n_components=n_components,
                             dense_output=True,
                             random_state=420)
df_srp = pd.DataFrame(srp.fit_transform(train), columns=columns)
df_test_srp = pd.DataFrame(srp.transform(test), columns=columns)

train = pd.concat([train, df_pca, df_ica, df_tsvd, df_grp, df_srp], axis=1)
test = pd.concat(
    [test, df_test_pca, df_test_ica, df_test_tsvd, df_test_grp, df_test_srp],
    axis=1)

### FEATURE SELECTION ###
# f_regression
#f_sel = SelectKBest(score_func = f_regression, k = 'all')
#train_red = pd.DataFrame(f_sel.fit_transform(train, y))
#f_scores = pd.Series(f_sel.scores_)
#pvalues = pd.Series(f_sel.pvalues_)
data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print("Projected %d samples from %d to %d in %0.3fs"
          % (n_samples, n_features, n_components, time() - t0))
    if hasattr(rp, 'components_'):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
Example #45
kurtosis = collections.defaultdict(list)
for i in range(1, num_components + 1):
    kurtosis['num components'].append(i)
    ica = FastICA(n_components=i)
    ica_transformed_data = ica.fit_transform(X_default_train)
    kurtosis['avg kurtosis'].append(
        pd.DataFrame(data=ica_transformed_data).kurt(axis=0).abs().mean())
kurtosis_df = pd.DataFrame(data=kurtosis)
kurtosis_df.to_csv('default_avg_kurtosis.csv')

num_components = 16
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
    rp_nnm = Pipeline([('rp', rp), ('nnm', nnm)])
    rp_nnm.fit(X_digits_train, y_digits_train)
    accuracy_score = metrics.accuracy_score(rp_nnm.predict(X_digits_test),
                                            y_digits_test)
    rp_stats['accuracy score'].append(accuracy_score)
rp_df = pd.DataFrame(data=rp_stats)
rp_df.to_csv('digits_rp_data.csv')

num_components = 23
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
Example #46
f = open('../data/article_text.p', 'wb')
cPickle.dump(articles, f, protocol=-1)

print "saving done"

print len(articles)

vec = TfidfVectorizer(max_df=0.8, sublinear_tf=True)

X = vec.fit_transform(articles)


print X.shape

proj = SparseRandomProjection()

X = proj.fit_transform(X)

print X.shape

sparse_save(X,"../data/tfidf.h5")

# f = open('X_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#X = normalize(X)
# Part 2: perform sparse random projection of the faces dataset

faces_data = fetch_olivetti_faces().data
n_samples, n_features = faces_data.shape
print "Embedding %d faces with dim %d using various random projections" % (
    n_samples, n_features)

n_components_range = np.array([50, 200, 1000])
dists = euclidean_distances(faces_data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(faces_data)
    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    pl.figure()
    pl.hexbin(dists, projected_dists, gridsize=100)
    pl.xlabel("Pairwise squared distances in original space")
    pl.ylabel("Pairwise squared distances in projected space")
    pl.title("Pairwise distances distribution for n_components=%d" %
             n_components)
    cb = pl.colorbar()
    cb.set_label('Sample pairs counts')

    rates = projected_dists / dists