Example #1
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


def data_reduction(topic_2_doc, n_clusters):
    reduced_data = PCA(n_components=2).fit_transform(topic_2_doc)
    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4)
    kmeans.fit(reduced_data.astype("float"))
    # step size of the mesh
    h = 0.005
    # build the mesh over which the decision boundary will be plotted
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # label every mesh point with its nearest cluster centre
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    return reduced_data, kmeans, x_min, x_max, y_min, y_max, xx, yy, Z
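
A minimal usage sketch (assumed caller, not part of the original example; the
random matrix below stands in for a real topic-to-document matrix):

import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
topic_2_doc = rng.rand(200, 10)  # placeholder data
reduced_data, kmeans, x_min, x_max, y_min, y_max, xx, yy, Z = \
    data_reduction(topic_2_doc, n_clusters=3)
# paint each mesh point with its cluster colour, then overlay the data points
plt.imshow(Z.reshape(xx.shape), extent=(x_min, x_max, y_min, y_max),
           origin='lower', cmap='Paired', aspect='auto')
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c='k', s=4)
plt.show()
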
Example #2
import numpy as np
from sklearn.decomposition import PCA


def pca_reduction(features, n_components=0.8, svd_solver='full'):
    """Apply PCA dimensionality reduction.

    Args:
      features: matrix of dimension [N, D], where N is the number of datapoints
        and D the feature dimensionality to reduce.
      n_components: if > 1, reduce the dimensionality of the features to this
        value; if 0 < n_components < 1, select the number of components such
        that the percentage of variance explained is greater than
        (n_components * 100).
      svd_solver: SVD solver to use. By default we compute the exact full SVD.

    Returns:
      reduced_features: matrix [N, K] of features with reduced dimensionality K.
    """
    reduced_features = PCA(n_components=n_components,
                           svd_solver=svd_solver).fit_transform(features)
    return reduced_features.astype(np.float32)
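
A quick call sketch (assumed, not from the original snippet): with the default
n_components=0.8, PCA keeps the smallest number of components that explains
more than 80% of the variance.

features = np.random.rand(100, 50).astype(np.float32)
reduced = pca_reduction(features)
print(reduced.shape)  # (100, K) with K <= 50
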
Example #3

import pandas as pd

X_train = X_train.to_numpy()
y_train = y_train.to_numpy()[:, -1]

########################### Test Set ######################
## This is a separate test file, used for final model validation
final_test = pd.read_csv('adult_test.csv')
final_test_y = final_test['target']
final_test_x = final_test.drop(['target', 'native-country'], axis=1)
final_test_y = pd.get_dummies(final_test_y)
final_test_x = pd.get_dummies(final_test_x)
y_test = final_test_y.to_numpy()[:, 1]
X_test = final_test_x[column_name]
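
Note (an assumption about code elided from this snippet): column_name is
presumably the list of dummy columns built from the training set, so indexing
with it keeps train and test aligned. A more defensive variant reindexes and
fills categories missing from the test file with zeros:

X_test = final_test_x.reindex(columns=column_name, fill_value=0)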

######################################################## PCA #################################################
from sklearn.decomposition import PCA
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Fit PCA on the training data only and reuse the same projection for the
# test set; fitting a second PCA on X_test would project it onto different
# axes than the ones the classifier was trained on.
pca = PCA(n_components=X_train.shape[1])
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
# Note: keeping all components is a rotation, not a reduction; lower
# n_components to actually shrink the feature space.
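
Optional follow-up (an assumption, not in the original): inspect the cumulative
explained variance to pick an n_components that actually reduces the feature
count.

import numpy as np
print(np.cumsum(pca.explained_variance_ratio_)[:10])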

############################################ Neural Networks ################################################
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='sgd',
                    activation='logistic',
                    alpha=1e-5,
                    random_state=1)
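
Assumed continuation (the snippet stops after constructing the classifier):
fit on the PCA-transformed training data and score on the held-out test file.

clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))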
Example #4
                            help='GPU device to be used.')
    args = parser.parse_args()

    # Load model and image list
    images = get_dataset_images(args.img_dir)
    if len(images) == 0:
        print('Could not find any images. Have you set --img_dir correctly?')
        exit()
    if args.gpu is not None:
        caffe.set_mode_gpu()
        caffe.set_device(args.gpu)
    else:
        caffe.set_mode_cpu()
    net = caffe.Net(args.model, caffe.TEST, weights=args.weights)
    mean = np.loadtxt(args.mean)

    # Extract features
    features = np.array([
        extract_cnn_features(net, mean, img, 'fc6')
        for img in tqdm(images, desc='Extracting CNN features...',
                        dynamic_ncols=True)
    ], dtype=np.float32)

    # L2-normalise, decorrelate with PCA, then re-normalise
    # (a common whitening recipe for retrieval features)
    features /= np.linalg.norm(features, axis=1, keepdims=True)
    features = PCA(args.pca_dim).fit_transform(features)
    features /= np.linalg.norm(features, axis=1, keepdims=True)

    # Save features
    np.save(args.feature_dump, features.astype(np.float32, copy=False))
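
A minimal consumer sketch (assumed; 'features.npy' stands in for whatever path
was passed as --feature_dump): reload the dump and confirm each row is
unit-norm after the norm-PCA-norm pipeline.

import numpy as np
feats = np.load('features.npy')
print(feats.shape, np.linalg.norm(feats, axis=1)[:5])  # norms should be ~1.0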