def sandwich_demo():
    """Plot the sandwich data in input space and after each metric learner.

    The top row shows the raw data with its 2-nearest-neighbor graph. The
    remaining four subplots show the same data after fitting and
    transforming with LMNN, ITML, SDML and LSML respectively.
    """
    x, y = sandwich_data()
    knn = nearest_neighbors(x, k=2)
    ax = pyplot.subplot(3, 1, 1)  # take the whole top row
    plot_sandwich_data(x, y, ax)
    plot_neighborhood_graph(x, knn, y, ax)
    ax.set_title('input space')
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])

    num_constraints = 60
    mls = [
        (LMNN(), (x, y)),
        (ITML(), (x, ITML.prepare_constraints(y, len(x), num_constraints))),
        (SDML(), (x, SDML.prepare_constraints(y, len(x), num_constraints))),
        (LSML(), (x, LSML.prepare_constraints(y, num_constraints))),
    ]

    # Subplot slots 3..6 of the 3x2 grid sit below the full-width top row.
    # `range` (instead of the Python 2-only `xrange`) works on Python 2 and 3.
    for ax_num, (ml, args) in zip(range(3, 7), mls):
        ml.fit(*args)
        tx = ml.transform()
        ml_knn = nearest_neighbors(tx, k=2)
        ax = pyplot.subplot(3, 2, ax_num)
        plot_sandwich_data(tx, y, ax)
        plot_neighborhood_graph(tx, ml_knn, y, ax)
        ax.set_title('%s space' % ml.__class__.__name__)
        ax.set_xticks([])
        ax.set_yticks([])
    pyplot.show()
def sdml_fit(samples, similarity_set, prior='covariance', balance_param=0.15):
    """Fit an SDML model on every ordered index pair of `similarity_set`.

    Prior can be 'covariance', 'identity' or 'random'.
    balance_param was used 0.5 in the first version of the paper, but it
    does not work here with such a large value.

    `samples` is handed to SDML as its preprocessor, so the pairs passed to
    `fit` are pairs of indices. A pair is labelled 1 when
    `similarity_set[i, j]` is truthy and -1 otherwise. The fitting time is
    printed and the fitted model is returned.
    """
    size = len(similarity_set)
    model = SDML(prior=prior, preprocessor=samples, verbose=True,
                 balance_param=balance_param)

    # All (i, j) combinations in row-major order, including i == j.
    index_pairs = [[row, col] for row in range(size) for col in range(size)]
    labels = [1 if similarity_set[row, col] else -1
              for row, col in index_pairs]

    began = time()
    model.fit(index_pairs, labels)
    print("Fitting took {:.2f} seconds.".format(time() - began))
    return model
def test_sdml_converges_if_psd(self):
    """Check that SDML yields a finite Mahalanobis matrix on a simple problem
    where we know the pseudo-covariance matrix is PSD."""
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., -55.], [0., -60]]])
    model = SDML(use_cov=True, sparsity_param=0.01, balance_param=0.5)
    model.fit(pair_points, labels)
    learned_matrix = model.get_mahalanobis_matrix()
    assert np.isfinite(learned_matrix).all()
def update(request):
    """Learn an SDML metric from the labelled pairs, then cluster and render.

    Reads ``data/outcome_labels.csv`` and ``data/features_rep.csv`` under
    ``settings.BASE_DIR``, builds a symmetric connectivity matrix over the
    ids found in the ``id1``/``id2`` columns, fits SDML on the matching
    feature rows, transforms the full feature table and passes the result
    to ``clusteringAndTSNE`` together with the algorithm and cluster count
    stored in the session.

    Assumes the first three columns of the label file are (id1, id2, label)
    and that the ids are positional row indices into the feature table --
    TODO confirm against the data files.
    """
    df_label = pd.read_csv(
        os.path.join(settings.BASE_DIR, 'data/outcome_labels.csv'))
    print("df_label", '\n', df_label)
    df_data = pd.read_csv(
        os.path.join(settings.BASE_DIR, 'data/features_rep.csv'))

    # Unique row ids, in order of first appearance (id1 column, then id2).
    rowIDLIst = pd.concat([df_label.id1, df_label.id2], axis=0).unique().tolist()
    print("rowIDLIst", '\n', rowIDLIst)
    # id -> matrix position, built once instead of list.index() per lookup.
    position = {rid: pos for pos, rid in enumerate(rowIDLIst)}

    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    label_rows = df_label.values
    print("as_Matrix", '\n', label_rows)

    # connectivity graph
    cmatrix = np.zeros([len(rowIDLIst), len(rowIDLIst)])
    for lbl in label_rows:
        print("lbl", lbl)
        print("lbl[0]", lbl[0])
        print("lbl[1]", lbl[1])
        print("lbl[2]", lbl[2])
        first, second = position[lbl[0]], position[lbl[1]]
        print("rowIDLIst.index(lbl[0])", first,
              "rowIDLIst.index(lbl[1])", second)
        # The graph is undirected, so mirror the label across the diagonal.
        cmatrix[first][second] = int(lbl[2])
        cmatrix[second][first] = int(lbl[2])
    print("cmatrix", '\n', cmatrix)

    trainedData = [df_data.iloc[[rid]] for rid in rowIDLIst]
    print("trainedData1", '\n', trainedData)
    trainedData = pd.concat(trainedData, axis=0).values
    print("trainedData2", "\n", trainedData)

    metric = SDML().fit(trainedData, cmatrix)
    newData = metric.transform(df_data)

    al_selection = request.session['clustering']
    num_clustering = request.session['num_cluster']
    clusteringAndTSNE(newData, al_selection, num_clustering)

    # Template context for the visualisation page.
    content = {'Title': "Step 7: Clustering Visualization", "listId": "li7"}
    return render(request, 'clustering/stp7-clu-visualisation.html', content)
def test_verbose_has_not_installed_skggm_sdml(capsys):
    # Check that a verbose SDML fit prints a message saying scikit-learn's
    # graphical lasso solver is used (per the test name, this is meant for
    # environments where skggm is not installed).
    # TODO: remove if we don't need skggm anymore
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., -55.], [0., -60]]])
    model = SDML(verbose=True)
    model.fit(pair_points, labels)
    # Only stdout is of interest; stderr is ignored.
    stdout = capsys.readouterr()[0]
    assert "SDML will use scikit-learn's graphical lasso solver." in stdout
def test_iris(self):
    """Fit SDML on iris with a dense and a sparse constraint graph and check
    that class separation of the transformed data stays below 0.25."""
    constraint_count = 1500
    point_count = self.iris_points.shape[0]

    np.random.seed(1234)
    dense_graph = SDML.prepare_constraints(self.iris_labels, point_count,
                                           constraint_count)
    # Test sparse graph inputs as well as the dense one.
    sparse_graph = scipy.sparse.csr_matrix(dense_graph)

    for graph in (dense_graph, sparse_graph):
        fitted = SDML().fit(self.iris_points, graph)
        separation = class_separation(fitted.transform(), self.iris_labels)
        self.assertLess(separation, 0.25)
def update():
    """Learn an SDML metric from the labelled pairs and cluster the result.

    Reads ``data/outcome_labels.csv`` and ``data/features_rep.csv`` relative
    to the working directory, builds a symmetric connectivity matrix over
    the ids found in the ``id1``/``id2`` columns, fits SDML on the matching
    feature rows, transforms the full feature table and passes it to
    ``clusteringAndTSNE``.

    Assumes the first three columns of the label file are (id1, id2, label)
    and that the ids are positional row indices into the feature table --
    TODO confirm against the data files.
    """
    # Each print takes a single pre-formatted string so it behaves the same
    # as a Python 2 print statement and as the Python 3 print function.
    df_label = pd.read_csv('data/outcome_labels.csv')
    print("df_label \n{}".format(df_label))
    df_data = pd.read_csv('data/features_rep.csv')
    print("df_data \n{}".format(df_data))

    # Unique row ids, in order of first appearance (id1 column, then id2).
    rowIDLIst = pd.concat([df_label.id1, df_label.id2], axis=0).unique().tolist()
    print("rowIDLIst \n{}".format(rowIDLIst))
    # id -> matrix position, built once instead of list.index() per lookup.
    position = {rid: pos for pos, rid in enumerate(rowIDLIst)}

    # connectivity graph
    cmatrix = np.zeros([len(rowIDLIst), len(rowIDLIst)])
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    for lbl in df_label.values:
        first, second = position[lbl[0]], position[lbl[1]]
        # The graph is undirected, so mirror the label across the diagonal.
        cmatrix[first][second] = int(lbl[2])
        cmatrix[second][first] = int(lbl[2])
    print("cmatrixShape \n{}".format(cmatrix.shape))

    trainedData = pd.concat(
        [df_data.iloc[[rid]] for rid in rowIDLIst], axis=0).values
    print("trainedData.shape \n{}".format(trainedData.shape))

    metric = SDML().fit(trainedData, cmatrix)
    newData = metric.transform(df_data)
    clusteringAndTSNE(newData)
def test_iris(self):
    """Fit SDML on iris with a dense and a sparse constraint graph and check
    that class separation of the transformed data stays below 0.25."""
    constraint_count = 1500
    point_count = self.iris_points.shape[0]

    # Note: this is a flaky test, which fails for certain seeds.
    # TODO: un-flake it!
    np.random.seed(5555)
    dense_graph = SDML.prepare_constraints(self.iris_labels, point_count,
                                           constraint_count)
    # Test sparse graph inputs as well as the dense one.
    sparse_graph = scipy.sparse.csr_matrix(dense_graph)

    for graph in (dense_graph, sparse_graph):
        fitted = SDML().fit(self.iris_points, graph)
        separation = class_separation(fitted.transform(), self.iris_labels)
        self.assertLess(separation, 0.25)
def test_sdml_raises_warning_msg_installed_skggm(self):
    """Check the RuntimeError message SDML raises when skggm's graphical
    lasso solver fails (per the test name, this is meant for environments
    where skggm is installed)."""
    # TODO: remove if we don't need skggm anymore
    # Toy case on which we know that skggm's graphical lasso fails
    # because it will return non finite values.
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., 50.], [0., -60]]])
    model = SDML(use_cov=False, balance_param=100, verbose=True)
    expected = ("There was a problem in SDML when using skggm's graphical "
                "lasso solver.")
    with pytest.raises(RuntimeError) as excinfo:
        model.fit(pair_points, labels)
    assert expected == str(excinfo.value)
def test_raises_no_warning_installed_skggm(self):
    """Check that fitting SDML and SDML_Supervised emits no ConvergenceWarning.

    We should be able to instantiate and fit both estimators without an
    error; any warning recorded during the fit must not be a
    ConvergenceWarning.
    """
    # Local import keeps this block self-contained. `pytest.warns(None)` is
    # deprecated in pytest 7 and rejected by pytest 8, so warnings are
    # recorded with the standard library instead.
    import warnings

    pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]])
    y_pairs = [1, -1]
    X, y = make_classification(random_state=42)

    with warnings.catch_warnings(record=True) as records:
        # Record every warning, even ones already triggered elsewhere.
        warnings.simplefilter("always")
        sdml = SDML(prior='covariance')
        sdml.fit(pairs, y_pairs)
    for record in records:
        assert record.category is not ConvergenceWarning

    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        sdml_supervised = SDML_Supervised(prior='identity',
                                          balance_param=1e-5)
        sdml_supervised.fit(X, y)
    for record in records:
        assert record.category is not ConvergenceWarning
def sandwich_demo():
    """Plot the sandwich data in input space and after each metric learner.

    The top row shows the raw data with its 2-nearest-neighbor graph. The
    remaining four subplots show the same data after fitting and
    transforming with LMNN, ITML, SDML and LSML respectively.
    """
    x, y = sandwich_data()
    knn = nearest_neighbors(x, k=2)
    ax = pyplot.subplot(3, 1, 1)  # take the whole top row
    plot_sandwich_data(x, y, ax)
    plot_neighborhood_graph(x, knn, y, ax)
    ax.set_title('input space')
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])

    num_constraints = 60
    mls = [
        (LMNN(), (x, y)),
        (ITML(), (x, ITML.prepare_constraints(y, len(x), num_constraints))),
        (SDML(), (x, SDML.prepare_constraints(y, len(x), num_constraints))),
        (LSML(), (x, LSML.prepare_constraints(y, num_constraints))),
    ]

    # Subplot slots 3..6 of the 3x2 grid sit below the full-width top row.
    # `range` (instead of the Python 2-only `xrange`) works on Python 2 and 3.
    for ax_num, (ml, args) in zip(range(3, 7), mls):
        ml.fit(*args)
        tx = ml.transform()
        ml_knn = nearest_neighbors(tx, k=2)
        ax = pyplot.subplot(3, 2, ax_num)
        plot_sandwich_data(tx, y, ax)
        plot_neighborhood_graph(tx, ml_knn, y, ax)
        ax.set_title('%s space' % ml.__class__.__name__)
        ax.set_xticks([])
        ax.set_yticks([])
    pyplot.show()
def test_sdml_raises_warning_non_psd(self):
    """Check that SDML emits the non-PSD ConvergenceWarning on a toy example
    where we know the pseudo-covariance matrix is not PSD."""
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., 50.], [0., -60]]])
    model = SDML(use_cov=True, sparsity_param=0.01, balance_param=0.5)
    expected = ("Warning, the input matrix of graphical lasso is not "
                "positive semi-definite (PSD). The algorithm may diverge, "
                "and lead to degenerate solutions. "
                "To prevent that, try to decrease the balance parameter "
                "`balance_param` and/or to set use_cov=False.")
    with pytest.warns(ConvergenceWarning) as caught:
        # The fit itself may fail on this input; only the warning matters.
        try:
            model.fit(pair_points, labels)
        except Exception:
            pass
    # The expected text must match one of the warnings raised by the
    # estimator.
    emitted = [str(warning.message) for warning in caught]
    assert expected in emitted
def test_sdml_raises_warning_msg_not_installed_skggm(self):
    """Check the RuntimeError message SDML raises when scikit-learn's
    graphical lasso solver fails (per the test name, this is meant for
    environments where skggm is not installed)."""
    # TODO: remove if we don't need skggm anymore
    # Toy case on which we know that scikit-learn's graphical lasso fails
    # because it will return a non SPD matrix.
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., 50.], [0., -60]]])
    model = SDML(use_cov=False, balance_param=100, verbose=True)
    expected = ("There was a problem in SDML when using scikit-learn's graphical "
                "lasso solver. skggm's graphical lasso can sometimes converge on "
                "non SPD cases where scikit-learn's graphical lasso fails to "
                "converge. Try to install skggm and rerun the algorithm (see "
                "the README.md for the right version of skggm).")
    with pytest.raises(RuntimeError) as excinfo:
        model.fit(pair_points, labels)
    assert expected == str(excinfo.value)
def test_sdml_raises_warning_msg_not_installed_skggm(self):
    """Check the RuntimeError message SDML raises when scikit-learn's
    graphical lasso solver fails (per the test name, this is meant for
    environments where skggm is not installed)."""
    # TODO: remove if we don't need skggm anymore
    # Toy case on which we know that scikit-learn's graphical lasso fails
    # because it will return a non SPD matrix.
    labels = [1, -1]
    pair_points = np.array([[[-10., 0.], [10., 0.]],
                            [[0., 50.], [0., -60]]])
    model = SDML(prior='identity', balance_param=100, verbose=True)
    expected = ("There was a problem in SDML when using scikit-learn's graphical "
                "lasso solver. skggm's graphical lasso can sometimes converge on "
                "non SPD cases where scikit-learn's graphical lasso fails to "
                "converge. Try to install skggm and rerun the algorithm (see "
                "the README.md for the right version of skggm).")
    with pytest.raises(RuntimeError) as excinfo:
        model.fit(pair_points, labels)
    assert expected == str(excinfo.value)
def metricLearning(data, labels_path='../TestAndLearn/data/outcome_labels.csv'):
    """Fit SDML on the labelled rows of `data` and return `data` transformed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. Rows are selected with ``iloc`` using the ids from
        the label file, so those ids are treated as positional indices.
    labels_path : str, optional
        CSV file with ``id1`` and ``id2`` columns. Its first three columns
        are read as (id1, id2, label) -- TODO confirm against the data
        file. Defaults to the location that was previously hard-coded.

    Returns
    -------
    The result of ``SDML().fit(...).transform(data)``.
    """
    df_label = pd.read_csv(labels_path)

    # Unique row ids, in order of first appearance (id1 column, then id2).
    rowIDLIst = pd.concat([df_label.id1, df_label.id2], axis=0).unique().tolist()
    print("rowIDLIst", '\n', rowIDLIst)
    # id -> matrix position, built once instead of list.index() per lookup.
    position = {rid: pos for pos, rid in enumerate(rowIDLIst)}

    # connectivity graph
    cmatrix = np.zeros([len(rowIDLIst), len(rowIDLIst)])
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    for lbl in df_label.values:
        first, second = position[lbl[0]], position[lbl[1]]
        # The graph is undirected, so mirror the label across the diagonal.
        cmatrix[first][second] = int(lbl[2])
        cmatrix[second][first] = int(lbl[2])
    print("cmatrix.shape", '\n', cmatrix.shape)

    trainedData = pd.concat(
        [data.iloc[[rid]] for rid in rowIDLIst], axis=0).values
    print("trainedData.shape", "\n", trainedData.shape)

    metric = SDML().fit(trainedData, cmatrix)
    newData = metric.transform(data)
    return newData
def test_raises_no_warning_installed_skggm(self):
    """Check that fitting SDML and SDML_Supervised emits no warning at all.

    We should be able to instantiate and fit both estimators, and no
    warning of any category may be recorded during either fit.
    """
    # Local import keeps this block self-contained. `pytest.warns(None)` is
    # deprecated in pytest 7 and rejected by pytest 8, so warnings are
    # recorded with the standard library instead.
    import warnings

    pairs = np.array([[[-10., 0.], [10., 0.]], [[0., -55.], [0., -60]]])
    y_pairs = [1, -1]
    X, y = make_classification(random_state=42)

    with warnings.catch_warnings(record=True) as record:
        # Record every warning, even ones already triggered elsewhere.
        warnings.simplefilter("always")
        sdml = SDML()
        sdml.fit(pairs, y_pairs)
    assert len(record) == 0

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        sdml = SDML_Supervised(use_cov=False, balance_param=1e-5)
        sdml.fit(X, y)
    assert len(record) == 0
def test_tiwafer():
    """Fit SDML on the tiwafer data and pickle the learned metric/transformer.

    Loads the data with ``load_data_sdml``, builds constraints from the
    labels with ``prepare_constraints_old``, fits SDML, then writes
    ``sdml.metric()`` to ``W_metric_sdml.p`` and ``sdml.transformer()`` to
    ``W_trans_sdml.p`` in the working directory.
    """
    num_constraints = 1500

    # Single-string prints behave the same as a Python 2 print statement
    # and as the Python 3 print function.
    print("Loading Data....")
    tiwafer_data = load_data_sdml()
    sorted_ids = tiwafer_data.sortedIds
    ti_data = np.array(tiwafer_data.data)
    labels = np.array(tiwafer_data.target)
    print("Done Loading Data.\nLearning Distance Metric....")

    num_points = len(sorted_ids)
    W = prepare_constraints_old(labels, num_points, num_constraints)
    sdml = SDML()
    sdml.fit(ti_data, W)

    # Both dumps use `with`, so each file is closed (and flushed) even if
    # pickling raises; the metric dump previously leaked its handle.
    W_metric = sdml.metric()
    with open('W_metric_sdml.p', 'wb') as handle:
        cPickle.dump(W_metric, handle)

    W_trans = sdml.transformer()
    with open('W_trans_sdml.p', 'wb') as handle:
        cPickle.dump(W_trans, handle)
# if preprocessor, we build a 2D array of quadruplets of indices return Dataset(c, target, X, c[:, 0]) else: # if not, we build a 3D array of quadruplets of samples return Dataset(X[c], target, None, X[c[:, 0]]) quadruplets_learners = [(LSML(), build_quadruplets)] ids_quadruplets_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in quadruplets_learners])) pairs_learners = [ (ITML(max_iter=2), build_pairs), # max_iter=2 to be faster (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster (SDML(use_cov=False, balance_param=1e-5), build_pairs) ] ids_pairs_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in pairs_learners])) classifiers = [(Covariance(), build_classification), (LFDA(), build_classification), (LMNN(), build_classification), (NCA(), build_classification), (RCA(), build_classification), (ITML_Supervised(max_iter=5), build_classification), (LSML_Supervised(), build_classification), (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=10), build_classification), (SDML_Supervised(use_cov=False, balance_param=1e-5), build_classification)] ids_classifiers = list(
def fit(self, X, y):
    """Build SDML constraints from the labels `y` and fit on them.

    NUM_CONSTRAINTS constraints are generated with
    ``SDML.prepare_constraints`` for ``len(X)`` points, then the parent
    class's ``fit`` is called with `X` and those constraints. Its return
    value is passed through.
    """
    constraint_graph = SDML.prepare_constraints(y, len(X), NUM_CONSTRAINTS)
    return super(SDML_sk, self).fit(X, constraint_graph)
# if preprocessor, we build a 2D array of quadruplets of indices return Dataset(c, target, X, c[:, 0]) else: # if not, we build a 3D array of quadruplets of samples return Dataset(X[c], target, None, X[c[:, 0]]) quadruplets_learners = [(LSML(), build_quadruplets)] ids_quadruplets_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in quadruplets_learners])) pairs_learners = [ (ITML(max_iter=2), build_pairs), # max_iter=2 to be faster (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster (SDML(prior='identity', balance_param=1e-5), build_pairs) ] ids_pairs_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in pairs_learners])) classifiers = [(Covariance(), build_classification), (LFDA(), build_classification), (LMNN(), build_classification), (NCA(), build_classification), (RCA(), build_classification), (ITML_Supervised(max_iter=5), build_classification), (LSML_Supervised(), build_classification), (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=5), build_classification), (SDML_Supervised(prior='identity', balance_param=1e-5), build_classification)] ids_classifiers = list(
#print ("lbl[0]",lbl[0]) #print ("lbl[1]",lbl[1]) #print ("lbl[2]",lbl[2]) #print ("rowIDLIst.index(lbl[0])", rowIDLIst.index(lbl[0]),"rowIDLIst.index(lbl[1])",rowIDLIst.index(lbl[1])) cmatrix[rowIDLIst.index(lbl[0])][rowIDLIst.index(lbl[1])] = int(lbl[2]) cmatrix[rowIDLIst.index(lbl[1])][rowIDLIst.index(lbl[0])] = int(lbl[2]) print "cmatrix.shape", '\n', cmatrix.shape trainedData = [] for rid in rowIDLIst: row = df_reperent.iloc[[rid]] #print "row","\n",row #print "rowType","\n",type(row) trainedData.append(row) #print "LentrainedData","\n", len(trainedData) #print "typetrainedData1", '\n', len(trainedData) trainedData = pd.concat(trainedData, axis=0).as_matrix() print "trainedData.shape", "\n", trainedData.shape #print "trainedData2", "\n", trainedData metric = SDML().fit(trainedData, cmatrix) newData = metric.transform(df_reperent) print type(newData) print newData.shape
# if preprocessor, we build a 2D array of quadruplets of indices return Dataset(c, target, X, c[:, 0]) else: # if not, we build a 3D array of quadruplets of samples return Dataset(X[c], target, None, X[c[:, 0]]) quadruplets_learners = [(LSML(), build_quadruplets)] ids_quadruplets_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in quadruplets_learners])) pairs_learners = [ (ITML(), build_pairs), (MMC(max_iter=2), build_pairs), # max_iter=2 for faster (SDML(), build_pairs), ] ids_pairs_learners = list( map(lambda x: x.__class__.__name__, [learner for (learner, _) in pairs_learners])) classifiers = [(Covariance(), build_classification), (LFDA(), build_classification), (LMNN(), build_classification), (NCA(), build_classification), (RCA(), build_classification), (ITML_Supervised(max_iter=5), build_classification), (LSML_Supervised(), build_classification), (MMC_Supervised(max_iter=5), build_classification), (RCA_Supervised(num_chunks=10), build_classification), (SDML_Supervised(), build_classification)] ids_classifiers = list( map(lambda x: x.__class__.__name__,