def sfa_transfer(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, feat_type=2,
                 nclusters=100, num_di_words=1000):
    """Score a plain baseline against features augmented by SFA.

    Both domains are term-weighted and handed to
    ``spectral_feature_alignement``; the aligned features it returns are then
    appended (``gamma=0.5``) to a second, freshly term-weighted copy of each
    domain. ``no_transfer`` is called twice: once on the inputs exactly as
    given (no term weighting) and once on the augmented matrices.

    Args:
        Xs: Source-domain feature matrix.
        Ys: Source-domain labels.
        Xt: Target-domain feature matrix.
        Yt: Target-domain labels.
        clf_class: Classifier class forwarded to ``no_transfer``.
        feat_type: Weighting scheme forwarded to ``termweight.term_weighting``.
        nclusters: Forwarded to ``spectral_feature_alignement`` as ``nk``.
        num_di_words: Forwarded to ``spectral_feature_alignement`` as its third
            positional argument.

    Returns:
        Tuple ``(acc, acc_sfa)``: the ``no_transfer`` result on the unmodified
        inputs and the ``no_transfer`` result on the augmented features.
    """
    weighted_src = termweight.term_weighting(Xs, feat_type)
    weighted_tgt = termweight.term_weighting(Xt, feat_type)
    aligned_src, aligned_tgt = spectral_feature_alignement(
        weighted_src, weighted_tgt, num_di_words, nk=nclusters, normalize=True)

    # The base features are weighted again from the raw inputs rather than
    # reusing the matrices that were passed to the alignment step.
    base_src, base_tgt = [
        termweight.term_weighting(raw, feat_type) for raw in (Xs, Xt)
    ]
    augmented_src, augmented_tgt = [
        append_features(base, aligned, gamma=0.5)
        for base, aligned in ((base_src, aligned_src), (base_tgt, aligned_tgt))
    ]

    baseline_acc = no_transfer(Xs, Ys, Xt, Yt, clf_class=clf_class)
    return baseline_acc, no_transfer(augmented_src, Ys, augmented_tgt, Yt,
                                     clf_class=clf_class)
def mda_exp(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, noise=0.9,
            feat_type=2, layer_func=lambda x: layer_function(x, 3),
            filter_W_option=0, topk=50, cross_valid=True, use_Xr=True,
            use_bias=True):
    """Compare a bag-of-words baseline with mDA-derived features.

    Source and target documents are stacked with ``scipy.sparse.vstack``,
    optionally term-weighted, and fed (transposed) to one of the
    ``denoising_autoencoders`` routines. The resulting representation is
    scored by ``evaluate_mda_features``.

    Args:
        Xs: Source-domain document/term matrix.
        Ys: Source-domain labels.
        Xt: Target-domain document/term matrix.
        Yt: Target-domain labels.
        clf_class: Classifier class used for BOTH the baseline and the mDA
            evaluation, so the two returned scores are comparable.
        noise: Second positional argument of the mDA routine.
        feat_type: When ``> 0`` the stacked matrix is passed through
            ``term_weighting`` with this value; otherwise it is used as is.
        layer_func: Callable forwarded to the mDA routine as ``layer_func``.
        filter_W_option: Forwarded to ``mDA`` (not to ``mDA_without_bias``).
        topk: Forwarded to ``mDA`` (not to ``mDA_without_bias``).
        cross_valid: Forwarded to ``evaluate_mda_features``.
        use_Xr: When true, ``mDA`` also receives ``Xr``, the columns chosen by
            ``get_most_frequent_features``.
        use_bias: Only consulted when ``use_Xr`` is false; selects ``mDA``
            (true) or ``mDA_without_bias`` (false).

    Returns:
        Tuple ``(acc_bow, accuracy)``: the baseline result and the result of
        ``evaluate_mda_features`` on the mDA output.
    """
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])

    # Column selection happens on the stacked matrix BEFORE any re-weighting.
    word_selected = get_most_frequent_features(X_all, 5000)
    if feat_type > 0:
        X_all = term_weighting(X_all, feat_type=feat_type)

    # Forward clf_class so the baseline is trained with the same classifier
    # family as the mDA evaluation below.
    acc_bow = domain_adaptation_baseline.no_transfer(
        X_all[:ndocs_source, :], Ys, X_all[ndocs_source:, :], Yt,
        clf_class=clf_class)

    if use_Xr:
        # Only this variant needs the reduced matrix.
        Xdw_most_frequent = X_all[:, word_selected]
        hw, _ = denoising_autoencoders.mDA(
            X_all.T, noise, 1e-2, layer_func=layer_func,
            Xr=Xdw_most_frequent.T, filter_W_option=filter_W_option,
            topk=topk)
    elif use_bias:
        hw, _ = denoising_autoencoders.mDA(
            X_all.T, noise, 1e-2, layer_func=layer_func,
            filter_W_option=filter_W_option, topk=topk)
    else:
        print("Without Bias ....")
        hw, _ = denoising_autoencoders.mDA_without_bias(
            X_all.T, noise, 1e-2, layer_func=layer_func)

    accuracy = evaluate_mda_features(hw, Ys, Yt, ndocs_source, clf_class,
                                     cross_valid=cross_valid)
    return acc_bow, accuracy
def make_matlab_dataset(type='small',
                        outname="amazon_small_10p4_features.mat",
                        feat_type=0, max_words=10000):
    """Export the dvd/books/electronics/kitchen datasets to a ``.mat`` file.

    The four dataset paths are resolved with ``get_dataset_path``, parsed in
    one call to ``dataset_utils.parse_processed_amazon_dataset``, converted
    with ``count_list_to_sparse_matrix`` and, when ``feat_type > 0``,
    term-weighted. The matrices and labels are written with
    ``scipy.io.savemat`` under the variable names ``X_dvd``, ``X_boo``,
    ``X_ele``, ``X_kit`` and ``Y_dvd``, ``Y_boo``, ``Y_ele``, ``Y_kit``.

    Args:
        type: Second argument of ``get_dataset_path`` for every domain.
        outname: Path of the ``.mat`` file to write.
        feat_type: Weighting scheme for ``termweight.term_weighting``; values
            ``<= 0`` skip the weighting step.
        max_words: Forwarded to the parser as ``max_words``.

    Returns:
        None. The only output is the file written at ``outname``.
    """
    domain_names = ('dvd', 'books', 'electronics', 'kitchen')
    matrix_keys = ("X_dvd", "X_boo", "X_ele", "X_kit")
    label_keys = ("Y_dvd", "Y_boo", "Y_ele", "Y_kit")

    paths = [get_dataset_path(domain, type) for domain in domain_names]
    datasets, dico = dataset_utils.parse_processed_amazon_dataset(
        paths, max_words=max_words)

    # Each parsed entry is a (count lists, labels) pair.
    count_lists = []
    labels = []
    for path in paths:
        doc_counts, doc_labels = datasets[path]
        count_lists.append(doc_counts)
        labels.append(doc_labels)

    matrices = [count_list_to_sparse_matrix(counts, dico)
                for counts in count_lists]
    if feat_type > 0:
        matrices = [termweight.term_weighting(matrix, feat_type)
                    for matrix in matrices]

    # Matrices first, then labels, in the same domain order.
    mat_vars = dict(zip(matrix_keys, matrices))
    mat_vars.update(zip(label_keys, labels))
    scipy.io.savemat(outname, mat_vars)
def lsi_transfer(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, feat_type=2,
                 lsi_rank=100, use_original_features=False,
                 use_singular_values=True, l2normalization=False):
    """Evaluate a classifier on a low-rank embedding of both domains.

    Source and target rows are stacked, term-weighted and factorised. The
    first ``Xs.shape[0]`` rows of the embedding are treated as source
    documents and the rest as target documents, then handed to
    ``no_transfer``.

    Args:
        Xs: Source-domain matrix (sparse or dense; sparsity of ``Xs`` decides
            which ``vstack`` is used).
        Ys: Source-domain labels.
        Xt: Target-domain matrix.
        Yt: Target-domain labels.
        clf_class: Classifier class forwarded to ``no_transfer``.
        feat_type: Weighting scheme for ``termweight.term_weighting``.
        lsi_rank: Rank passed to ``LSI`` or ``n_components`` of
            ``TruncatedSVD``.
        use_original_features: When truthy, the stacked (un-weighted) rows are
            combined with the embedding via ``append_features(gamma=0.5)``.
        use_singular_values: Sparse path only. When true the left factor is
            multiplied by ``diag(sqrt(S))``; otherwise it is used unscaled.
        l2normalization: When true the embedding is passed through
            ``sklearn.preprocessing.normalize``.

    Returns:
        Whatever ``no_transfer`` returns for the embedded source/target split.
    """
    ndocs_source = Xs.shape[0]
    stack = sp.vstack if sp.issparse(Xs) else np.vstack
    X_all = stack([Xs, Xt])
    X = termweight.term_weighting(X_all, feat_type)

    if sp.issparse(X):
        U, S, _ = LSI(X, lsi_rank)
        Un = np.dot(U, np.diag(np.sqrt(S))) if use_singular_values else U
    else:
        # Dense input: use TruncatedSVD. use_singular_values is not applied
        # on this path.
        Un = TruncatedSVD(n_components=lsi_rank).fit_transform(X)

    if l2normalization:
        Un = sklearn.preprocessing.normalize(Un)

    Us = Un[:ndocs_source, :]
    Ut = Un[ndocs_source:, :]

    # Plain truthiness: an identity check against True would silently ignore
    # flags such as 1 or numpy.bool_(True).
    if use_original_features:
        # NOTE(review): the appended rows come from X_all (before term
        # weighting), not from X -- confirm this is intended.
        Us = append_features(X_all[:ndocs_source, :], Us, gamma=0.5)
        Ut = append_features(X_all[ndocs_source:, :], Ut, gamma=0.5)

    return no_transfer(Us, Ys, Ut, Yt, clf_class)
def lsi_mda(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, noise=0.9,
            feat_type=2, layer_func=lambda x: layer_function(x, 1),
            lsi_rank=100):
    """Run mDA on the stacked domains, then evaluate with ``lsi_transfer``.

    The stacked matrix is term-weighted and passed (transposed) to
    ``denoising_autoencoders.mDA``. The mDA output is transposed back, split
    into source and target rows, and handed to
    ``domain_adaptation_baseline.lsi_transfer`` with ``feat_type=0``.

    Args:
        Xs: Source-domain document/term matrix.
        Ys: Source-domain labels.
        Xt: Target-domain document/term matrix.
        Yt: Target-domain labels.
        clf_class: Classifier class forwarded to ``lsi_transfer``.
        noise: Second positional argument of ``mDA``.
        feat_type: Weighting scheme for ``term_weighting``.
        layer_func: Callable forwarded to ``mDA`` as ``layer_func``.
        lsi_rank: Forwarded to ``lsi_transfer`` as ``lsi_rank``.

    Returns:
        Whatever ``domain_adaptation_baseline.lsi_transfer`` returns.
    """
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])
    X = term_weighting(X_all, feat_type)

    # Columns are chosen on the un-weighted stack, but the reduced matrix is
    # sliced from the weighted X so that Xr uses the same weighting as the
    # matrix given to mDA.
    word_selected = get_most_frequent_features(X_all, 5000)
    Xdw_most_frequent = X[:, word_selected]

    hx, _ = denoising_autoencoders.mDA(
        X.T, noise, layer_func=layer_func, Xr=Xdw_most_frequent.T,
        reg_lambda=1e-2)

    X_all_dafeatures = hx.T
    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    Xt_mda = X_all_dafeatures[ndocs_source:, :]

    # feat_type=0: the mDA features are passed on without requesting a
    # weighting scheme again.
    return domain_adaptation_baseline.lsi_transfer(
        Xs_mda, Ys, Xt_mda, Yt, clf_class, feat_type=0, lsi_rank=lsi_rank)
def fit(self):
    """Fit ``self.clf`` on a low-rank embedding of the source documents.

    ``self.Xs`` and ``self.Xt`` are stacked, term-weighted with
    ``self.feat_type`` and factorised at rank ``self.k``. Only the first
    ``self.Xs.shape[0]`` rows of the embedding (the source documents) are
    used to train ``self.clf`` against ``self.Ys``.

    Side effects:
        On the sparse path ``self.V`` and ``self.S`` are set from the
        ``LSI`` result. They are NOT set on the dense (``TruncatedSVD``)
        path.

    Returns:
        None.
    """
    ndocs_source = self.Xs.shape[0]
    if sp.issparse(self.Xs):
        X_all = sp.vstack([self.Xs, self.Xt])
    else:
        X_all = np.vstack([self.Xs, self.Xt])
    X = termweight.term_weighting(X_all, self.feat_type)

    if sp.issparse(X):
        U, S, V = LSI(X, self.k)
        self.V = V
        self.S = S
        if self.use_singular_values:
            Un = np.dot(U, np.diag(np.sqrt(S)))
        else:
            Un = U
    else:
        # Dense input: use TruncatedSVD. self.use_singular_values is not
        # applied on this path.
        svd = TruncatedSVD(n_components=self.k)
        Un = svd.fit_transform(X)

    if self.l2norm:
        # Keep the return value: copy=False only *tries* to work in place, so
        # relying on the side effect can silently skip the normalisation.
        Un = sklearn.preprocessing.normalize(Un, norm='l2', copy=False)

    Us = Un[:ndocs_source, :]

    # Plain truthiness: an identity check against True would silently ignore
    # flags such as 1 or numpy.bool_(True).
    if self.use_original_features:
        # NOTE(review): the appended rows come from X_all (before term
        # weighting), not from X -- confirm this is intended.
        Xs_n = X_all[:ndocs_source, :]
        Us = append_features(Xs_n, Us, gamma=0.5)

    self.clf.fit(Us, self.Ys)
def msda_classifier(Xs, Ys, Xt, Yt, noise=0.9, feat_type=0, score='AUC',
                    clf=None, layer_func=np.tanh, self_learning=False):
    """Refine source-classifier predictions on the target domain with mDA.

    A classifier is fitted on the source data, its ``predict_proba`` output on
    the target data is appended to the target features, and the augmented
    matrix (transposed) is passed to ``denoising_autoencoders.mDA``. The last
    column(s) of the mDA output are read back as the refined prediction.

    Args:
        Xs: Source-domain feature matrix.
        Ys: Source-domain labels.
        Xt: Target-domain feature matrix.
        Yt: Target-domain labels, used for scoring only.
        noise: Second positional argument of ``mDA``.
        feat_type: ``8`` applies ``term_weighting`` to each domain
            independently; ``2`` fits a ``TfidfTransformer`` on ``Xs`` and
            applies it to ``Xt``; any other value leaves the inputs untouched.
        score: ``'AUC'`` returns ROC-AUC values (binary problems only); any
            other value returns accuracies.
        clf: Optional estimator exposing ``fit``, ``score``, ``predict`` and
            ``predict_proba``. It is fitted on the source data. When ``None``
            a ``LogisticRegression`` is selected through
            ``domain_adaptation_baseline.cross_validate_classifier``.
        layer_func: Callable forwarded to ``mDA`` as ``layer_func``.
        self_learning: When true, a second classifier is cross-validated on
            the target data using the first classifier's predictions as
            labels, and its target score replaces the baseline accuracy. The
            probabilities appended to the target features and the baseline
            AUC still come from the first classifier.

    Returns:
        ``(baseline_AUC, model_AUC)`` when ``score == 'AUC'`` on a binary
        problem, otherwise ``(no_transfer_acc, m_score)``.

    Raises:
        NotImplementedError: ``score == 'AUC'`` with more than two classes.
    """
    if feat_type == 8:
        Xsn = term_weighting(Xs, feat_type)
        Xtn = term_weighting(Xt, feat_type)
    elif feat_type == 2:
        tfidf_trans = TfidfTransformer()
        Xsn = tfidf_trans.fit_transform(Xs)
        Xtn = tfidf_trans.transform(Xt)
    else:
        Xsn = Xs
        Xtn = Xt

    # Compare against None explicitly: evaluating the truthiness of an
    # estimator may call its __len__, which can fail or be zero.
    if clf is None:
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xsn, Ys, sklearn.linear_model.LogisticRegression, n_jobs=5)
    else:
        clf_cv = clf.fit(Xsn, Ys)

    no_transfer_acc = clf_cv.score(Xtn, Yt)
    proba = clf_cv.predict_proba(Xtn)
    nclasses = proba.shape[1]
    multiclass = nclasses > 2

    # Fail before the expensive mDA step instead of after it.
    if multiclass and score == 'AUC':
        raise NotImplementedError(
            "AUC scoring is not implemented for multiclass problems")

    # Label values in predict_proba column order. Captured now because
    # clf_cv may be replaced by the self-learning step below. Ignored when
    # the estimator does not expose a matching classes_ attribute.
    classes = getattr(clf_cv, "classes_", None)
    if classes is not None and len(classes) != nclasses:
        classes = None

    if multiclass:
        # TODO: try appending/reconstructing per class.
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)
    else:
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        Xt_augment = domain_adaptation_baseline.append_features(Xtn,
                                                                vect_prob)

    if self_learning:
        Ytpred = clf_cv.predict(Xtn)
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xtn, Ytpred, sklearn.linear_model.LogisticRegression)
        no_transfer_acc = clf_cv.score(Xtn, Yt)

    hw, _ = denoising_autoencoders.mDA(Xt_augment.T, noise, 0.05,
                                       layer_func=layer_func)
    h = hw.T

    if multiclass:
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.asarray(np.argmax(hy_reconstruction, axis=1)).ravel()
        if classes is not None:
            # Translate column indices back to label values so labels that
            # are not 0..k-1 are scored correctly.
            y_pred = np.asarray(classes)[y_pred]
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        return (no_transfer_acc, m_score)

    # Binary case: the last reconstructed column is the refined score for the
    # label in the second predict_proba column.
    refined = h[:, -1]
    y_pred = np.asarray(refined > 0.5).ravel()
    if classes is not None:
        # Map False/True to the actual label values (e.g. -1/+1) rather than
        # comparing Yt against raw booleans.
        y_pred = np.asarray(classes)[y_pred.astype(int)]
    m_score = sklearn.metrics.accuracy_score(Yt, y_pred)

    model_AUC = sklearn.metrics.roc_auc_score(Yt, refined)
    baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
    # Single pre-formatted string: prints the same text on Python 2 and 3.
    print("AUC %s %s" % (baseline_AUC, model_AUC))

    if score == 'AUC':
        return (baseline_AUC, model_AUC)
    return (no_transfer_acc, m_score)