def PreprocessingRBM(self, components, MNE_coefficients, N_neighbors):
    """
    :type MNE_coefficients: int
    :param components: number of RBM hidden components
    :param MNE_coefficients: number of coefficients for the manifold embedding
    :param N_neighbors: number of neighbors for the embedding
    """
    self.MNE_coefficients = MNE_coefficients
    self.N_neighbors = N_neighbors
    self.rbm = neural_network.BernoulliRBM(n_components=components,
                                           learning_rate=0.05,
                                           batch_size=10,
                                           n_iter=100,
                                           verbose=0,
                                           random_state=0)
    self.Embedding = manifold.SpectralEmbedding(
        n_components=self.MNE_coefficients,
        affinity='nearest_neighbors',
        gamma=None,
        random_state=0,
        n_neighbors=self.N_neighbors)
    self.X_rbm = self.rbm.fit_transform(self.Waves_Coefficients)
    self.X_red = self.Embedding.fit_transform(self.X_rbm)
    return self.X_red
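# A minimal, self-contained sketch of the same RBM -> spectral-embedding
# chain as PreprocessingRBM above, on synthetic data. The input shape and
# values are assumptions; BernoulliRBM expects features in [0, 1].
import numpy as np
from sklearn import manifold, neural_network

rng = np.random.RandomState(0)
waves = rng.rand(200, 64)  # hypothetical stand-in for self.Waves_Coefficients

rbm = neural_network.BernoulliRBM(n_components=32, learning_rate=0.05,
                                  batch_size=10, n_iter=100, random_state=0)
embedding = manifold.SpectralEmbedding(n_components=3,
                                       affinity='nearest_neighbors',
                                       random_state=0, n_neighbors=10)

x_rbm = rbm.fit_transform(waves)        # hidden-unit activations
x_red = embedding.fit_transform(x_rbm)  # low-dimensional coordinates
print(x_red.shape)                      # (200, 3)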
out_log.flush()

#
# looping over all parameter combinations
for n_hidden in n_hidden_values:
    for l_rate in learning_rate_values:
        for batch_size in batch_size_values:
            for n_iters in n_iter_values:
                logging.info('Learning RBM for {} {} {} {}'.format(
                    n_hidden, l_rate, batch_size, n_iters))

                #
                # learning
                rbm = neural_network.BernoulliRBM(
                    n_components=n_hidden,
                    learning_rate=l_rate,
                    batch_size=batch_size,
                    n_iter=n_iters,
                    verbose=args.verbose - 1,
                    random_state=rand_gen)

                fit_s_t = perf_counter()
                rbm.fit(train)
                fit_e_t = perf_counter()
                logging.info('Trained in {} secs'.format(fit_e_t - fit_s_t))

                #
                # evaluating training
                eval_s_t = perf_counter()
                train_plls = rbm.score_samples(train)
                eval_e_t = perf_counter()
                train_avg_pll = numpy.mean(train_plls)
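# Hedged sketch: the four nested loops above can be flattened with
# itertools.product; score_samples returns a per-sample pseudo-log-likelihood,
# summarized here by its mean. The value lists and binary data are assumptions.
import itertools
from time import perf_counter
import numpy
from sklearn import neural_network

rand = numpy.random.RandomState(1337)
train = (rand.rand(500, 30) > 0.5).astype(float)

for n_hidden, l_rate, batch_size, n_iters in itertools.product(
        [64, 128], [0.1, 0.01], [10, 20], [10, 20]):
    rbm = neural_network.BernoulliRBM(n_components=n_hidden,
                                      learning_rate=l_rate,
                                      batch_size=batch_size,
                                      n_iter=n_iters,
                                      random_state=0)
    s_t = perf_counter()
    rbm.fit(train)
    print('fit in {:.2f}s, mean train PLL {:.4f}'.format(
        perf_counter() - s_t, numpy.mean(rbm.score_samples(train))))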
# In Sample R2
ens7_insample_pred = ens7.predict(df)
print(r2_score(train.y, ens7_insample_pred))

#
# Predict
ens7_pred = ens7.predict(df_test)

# LB:
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y = ens7_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_nn_mlp.csv',
                  index=False)

print("Ensemble Model 8: neural_network.BernoulliRBM")
# BernoulliRBM is an unsupervised transformer with no predict() method, so
# the original fit/predict calls would fail. Assumed intent: use the RBM as
# a feature extractor feeding a regressor (LinearRegression chosen here).
from sklearn.pipeline import Pipeline
from sklearn import linear_model
ens8 = Pipeline([('rbm', neural_network.BernoulliRBM(n_components=256,
                                                     learning_rate=0.1,
                                                     batch_size=10,
                                                     n_iter=10,
                                                     verbose=0,
                                                     random_state=None)),
                 ('reg', linear_model.LinearRegression())])
ens8.fit(df, train.y)

# In Sample R2
ens8_insample_pred = ens8.predict(df)
print(r2_score(train.y, ens8_insample_pred))

#
# Predict
ens8_pred = ens8.predict(df_test)

# LB:
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y = ens8_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_nn_rbm.csv',
                  index=False)
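# Hedged sketch of the RBM-plus-regressor pipeline assumed in the fix above,
# on synthetic data. With a regressor as the final step, Pipeline.score
# returns the same R^2 that r2_score computes.
import numpy as np
from sklearn import linear_model, neural_network
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.rand(200, 16)                    # assumed features scaled to [0, 1]
y = X.sum(axis=1) + rng.randn(200) * 0.1

pipe = Pipeline([('rbm', neural_network.BernoulliRBM(n_components=8,
                                                     random_state=0)),
                 ('reg', linear_model.LinearRegression())])
pipe.fit(X, y)
print(r2_score(y, pipe.predict(X)))      # in-sample R2
print(pipe.score(X, y))                  # same value via Pipeline.score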
def predict(train_list, train_result, test_list, method_list, **kwargs):
    def fit_predict_each_output(model, target):
        __predict_result = []
        for idx in range(np.size(target, 1)):
            model.fit(train_list, target[:, idx])
            __predict_result.append(model.predict(test_list))
        return np.transpose(np.asarray(__predict_result))

    def fit_predict(model, target):
        model.fit(train_list, target)
        return model.predict(test_list)

    from_bins_idx = kwargs["from_bins_idx"]
    to_bins_idx = kwargs["to_bins_idx"]
    _binned_train_result = to_bins_idx(train_result)
    _predict_result = []

    if "current" in method_list:
        rbm = neural_network.BernoulliRBM(n_components=512, verbose=False,
                                          n_iter=100, learning_rate=1e-2,
                                          random_state=0)
        # BernoulliRBM is unsupervised: fit on the training data only (the
        # original re-fit on test_list, discarding the first fit, and then
        # referenced an undefined __predict_result). Assumption: the intent
        # was to return the RBM's hidden representation of the test set.
        rbm.fit(train_list)
        _predict_result.append(rbm.transform(test_list))
    elif "knn" in method_list:
        _ = knn_predict(train_list, _binned_train_result, test_list,
                        k=kwargs["k"])
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "dt" in method_list:
        _ = fit_predict(tree.DecisionTreeClassifier(max_depth=kwargs["max_depth"]),
                        _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "rf" in method_list:
        _ = fit_predict(
            ensemble.RandomForestClassifier(n_estimators=kwargs["n_estimators"],
                                            max_depth=kwargs["max_depth"],
                                            n_jobs=kwargs["n_jobs"]),
            _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "average" in method_list:
        _ = average_predict(train_result, test_list)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "adaboost" in method_list:
        _ = fit_predict_each_output(ensemble.AdaBoostClassifier(),
                                    _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "ridge" in method_list:
        _ = fit_predict_each_output(linear_model.RidgeClassifier(),
                                    _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "linear" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.LinearRegression(), train_result))
    elif "huber" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.HuberRegressor(), train_result))
    elif "theilsen" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.TheilSenRegressor(), train_result))
    elif "lasso" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.Lasso(), train_result))
    elif "par" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.PassiveAggressiveRegressor(C=kwargs["par_C"],
                                                    epsilon=kwargs["par_eps"]),
            train_result))
    elif "ridge_reg" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.Ridge(), train_result))
    elif "dt_reg" in method_list:
        _predict_result.append(fit_predict(
            tree.DecisionTreeRegressor(max_depth=kwargs["max_depth"]),
            train_result))
    elif "rf_reg" in method_list:
        _predict_result.append(fit_predict(
            ensemble.RandomForestRegressor(max_depth=kwargs["max_depth"],
                                           n_jobs=kwargs['n_jobs'],
                                           n_estimators=kwargs['n_estimators']),
            train_result))
    elif "xgboost" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBClassifier(max_depth=kwargs["max_depth"],
                              n_estimators=kwargs['n_estimators'],
                              nthread=kwargs["nthread"]),
            _binned_train_result))
    elif "xgboost_reg" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBRegressor(max_depth=kwargs["max_depth"],
                             n_estimators=kwargs['n_estimators'],
                             nthread=kwargs["nthread"]),
            train_result))
    elif "svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.SVR(C=kwargs["C"], epsilon=kwargs["epsilon"]), train_result))
    elif "linear_svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.LinearSVR(C=kwargs["C"], epsilon=kwargs["epsilon"]),
            train_result))
    else:
        assert False, "invalid method"

    return np.asarray(_predict_result)
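# Hedged usage sketch for the dispatcher above. The binning helpers are
# hypothetical identity functions: a regression method such as "ridge_reg"
# ignores the binned targets, so identity stand-ins suffice.
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(100, 8)
Y_train = rng.rand(100, 2)               # two output columns
X_test = rng.rand(20, 8)

preds = predict(X_train, Y_train, X_test, ["ridge_reg"],
                from_bins_idx=lambda a: a,
                to_bins_idx=lambda a: a)
print(preds.shape)                       # (1, 20, 2)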
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=numfeatures,
                                 min_df=2,
                                 stop_words='english',
                                 ngram_range=(1, 2),
                                 use_idf=True)
    text = []
    with open(sample_file, 'r') as f:
        text = f.readlines()

    labels = []
    with open('hygiene.dat.labels', 'r') as f:
        labels = f.readlines()
    labels = map(str.strip, labels)

    text_reference = pd.read_csv(
        'hygiene.dat.additional',
        names=['cuisines', 'zip_code', 'number_of_reviews', 'average_rating'],
        header=None)

    label_encoder = preprocessing.LabelEncoder()
    transformed_zip_codes = label_encoder.fit_transform(
        text_reference['zip_code'])

    cuisines_dict_list = []
    for cuisine_list in text_reference['cuisines']:
        a_cuisines_dict = {}
        for a_cuisine in cuisine_list.replace('[', '').replace(
                ']', '').strip().split(','):
            a_cuisines_dict[a_cuisine.strip()] = 1
        cuisines_dict_list.append(a_cuisines_dict)

    vec = DictVectorizer()
    test_cuisine_features = vec.fit_transform(cuisines_dict_list).toarray()
    test_cuisine_features = pd.DataFrame(test_cuisine_features)
    total_cuisine_features = len(vec.get_feature_names())

    cuisines_positive_corr = {}
    for i in range(total_cuisine_features):
        # pearsonr returns (coefficient, p-value); test the coefficient only
        # (the original compared the whole tuple against 0)
        pearsons_coeff, _p_value = scipy.stats.pearsonr(
            map(int, test_cuisine_features[i][:546]),
            map(int, labels[:546]))
        if pearsons_coeff > 0:
            print vec.get_feature_names()[i]
            cuisines_positive_corr[vec.get_feature_names()[i]] = 1

    # Now consider only cuisines with positive correlation
    cuisines_dict_list = []
    for cuisine_list in text_reference['cuisines']:
        a_cuisines_dict = {}
        for a_cuisine in cuisine_list.replace('[', '').replace(
                ']', '').strip().split(','):
            if a_cuisine.strip() in cuisines_positive_corr:
                a_cuisines_dict[a_cuisine.strip()] = 1
        cuisines_dict_list.append(a_cuisines_dict)
    transformed_cuisines_features = vec.fit_transform(
        cuisines_dict_list).toarray()

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    X = vectorizer.fit_transform(text)
    print("n_samples: %d, n_features: %d" % X.shape)

    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus,
                                   num_topics=K_clusters,
                                   id2word=id2words,
                                   iterations=1000)

    doc_topics_list = lda.get_document_topics(corpus)
    # lil_matrix supports efficient element-wise assignment (csr does not)
    doc_topics = sparse.lil_matrix((13299, K_clusters))
    for i, doc_a in enumerate(doc_topics_list):
        for (my_topic_a, weight_a) in doc_a:
            doc_topics[i, my_topic_a] = weight_a

    zip_codes_sparse = sparse.csr_matrix(transformed_zip_codes).transpose()
    num_of_reviews_sparse = sparse.csr_matrix(
        text_reference['number_of_reviews']).transpose()
    avg_rating_sparse = sparse.csr_matrix(
        text_reference['average_rating']).transpose()
    doc_topics_features = sparse.hstack([
        doc_topics, transformed_cuisines_features, zip_codes_sparse,
        num_of_reviews_sparse, avg_rating_sparse
    ]).todense()

    #output_text = []
    #for i, item in enumerate(lda.show_topics(num_topics=K_clusters,
    #                                         num_words=num_display_words,
    #                                         formatted=False)):
    #    output_text.append("Topic: " + str(i))
    #    for weight, term in item:
    #        output_text.append(term + " : " + str(weight))
    #print "writing topics to file:", outputfile
    #with open(outputfile, 'w') as f:
    #    f.write('\n'.join(output_text))

    # BernoulliRBM is an unsupervised transformer with no predict() method;
    # chain it with a logistic regression so that fit/predict and the
    # f1_macro cross-validation below work (assumed intent).
    from sklearn.pipeline import Pipeline
    from sklearn import linear_model
    clf = Pipeline([('rbm', neural_network.BernoulliRBM()),
                    ('logistic', linear_model.LogisticRegression())])

    train_docs_vectors = doc_topics_features[:546]
    train_labels = labels[:546]
    train_labels = map(int, train_labels)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_docs_vectors, train_labels, test_size=0.3, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    print(
        cross_validation.cross_val_score(clf,
                                         train_docs_vectors,
                                         train_labels,
                                         cv=20,
                                         scoring="f1_macro").mean())

    test_docs_vectors = doc_topics_features[546:]
    predicted_labels = (clf.predict(test_docs_vectors)).tolist()
    output_text = ["CG"]
    for label in predicted_labels:
        output_text.append(str(label))
    print "writing predicted labels to file: competition.txt"
    with open('competition.txt', 'w') as f:
        f.writelines("%s\n" % item for item in output_text)
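# Aside: gensim can densify the topic distributions directly, replacing the
# element-wise sparse assignment above. This sketch reuses lda, corpus, and
# K_clusters from the function; topics that get_document_topics omits stay
# 0.0, matching the loop's behavior.
from gensim import matutils

doc_topics_dense = matutils.corpus2dense(lda.get_document_topics(corpus),
                                         num_terms=K_clusters).T
# rows = documents, columns = topic weights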
                      title="Non-Hinge dir")
images = getImages(hingeDir, nonhingeDir)  # Getting training data
print "Got the images!"
# print zip(*images[1])
shuffle(images)
# print zip(*images[1])
# Split images into training and testing sets
trainingImages = images[len(images) // 2:]
testingImages = images[:len(images) // 2]

# classifier = svm.SVC(gamma=userGamma, tol=tolerence)
print "Initializing NN!"
# classifier = neural_network.MLPClassifier(hidden_layer_sizes=(400, 300, 200, 100),
#                                           solver="lbfgs", max_iter=10000,
#                                           alpha=.0001, activation="tanh",
#                                           verbose=False)
# classifier = tree.DecisionTreeClassifier(criterion="gini", splitter="best",
#                                          max_features="auto")
logistic = linear_model.LogisticRegression()
rbm = neural_network.BernoulliRBM()

print("Starting to fit, hold on tight!")
# print zip(*trainingImages)[1]
classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
rbm.learning_rate = 0.03
rbm.n_iter = 15
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(zip(*trainingImages)[0], zip(*trainingImages)[1])
# Training Logistic regression
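# The attribute writes on rbm/logistic above work because the Pipeline holds
# references to those same estimator objects; set_params with step__param
# names is the more explicit sklearn route. A self-contained sketch:
from sklearn import linear_model, neural_network
from sklearn.pipeline import Pipeline

classifier = Pipeline(steps=[('rbm', neural_network.BernoulliRBM()),
                             ('logistic', linear_model.LogisticRegression())])
classifier.set_params(rbm__learning_rate=0.03,
                      rbm__n_iter=15,
                      rbm__n_components=100,
                      logistic__C=6000.0)
print(classifier.get_params()['rbm__n_components'])  # 100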
def dtc(self):
    # Flatten y into 1-D lists: self.y01_train, self.y01_test
    self.y01_train = list()
    self.y01_test = list()
    for a in range(len(self.y_train)):
        self.y01_train.append(self.y_train[a][0])
    for b in range(len(self.y_test)):
        self.y01_test.append(self.y_test[b][0])

    # Read n_components
    if self.nc_edit.text().strip():
        nc = int(self.nc_edit.text())
    else:
        nc = 256
    # Read learning_rate
    if self.le_edit.text().strip():
        le = float(self.le_edit.text())
    else:
        le = 0.1
    # Read batch_size
    if self.ba_edit.text().strip():
        ba = int(self.ba_edit.text())
    else:
        ba = 10
    # Read n_iter
    if self.ni_edit.text().strip():
        ni = int(self.ni_edit.text())
    else:
        ni = 10
    # Read verbose
    if self.ve_edit.text().strip():
        ve = int(self.ve_edit.text())
    else:
        ve = 0

    # Build the BernoulliRBM model (the original comment said "LDA",
    # but the code constructs an RBM)
    self.clf = neural_network.BernoulliRBM(batch_size=ba,
                                           learning_rate=le,
                                           n_components=nc,
                                           n_iter=ni,
                                           random_state=None,
                                           verbose=ve)
    # Train the model
    self.clf.fit(self.x_train)
    # Transform x of the training set
    self.train_x = self.clf.transform(self.x_train)
    # Transform x of the test set
    self.test_x = self.clf.transform(self.x_test)

    '''
    Configure the dtable_train widget: display the transformed training-set rows
    '''
    # Set the row and column counts of the table
    self.dtable_train.setRowCount(len(self.train_x))
    self.dtable_train.setColumnCount(len(self.train_x[0]))
    for s in range(len(self.train_x)):
        # Highlight every other row (the original tested s / 2.0 == 0, which
        # is only true for s == 0; s % 2 == 0 is the likely intent)
        if s % 2 == 0:
            for s01 in range(len(self.train_x[0])):
                self.dtable_train.setItem(
                    s, s01,
                    QtGui.QTableWidgetItem(str(self.train_x[s][s01])))
                self.dtable_train.item(s, s01).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
        else:
            for s01 in range(len(self.train_x[0])):
                self.dtable_train.setItem(
                    s, s01,
                    QtGui.QTableWidgetItem(str(self.train_x[s][s01])))

    '''
    Configure the dtable_test widget: display the transformed test-set rows
    '''
    # Set the row and column counts of the table
    self.dtable_test.setRowCount(len(self.test_x))
    self.dtable_test.setColumnCount(len(self.test_x[0]))
    for s in range(len(self.test_x)):
        if s % 2 == 0:
            for s01 in range(len(self.test_x[0])):
                self.dtable_test.setItem(
                    s, s01,
                    QtGui.QTableWidgetItem(str(self.test_x[s][s01])))
                self.dtable_test.item(s, s01).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
        else:
            for s01 in range(len(self.test_x[0])):
                self.dtable_test.setItem(
                    s, s01,
                    QtGui.QTableWidgetItem(str(self.test_x[s][s01])))
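# Hedged note: BernoulliRBM models binary visible units, so its inputs should
# lie in [0, 1]. A sketch of min-max scaling (fit on the training split only)
# before fit/transform calls like those above; data shapes are assumptions.
import numpy as np
from sklearn import neural_network
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
x_train = rng.randn(100, 20) * 5.0       # assumed raw, unscaled features
x_test = rng.randn(30, 20) * 5.0

scaler = MinMaxScaler()
x_train01 = scaler.fit_transform(x_train)
x_test01 = np.clip(scaler.transform(x_test), 0.0, 1.0)  # keep test in range

rbm = neural_network.BernoulliRBM(n_components=16, random_state=0)
train_x = rbm.fit_transform(x_train01)
test_x = rbm.transform(x_test01)
print(train_x.shape, test_x.shape)       # (100, 16) (30, 16)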
# load params, transform data_train, data_test
else:
    print 'extracting features with SGVB auto-encoder, default is 10 iterations...'
    encoder = sgvb.SGVB(verbose=True)
    encoder.fit(x_train)
    SGVB_train_features = encoder.transform(data_train)
    SGVB_test_features = encoder.transform(data_test)
    print 'done'

print 'extracting features with RBM...'
n_components = 200
learning_rate = 0.01
batch_size = 100
rbm = neural_network.BernoulliRBM(n_components=n_components,
                                  learning_rate=learning_rate,
                                  batch_size=batch_size,
                                  verbose=True)
rbm.fit(x_train)
RBM_train_features = rbm.transform(data_train)
RBM_test_features = rbm.transform(data_test)
print 'done'

print 'performing logistic regression on raw data...'
LogReg_raw = linear_model.LogisticRegression()
LogReg_raw.fit(data_train, t_train)
raw_score = LogReg_raw.score(data_test, t_test)
print 'Test score on raw data = ', raw_score

print 'performing logistic regression on RBM features...'
LogReg_RBM = linear_model.LogisticRegression()
LogReg_RBM.fit(RBM_train_features, t_train)
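# Hedged, self-contained version of the raw-versus-RBM comparison above on
# sklearn's digits set (dataset choice, split, and the newer model_selection
# API are assumptions; pixels are scaled into [0, 1] for the RBM).
from sklearn import datasets, linear_model, neural_network
from sklearn.model_selection import train_test_split

digits = datasets.load_digits()
X = digits.data / 16.0
X_tr, X_te, t_tr, t_te = train_test_split(X, digits.target, random_state=0)

rbm = neural_network.BernoulliRBM(n_components=200, learning_rate=0.01,
                                  batch_size=100, n_iter=20, random_state=0)
rbm_tr = rbm.fit_transform(X_tr)
rbm_te = rbm.transform(X_te)

raw = linear_model.LogisticRegression(max_iter=1000).fit(X_tr, t_tr)
red = linear_model.LogisticRegression(max_iter=1000).fit(rbm_tr, t_tr)
print('raw test score:', raw.score(X_te, t_te))
print('RBM test score:', red.score(rbm_te, t_te))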