def test() : import LoadData trainMat, classLabelVector = LoadData.loadTrainDataFromCSV(TRAIN_FILE) testMat = LoadData.loadTestDataFromCSV(TEST_FILE) rfbenchmarkVector = LoadData.loadRFBenchmarkFromCSV(RF_BENCHMARK_FILE) columnLabels = [] for i in range(1, 785) : columnLabels.append(i) m = int(len(columnLabels) ** 0.5) # rf = createRandomForest(4, trainMat[50:], classLabelVector[50:], columnLabels, m) rf = createRandomForest(10, trainMat, classLabelVector, columnLabels, m) # testMat = trainMat[0:50] i = 0 n = 0 for testData in testMat : classList = [] for tree in rf : label = classify(tree, columnLabels, testData) classList.append(label) voteLabel = majorityCnt(classList) if voteLabel == rfbenchmarkVector[i] : n += 1 # print "the real answer is ", classLabelVector[i], "the label is ", voteLabel i += 1 print n accuracy = n / float(len(rfbenchmarkVector)) print accuracy
def BPNet(file_name):
    """Train a small BP neural net on pervious/impervious pixel samples and
    return a black-and-white classification image the size of `file_name`.

    NOTE(review): the samples produced by LoadData/RD.sample are immediately
    overwritten by the .mat files loaded right after -- presumably a cached
    sample set; confirm whether the sampling lines are still needed.
    """
    rengong_filename = r'C:\Users\Administrator\Desktop\yanshan_rengong1.tif'
    P = []  # training inputs (RGB triples)
    T = []  # training targets: 1 for 'butoushui', 0 for 'toushui' samples
    butoushui_P = LoadData(1, file_name, rengong_filename)
    butoushui_P = RD.sample(butoushui_P, 2000)
    # Overrides the freshly sampled data with the cached .mat samples.
    butoushui_P = sio.loadmat('../JadeLibSVM/' + 'butoushui_P.mat')['butoushui_P']
    M = len(butoushui_P)
    P = butoushui_P
    P = butoushui_P.tolist()
    T = [1] * M
    print M
    toushui_P = LoadData(0, file_name, rengong_filename)
    toushui_P = RD.sample(toushui_P, 2000)
    toushui_P = sio.loadmat('../JadeLibSVM/' + 'toushui_P.mat')['toushui_P']
    M = len(toushui_P)
    P.extend(toushui_P)
    # Variable is reused here as the label list for the second class.
    toushui_P = [0] * M
    T.extend(toushui_P)
    print M
    # 3 inputs (RGB) -> 2 hidden -> 1 output, tanh activations.
    nn = NeuralNetwork([3, 2, 1], 'tanh')
    nn.fit(P, T, 0.01, 5000)
    print('**************训练结束****************')
    p_test = extract_Yanshan('')
    predict_label = []
    for i in p_test:
        predict_label.append(nn.predict(i)[0])
    pic = array(Image.open(file_name))
    X = pic.shape[0]
    Y = pic.shape[1]
    P = pic.shape[2]
    Test_data = np.zeros((X * Y, 3), dtype='double')
    k = 0
    # Flatten the image into one RGB row per pixel.
    for i in range(X):
        for j in range(Y):
            Test_data[k, 0] = pic[i, j, 0]
            Test_data[k, 1] = pic[i, j, 1]
            Test_data[k, 2] = pic[i, j, 2]
            k = k + 1
    result = np.zeros((X, Y, 3))  # RGB color image
    # Threshold each prediction at 0.5 (R, G, B components set together).
    for k in range(X * Y):
        if (predict_label[k] >= 0.5):
            Test_data[k, 0] = 1
            Test_data[k, 1] = 1
            Test_data[k, 2] = 1  # white
        elif (predict_label[k] < 0.5):
            Test_data[k, 0] = 0
            Test_data[k, 1] = 0
            Test_data[k, 2] = 0  # black
    k = 0
    # Un-flatten the thresholded rows back to an X-by-Y image.
    for i in range(X):
        for j in range(Y):
            result[i, j, 0] = Test_data[k, 0]
            result[i, j, 1] = Test_data[k, 1]
            result[i, j, 2] = Test_data[k, 2]
            k = k + 1
    return result
def for_altair(stat_data):
    """Attach a numeric ISO country code to each row and persist the table.

    :param stat_data: dataframe with a 'country' column
    :return: the same dataframe with an added 'CountryCode' column
    """
    def country_search(name):
        # Exact lookup first; fall back to pycountry's fuzzy search when
        # the name is not known verbatim.
        exact = pycountry.countries.get(name=name)
        if exact is not None:
            return exact.numeric
        return pycountry.countries.search_fuzzy(name)[0].numeric

    print('country loop')
    stat_data['CountryCode'] = np.vectorize(country_search)(stat_data['country'])
    DataLoader.save_to_sql(stat_data, "final_data")
    return stat_data
def process_all_genre():
    """Build the per-genre TF table, join in TF-IDF weights, and save the model CSV."""
    tf_rows = []
    tfidf_rows = []
    for genre in __get_all_genre_list__():
        # Collect TF entries, tagging each row with its genre.
        for entry in __get_tf_for_genre__(genre):
            row = {'genre': genre}
            for key, value in entry.iteritems():
                row[key] = value
            tf_rows.append(row)
        # Same for the TF-IDF entries.
        for entry in __get_tfidf_for_genre__(genre):
            row = {'genre': genre}
            for key, value in entry.iteritems():
                row[key] = value
            tfidf_rows.append(row)
    tf_frame = Data.pd.DataFrame(tf_rows)
    tfidf_frame = Data.pd.DataFrame(tfidf_rows)
    # Look up each tag's tf-idf weight from the companion frame.
    tf_frame['tfidfweight'] = tf_frame.apply(
        lambda row: tfidf_frame[tfidf_frame['tag'] == row['tag']].iloc[0].tfidfweight,
        axis=1)
    Data.save_df(tf_frame, 'Genre-Model.csv')
def process_user_model():
    """Build the TF/TF-IDF user model for at most the first 1000 unique users."""
    tf_rows = []
    tfidf_rows = []
    processed = 0
    for user_id in Data.ml_ratings['userid'].unique():
        if processed < 1000:
            for entry in __get_tf_info__(user_id):
                row = {}
                for key, value in entry.iteritems():
                    row[key] = value
                tf_rows.append(row)
            for entry in __get_tfidf_info__(user_id):
                row = {}
                for key, value in entry.iteritems():
                    row[key] = value
                tfidf_rows.append(row)
            processed += 1
    tf_frame = Data.pd.DataFrame(tf_rows)
    tfidf_frame = Data.pd.DataFrame(tfidf_rows)
    # Join each tag's tf-idf weight into the TF frame.
    tf_frame['tfidfweight'] = tf_frame.apply(
        lambda row: tfidf_frame[tfidf_frame['tag'] == row['tag']].iloc[0].tfidfweight,
        axis=1)
    Data.save_df(tf_frame, 'User-Model.csv')
def Train_Model():
    """Train the small dense network and report loss/accuracy after every epoch."""
    model = NNModel(0.001)
    # Input is 28 * 28 == 784 flattened pixels, output has 2 classes.
    model.add_layer(28 * 28, 10, Func.relu)
    model.add_layer(10, 10, Func.relu)
    model.add_layer(10, 10, Func.relu)
    model.add_layer(10, 2, Func.identiti)
    model.add_loss_function(Func.cross_entropy_loss)

    epoch = 1000
    examples = 1000
    batch = 10
    x, y = LoadData.load_next_batch(examples, 0)
    x_test, y_test = LoadData.load_next_batch(examples, examples)

    for i in range(epoch):
        # One pass over the training examples in mini-batches.
        train_loss = 0
        for start in range(0, examples - batch, batch):
            train_loss += model.train(x[start:start + batch], y[start:start + batch])
        print('Train loss is at:', train_loss)

        # Evaluate on the held-out slice, one sample at a time.
        loss = 0
        correct_guesses = 0
        for j in range(examples):
            sample, target = x_test[j], y_test[j]
            output = Func.sigm.f(model.forward_pass(sample))
            loss += -np.sum(target * np.log(output))
            if np.argmax(target) == np.argmax(output):
                correct_guesses += 1
        print('Loss after', i + 1, ':', loss)
        print('Correct guesses after ', i + 1, ':', correct_guesses / examples)
def main():
    """CLI entry point: run beta-IC k-center clustering on the selected domain."""
    global domain
    global domain_distance
    global distance_file
    if len(sys.argv) != 4:
        print(
            "python testBetaIC.py <k> <beta> <domain number> \n Domain num: \n 0 : accident, 1: sanitation, 2: crime, 3: adult"
        )
        return
    k = int(sys.argv[1])  #50
    beta = float(sys.argv[2])
    domain_num = int(sys.argv[3])
    domain = domain_arr[domain_num]
    domain_distance = distance_arr[domain_num]
    print(domain + " " + domain_distance)

    loader = LoadData(domain)
    G = loader.readFile()

    # Reuse a cached pairwise-distance file when one exists for this domain.
    distance_file = ""
    if os.path.isfile(domain + "_distance.txt"):
        distance_file = domain + "_distance.txt"

    print("Dataset:", domain, "K = ", k, "Distance:", domain_distance,
          "beta=", beta)
    aff_array = test_Kcenter(G, k, domain, domain_distance)
    print("\n")
    print(
        "#######################################################################\n"
    )
    bs = betaStrong(domain, G, aff_array, k, beta, domain_distance,
                    distance_file)
    aff_array = bs.beta_IC()
    calculate_composition(G, k, aff_array, domain)
    del loader
def ModelInit(self, filename): Docs = LoadData.LoadDataFromFile(os.getcwd() + "/" + filename) self.D = len(Docs) print "Load ", self.D, " docs from the file" StopWordList = LoadData.LoadStopWords() WordListSet = [ Preprocess.PreprocessText(doc, StopWordList) for doc in Docs if type(doc) != unicode ] self.Dictionary = Preprocess.ConstructDictionary(WordListSet) self.W = len(self.Dictionary) print "Total number of words is: ", self.W print "Begin to save the dictionary..." self.SaveDictionary() print "Done!!" print "Begin to map the word to ID" self.IDListSet = [] inv_dict = {v: k for k, v in self.Dictionary.iteritems()} for wdl in WordListSet: IdList = Preprocess.Word2Id(wdl, inv_dict) self.IDListSet.append(IdList) print "Done!!" self.ndsum = ListUtil.Initial(self.D) self.theta = ListUtil.InitialMat(self.D, self.K, 0.0) self.phi = ListUtil.InitialMat(self.K, self.W, 0.0) self.nd = ListUtil.InitialMat(self.D, self.K, 0) self.nw = ListUtil.InitialMat(self.W, self.K, 0) self.Z = [] print "Begin to initialize the LDA model..." self.RandomAssignTopic() print "Topic assignment done!!"
def testDigits(kTup=('rbf', 10)): dataArr, labelArr = ld.loadImages('trainingDigits') b, alphas = fksmo.smoPK(dataArr, labelArr, 200, 0.0001, 10000, kTup) datMat = mat(dataArr) labelMat = mat(labelArr).transpose() svInd = nonzero(alphas.A > 0)[0] sVs = datMat[svInd] labelSV = labelMat[svInd] print "there are %d Support Vectors" % shape(sVs)[0] m, n = shape(datMat) errorCount = 0 for i in range(m): kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], kTup) predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b if sign(predict) != sign(labelArr[i]): errorCount += 1 print "the training error rate is: %f" % (float(errorCount) / m) dataArr, labelArr = ld.loadImages('testDigits') errorCount = 0 datMat = mat(dataArr) labelMat = mat(labelArr).transpose() m, n = shape(datMat) for i in range(m): kernelEval = fksmo.kernelTrans(sVs, datMat[i, :], kTup) predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b if sign(predict) != sign(labelArr[i]): errorCount += 1 print "the test error rate is: %f" % (float(errorCount) / m)
def testRbf(k1=1.3): dataArr, labelArr = ld.loadDataSet('testSetRBF.txt') b, alphas = fksmo.smoPK(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) #C=200 important datMat = mat(dataArr) labelMat = mat(labelArr).transpose() svInd = nonzero(alphas.A > 0)[0] sVs = datMat[svInd] #get matrix of only support vectors labelSV = labelMat[svInd] print "there are %d Support Vectors" % shape(sVs)[0] m, n = shape(datMat) errorCount = 0 for i in range(m): kernelEval = fksom.kernelTrans(sVs, datMat[i, :], ('rbf', k1)) predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b if sign(predict) != sign(labelArr[i]): errorCount += 1 print "the training error rate is: %f" % (float(errorCount) / m) dataArr, labelArr = ld.loadDataSet('testSetRBF2.txt') errorCount = 0 datMat = mat(dataArr) labelMat = mat(labelArr).transpose() m, n = shape(datMat) for i in range(m): kernelEval = fksom.kernelTrans(sVs, datMat[i, :], ('rbf', k1)) predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b if sign(predict) != sign(labelArr[i]): errorCount += 1 print "the test error rate is: %f" % (float(errorCount) / m)
def getLogisticRegression(self):
    """Fit a logistic-regression model to the training data.

    Returns (model, loader) so callers can reuse the data loader.
    """
    loader = LoadData(self.argumentsDict)
    features, targets = loader.loadTrainingDataSet()
    trainer = TrainModels(features, targets)
    return trainer.getModelLogistic(), loader
def getDummy(self):
    """Fit the baseline (dummy) model to the training data.

    Returns (model, loader) so callers can reuse the data loader.
    """
    loader = LoadData(self.argumentsDict)
    features, targets = loader.loadTrainingDataSet()
    trainer = TrainModels(features, targets)
    return trainer.getDummy(), loader
def test(self):
    """Run the saved model over the test generator and display the accuracy in the UI."""
    load_test = LoadData()
    self.folder_test = load_test.data_test
    self.train_generator, self.x_train, self.x_valid, self.y_train, self.y_valid = load_test.loadDataTrain(
    )
    self.test_generator, self.x_test = load_test.loadDataTest(
        self.folder_test)
    model = load_model('./model.h5')
    self.test_generator.reset()
    pred = model.predict_generator(self.test_generator, verbose=1, steps=600 / 1)
    predicted_class_indices = np.argmax(pred, axis=1)
    # Map class indices back to their label names.
    labels = (self.train_generator.class_indices)
    labels = dict((v, k) for k, v in labels.items())
    prediksi = [labels[k] for k in predicted_class_indices]
    path = self.test_generator.filenames
    # The true label is embedded in each filename at fixed slice positions.
    filenames = []
    for x in range(len(path)):
        filenames.append(path[x][12:len(path[x]) - 8])
    true_pred = 0
    compare = []
    # BUG FIX: the branch was inverted -- it labelled matches "False" and
    # counted only mismatches as correct (the commented-out increment in the
    # equal branch showed the original intent).
    for x in range(len(filenames)):
        if filenames[x] == prediksi[x]:
            true_pred = true_pred + 1
            compare.append("True")
        else:
            compare.append("False")
    row = len(self.test_generator)
    list_prediksi = []
    for i in range(row):
        list_prediksi.append([filenames[i], prediksi[i], compare[i]])
    s = ''.join(prediksi[0:row])
    self.progressBar.setValue(100)
    self.txtLR.setText(s)
    persentase = (true_pred / len(filenames)) * 100
    self.lblHasil.setText("Tingkat Akurasi : %.2f%%" % (persentase))
def num_correct(net, test_set):
    """Run the net on up to six test batches and visualise the predictions.

    NOTE(review): despite the name, nothing is counted or returned here --
    the function only displays predicted points for a handful of batches.
    """
    with torch.no_grad():
        for batch_idx, sample in enumerate(test_set, 0):
            # Move the batch to the configured device (GPU when available).
            images = sample['image'].to(device)
            points = sample['points'].to(device)
            output = net(images)
            # Visualisation helpers expect CPU tensors.
            LoadData.show_batch(
                {'image': images.to('cpu'), 'points': points.to('cpu')},
                output.to('cpu'))
            if batch_idx == 5:
                break
def mainCurves():
    """Compute and graph per-neighbour accuracy curves for both datasets."""
    LoadData.warning()
    contraceptive_X, contraceptive_Y = LoadData.contraceptiveData()
    contraceptive_curve = analyzePerNeighbor(contraceptive_X, contraceptive_Y)
    wine_X, wine_Y = LoadData.wineData()
    wine_curve = analyzePerNeighbor(wine_X, wine_Y)
    graphDataCurves(contraceptive_curve, wine_curve)
def getDataSet(pDict, resdir):
    """Load the train/test data set and bundle it for the ML pipeline.

    Returns (dictML, dbkeys) where dictML carries the train/test splits and
    the results directory.
    """
    loader = LoadData(pDict)
    X_train, y_train = loader.loadTrainingDataSet()
    X_test, dbkeys = loader.loadTestDataSet()
    return (
        {
            "X_test": X_test,
            "X_train": X_train,
            "y_train": y_train,
            "resultsDirectory": resdir,
        },
        dbkeys,
    )
def main():
    """Train, score, and graph kNN on the contraceptive and wine datasets."""
    LoadData.warning()
    # -- first dataset: contraceptive --
    X1, Y1 = LoadData.contraceptiveData()
    clf1, score1, train1, test1, curve1 = analyze(X1, Y1, 40)
    print("kNN Training Score (first) After Cross Validation: {0:.2f}%".format(
        score1 * 100))
    LoadData.calc_accuracy(train1[1], clf1.predict(train1[0]),
                           test1[1], clf1.predict(test1[0]))
    # -- second dataset: wine --
    X2, Y2 = LoadData.wineData()
    clf2, score2, train2, test2, curve2 = analyze(X2, Y2)
    print(
        "kNN Training Score (second) After GridSearch Cross Validation: {0:.2f}%"
        .format(score2 * 100))
    LoadData.calc_accuracy(train2[1], clf2.predict(train2[0]),
                           test2[1], clf2.predict(test2[0]))
    graphData(curve1, curve2)
def detect_burr(data, pv, left=None, right=None, method=0, minimum_peak_distance=100):
    """Locate burrs (outlier spikes) in one PV column of `data`.

    Thresholds default to the 1.5*IQR box-plot fences; the peak-detection
    back-end is chosen with `method` (0: scipy find_peaks, 1: detect_peaks,
    2: peakutils). Detected burr rows are also dumped to 'newfile.csv'.
    """
    titleList = data.columns.values.tolist()
    if pv not in titleList:
        print("Wrong PV name, not in ", titleList)
        return
    col = titleList[titleList.index(pv)]
    sta = DisplayData.showStatistic(data)
    print("statistic data:")
    print(sta)
    # Box-plot fences: quartiles +/- 1.5*IQR define the burr thresholds.
    iqr = sta.loc['75%'][col] - sta.loc['25%'][col]
    if left is None:
        left = sta.loc['25%'][col] - 1.5 * iqr
    if right is None:
        right = sta.loc['75%'][col] + 1.5 * iqr
    print('min edge:', left, 'max edge:', right)
    burrdata = data[((data[col]) < left) | ((data[col]) > right)]
    LoadData.df2other(burrdata, 'csv', 'newfile.csv')
    y = data[col].values
    if method == 0:
        # Peak search via scipy.signal.
        peaks, _ = signal.find_peaks(y, height=right)
        plt.plot(y, 'b', lw=1)
        plt.plot(peaks, y[peaks], "+", mec='r', mew=2, ms=8)
        plt.plot(np.zeros_like(y) + right, "--", color="gray")
        plt.title("find_peaks min_height:%7f" % right)
        plt.show()
    if method == 1:
        detect_peaks(y, mph=right, mpd=minimum_peak_distance, show=True)
    if method == 2:
        print('Detect peaks with minimum height and distance filters.')
        indexes = peakutils.peak.indexes(np.array(y),
                                         thres=right / max(y),
                                         min_dist=minimum_peak_distance)
        print('Peaks are: %s' % (indexes))
        plt.plot(y, 'b', lw=1)
        for i in indexes:
            plt.plot(i, y[i], "+", mec='r', mew=2, ms=8)
        plt.plot(np.zeros_like(y) + right, "--", color="gray")
        plt.title("peakutils.peak thres:%f ,minimum_peak_distance:%d" %
                  (right, minimum_peak_distance))
        plt.show()
def main():
    """CLI entry point: load accident/vehicle data, train the chosen model, report F1."""
    #os.chdir('../') # Set working directory
    print("\nStarting program.\n")
    print("Loading data...\n")
    accidents_data = ld.AccidentsData()
    vehicles_data = ld.VehiclesData()
    merged_data = ld.MergedData(accidents_data, vehicles_data)
    # Pre-split train/test features and targets from the merged dataset.
    X_test = merged_data.get_merged_test()
    y_test = merged_data.get_target_test()
    X_train = merged_data.get_merged_train()
    y_train = merged_data.get_target_train()
    print("Available Models:\n")
    print("1. K-nearest Neighbors")
    print("2. Stochastic Gradient Descent Classifier")
    print("3. Decision Tree Classifier")
    print("4. Random Forest Classifier")
    print("5. C-Support Vector Classification")
    print("6. Logistic Regression")
    print("7. Multi-Layer Perceptron Classifier")
    print("\n")
    mode = input("Choose Training Model: ")
    print('\nTraining model...\n')
    training = tr.Training(X_train, y_train)
    if mode == "1":
        training.knnTraining()
    elif mode == "2":
        training.sgdClassifierTraining()
    elif mode == "3":
        training.decisionTreeTraining()
    elif mode == "4":
        # NOTE(review): menu item 4 is "Random Forest Classifier" but this
        # calls the SVM trainer, identical to option 5 -- looks like a
        # copy-paste slip; confirm whether a randomForestTraining() exists.
        training.supportVectorMachinesTraining()
    elif mode == "5":
        training.supportVectorMachinesTraining()
    elif mode == "6":
        training.logisticRegressionTraining()
    elif mode == "7":
        training.mlpTraining()
    else:
        print("Bye!")
        quit()
    print('Calculating prediction...')
    # 'accident_id' is an identifier, not a feature, so drop it for scoring.
    y_pred = training.model.predict(X_test.drop('accident_id', axis=1))
    print('F1 score = ', f1_score(y_test,y_pred))
def main():
    """Plot SVM learning curves and report accuracy for both datasets."""
    # ---- contraceptive dataset, polynomial kernel ----
    title = "SVM Learning Curves (Contraceptive)"
    contracept_X, contracept_Y = LoadData.contraceptiveData()
    (contraceptX_train, contraceptX_test,
     contraceptY_train, contraceptY_test) = train_test_split(
         contracept_X, contracept_Y, test_size=0.30, random_state=100)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    # change the kernel here
    clf = SVC(gamma=.001, C=1000.0, kernel='poly')
    plt, contracept_elapsed_time = plot_learning_curve(
        clf, title, contraceptX_train, contraceptY_train, (0.1, 0.5),
        cv=cv, n_jobs=4)
    print("It took SVM (Contraceptive) {0}s to train".format(
        contracept_elapsed_time))
    clf.fit(contraceptX_train, contraceptY_train)
    print(clf.score(contraceptX_train, contraceptY_train))
    tick = time()
    y_pred = clf.predict(contraceptX_test)
    print("SVM (Contraceptive) Took {0}s to test".format(time() - tick))
    print("SVM Accuracy Score (Contraceptive) was {0}%".format(
        accuracy_score(contraceptY_test, y_pred) * 100))
    plt.show()

    # ---- wine dataset, RBF kernel ----
    title = "SVM Learning Curves (Wine)"
    wine_X, wine_Y = LoadData.wineData()
    wineX_train, wineX_test, wineY_train, wineY_test = train_test_split(
        wine_X, wine_Y, test_size=0.30, random_state=100)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    # change the kernel here
    clf = SVC(gamma=.001, C=1000.0, kernel='rbf')
    plt, wine_elapsed_time = plot_learning_curve(
        clf, title, wineX_train, wineY_train, (0.1, 1.01), cv=cv, n_jobs=4)
    print("It took SVM (Wine) {0}s to train".format(wine_elapsed_time))
    clf.fit(wineX_train, wineY_train)
    print(clf.score(wineX_train, wineY_train))
    tick = time()
    y_pred = clf.predict(wineX_test)
    print("It took SVM (Wine) {0}s to test".format((time() - tick)))
    print("SVM Accuracy Score (Wine) was {0}%".format(
        accuracy_score(wineY_test, y_pred) * 100))
    plt.show()
def __init__(self, batch_size, training_dataset_folder_name, total_epochs,
             epochs_with_same_data=5, folders_at_the_same_time=20,
             to_avoid=None, enable_telegram_bot=True, chat_id="undefined"):
    """Load the first slice of training data and record generator settings.

    BUG FIX: `to_avoid` previously used a mutable default ([]) that is
    shared across all instances; it now defaults to None and is replaced
    with a fresh list per call (callers passing a list are unaffected).
    """
    to_avoid = [] if to_avoid is None else to_avoid
    self.x, self.y, _ = LoadData.GetData(
        training_dataset_folder_name,
        limit_value=folders_at_the_same_time,
        to_avoid=to_avoid)
    # One-hot labels for the two classes.
    self.y = np_utils.to_categorical(self.y, 2)
    self.x = self.x.astype('float32')
    # Normalise inputs by the global maximum.
    self.x /= np.max(self.x)
    self.x_next_epoch, self.y_next_epoch = self.x, self.y
    self.epoch = 0
    self.batch_size = batch_size
    self.epochs_with_same_data = epochs_with_same_data
    self.training_dataset_folder_name = training_dataset_folder_name
    self.folders_at_the_same_time = folders_at_the_same_time
    self.to_avoid = to_avoid
    self.steps_per_epoch = 0
    self.t = None
    self.total_epochs = total_epochs
    self.enable_telegram_bot = enable_telegram_bot
    self.chat_id = chat_id
def master(k):
    """Cluster the origin data into k groups and score each cluster's transport cost.

    Returns a list of [cost, xs, ys] entries, one per cluster.
    """
    DataFile = './InputData/tmp.txt'
    x, y, m = LoadData.load("./InputData/origin.txt")
    LoadData.store(x, y, DataFile)
    clusters_x, clusters_y = k_means.getResult(DataFile, k=k, Flag=False)
    results = []
    for cluster in clusters_x:
        # Transport cost for this cluster's points.
        cost = connector.transport(clusters_x[cluster], clusters_y[cluster], m)
        results.append([cost, clusters_x[cluster], clusters_y[cluster]])
    return results
def AgeHist(): df = LoadData.readDataSet() df_age = df['Age'] df_age_normal = df_age[df['label'] == 0] df_age_normal.hist( bins=40, grid=False).get_figure().savefig('D://tmsc_data/Age_distribution.png') Dict = {} for age in df_age_normal: if Dict.has_key(age): Dict[age] += 1 else: Dict[age] = 1 Dict = sorted(Dict.items(), key=lambda d: d[0], reverse=False) keylist = [] vallist = [] for key, val in Dict: keylist.append(key) vallist.append(val) print Dict
def AgeSection():
    """Split normal (label == 0) records into five age bands and return their
    feature frames with the 'label', 'id', and 'Age' columns removed.

    Returns (56-59, 60-70, 71-82, 83-90, 91+) in that order; row counts in
    the inline comments come from the original dataset.

    NOTE(review): the second filter of each band indexes an already-filtered
    frame with a boolean mask built from the full `df`; pandas aligns the
    mask on the index, but this chained-indexing pattern is fragile --
    confirm behaviour before restructuring.
    """
    df = LoadData.readDataSet()
    df_features = df.drop(['label', 'id'], axis=1)
    df_features = df_features[df['label'] == 0]
    df_age56to59 = df_features[df['Age'] <= 59]  # 10 rows
    df_age56to59 = df_age56to59.drop('Age', axis=1)
    df_age60to70 = df_features[df['Age'] <= 70]  # 435 rows
    df_age60to70 = df_age60to70[df['Age'] >= 60]
    df_age60to70 = df_age60to70.drop('Age', axis=1)
    df_age71to82 = df_features[df['Age'] <= 82]  # 2092 rows
    df_age71to82 = df_age71to82[df['Age'] >= 71]
    df_age71to82 = df_age71to82.drop('Age', axis=1)
    df_age83to90 = df_features[df['Age'] >= 83]  # 515 rows
    df_age83to90 = df_age83to90[df['Age'] <= 90]
    df_age83to90 = df_age83to90.drop('Age', axis=1)
    df_age91to96 = df_features[df['Age'] >= 91]  # 24 rows
    df_age91to96 = df_age91to96.drop('Age', axis=1)
    return df_age56to59, df_age60to70, df_age71to82, df_age83to90, df_age91to96
def build_model(phi, restore=False):
    """Assemble the FISTA-net graph and return it with a TF session.

    restore=False -> freshly initialised session for training
                     (returns cost/optimiser tensors as well);
    restore=True  -> session restored from checkpoint for reconstruction.
    """
    # Pre-computed placeholders/operators for the measurement matrix.
    Xinput, Xoutput, Phi, PhiT, Yinput = LD.pre_calculate(phi)
    prediction, predictionSymmetric, transField = build_fista(
        Xinput, Phi, PhiT, Yinput, reuse=False)
    costMean, costSymmetric, costSparsity = compute_cost(
        prediction, predictionSymmetric, Xoutput, transField)
    # Weighted sum of the three loss terms.
    costAll = costMean + 0.01 * costSymmetric + 0.001 * costSparsity
    optmAll = tf.train.AdamOptimizer(
        learning_rate=learningRate).minimize(costAll)

    init = tf.global_variables_initializer()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
    sess = tf.Session(config=config)

    if restore is False:
        # Training: start from freshly initialised variables.
        sess.run(init)
        return sess, saver, Xinput, Xoutput, costAll, optmAll, Yinput, prediction
    # Reconstruction: load the trained weights from the checkpoint.
    saver.restore(sess, '%s/%d.cpkt' % (modelDir, ncpkt))
    return sess, saver, Xinput, Xoutput, Yinput, prediction
def train(args):
    """Train the AFM model on the configured dataset and report the best epoch."""
    # Data loading
    data = DATA.LoadData(args.path, args.dataset)
    if args.verbose > 0:
        print("AFM: dataset=%s, factors=%s, attention=%d, freeze_fm=%d, #epoch=%d, batch=%d, lr=%.4f, lambda_attention=%.1e, keep=%s, optimizer=%s, batch_norm=%d, decay=%f, activation=%s"
              %(args.dataset, args.hidden_factor, args.attention, args.freeze_fm, args.epoch, args.batch_size, args.lr, args.lamda_attention, args.keep, args.optimizer, args.batch_norm, args.decay, args.activation))
    # Map the CLI activation name to the TF function (default: relu).
    activation_function = tf.nn.relu
    if args.activation == 'sigmoid':
        activation_function = tf.sigmoid
    elif args.activation == 'tanh':
        # BUG FIX: this used '==' (a no-op comparison) instead of '=', so
        # requesting 'tanh' silently fell through to relu.
        activation_function = tf.tanh
    elif args.activation == 'identity':
        activation_function = tf.identity
    save_file = make_save_file(args)
    # Training
    t1 = time()
    num_variable = data.truncate_features()
    if args.mla:
        args.freeze_fm = 1
    model = AFM(data.features_M, args.pretrain, save_file, args.attention,
                eval(args.hidden_factor), args.valid_dimen,
                activation_function, num_variable, args.freeze_fm, args.epoch,
                args.batch_size, args.lr, args.lamda_attention,
                eval(args.keep), args.optimizer, args.batch_norm, args.decay,
                args.verbose, args.mla)
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    # Find the best validation result across iterations.
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print ("Best Iter(validation)= %d\t train = %.4f, valid = %.4f [%.1f s]"
           %(best_epoch+1, model.train_rmse[best_epoch], model.valid_rmse[best_epoch], time()-t1))
def main(students_file, rooms_file, out_format):
    """Load rooms/students JSON into the DB, run the report queries, export each result."""
    loader = ld.LoadJSON()
    students = loader.load(filename=students_file)
    rooms = loader.load(filename=rooms_file)

    db = sql_functions.DBops()
    db.create_table()
    # Create the supporting indexes before inserting.
    for query in sql_queries.INDEX_QUERY:
        db.select_query(query)
    db.insert_queries(rooms, students)
    db.commit()

    for select_num, query in enumerate(sql_queries.SELECT_QUERIES):
        result = db.select_query(query)
        out_name = 'select_' + query_name(select_num) + out_format
        try:
            fmt = out_format.lower()
            if fmt == 'json':
                cd.JSONConversion().write(result, out_name)
            elif fmt == "xml":
                cd.XMLConversion().write(result, out_name)
            else:
                raise ex.FormatException('Please enter format json or xml')
        except ex.FormatException as fe:
            print(fe)
def getPrecision(X_train, Y_train, X_val, Y_val):
    """Fit a random forest, print OOB score and validation precision, and rank feature importances."""
    alg = RandomForestClassifier(n_estimators=50,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 oob_score=True)
    alg.fit(X_train, Y_train)
    # Out-of-bag estimate of model accuracy.
    print('model oob_score:', alg.oob_score_)
    # Column 1 is the probability of the positive class; threshold at 0.44.
    Y_predict = alg.predict_proba(X_val)[:, 1]
    Y_predict[Y_predict <= 0.44] = 0
    Y_predict[Y_predict > 0.44] = 1
    precision = np.count_nonzero(Y_predict == Y_val) / len(X_val)
    print('model precision:', precision)
    # return precision
    # Pair each feature name with its importance score.
    featureList = LoadData.getFeatureName('D://tmsc_data/nameListFile.txt')
    importance_by_name = {}
    for name, score in zip(featureList, alg.feature_importances_):
        importance_by_name[name] = score
    # Rank the features by importance, descending.
    Dict = sorted(importance_by_name.items(), key=lambda d: d[1], reverse=True)
def train(FLAGS):
    """Train the FM model under the given FLAGS and report the best validation epoch."""
    # Data loading
    import pickle as pk  # retained from the original; not used below
    data = DATA.LoadData(FLAGS.path, FLAGS.dataset)
    if FLAGS.verbose > 0:
        print(
            "FM: dataset=%s, embedding_size=%d,#epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%s, metric=%s, optimizer=%s, batch_norm=%d"
            % (FLAGS.dataset, FLAGS.embedding_size, FLAGS.epoch,
               FLAGS.batch_size, FLAGS.lr, FLAGS.lamda, FLAGS.keep,
               FLAGS.metric, FLAGS.optimizer, FLAGS.batch_norm))
    # Training
    start = time()
    model = FM(data.features_M, FLAGS.pretrain, make_save_file(FLAGS),
               FLAGS.embedding_size, FLAGS.valid_dimen, FLAGS.epoch,
               FLAGS.metric, FLAGS.batch_size, FLAGS.lr, FLAGS.lamda,
               FLAGS.keep, FLAGS.optimizer, FLAGS.batch_norm, FLAGS.verbose)
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    # Best epoch = the one with the lowest validation RMSE.
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    print("Best Iter(validation)= %d\t train = %.4f, valid = %.4f [%.1f s]" %
          (best_epoch + 1, model.train_rmse[best_epoch],
           model.valid_rmse[best_epoch], time() - start))
def prepData():
    """Load the SF crime training data and derive numeric model features.

    Adds date components, unix time, numeric ids for day-of-week, category
    and district, and clamps coordinates outside the city to the median.
    Returns the augmented training dataframe.
    """
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()
    # BUG FIX: 'Address' 'X' was missing a comma, so the two strings were
    # silently concatenated into the single entry 'AddressX'.
    features_in = [
        'Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
        'Resolution', 'Address', 'X', 'Y'
    ]
    # break dates into month, day, year, day of week, hour
    # (DatetimeIndex is computed once instead of once per derived column)
    dates = pd.DatetimeIndex(training_data['Dates'])
    training_data['Year'] = dates.year
    training_data['Month'] = dates.month
    training_data['Day'] = dates.day
    training_data['Hour'] = dates.hour
    training_data['Minute'] = dates.minute
    # cast date as unix time
    training_data['UnixTime'] = dates.astype(np.int64) / 10000000000
    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        return sorted_days.index(d)

    training_data['DayNumber'] = training_data['DayOfWeek'].apply(
        dayOfWeekNumber)
    # set up an id number for each category from alphabetical list
    sorted_categories = (np.sort(pd.unique(training_data['Category']))).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(
        categoryNumber)
    sorted_districts = (np.sort(pd.unique(training_data['PdDistrict']))).tolist()

    def districtNumber(district):
        return sorted_districts.index(district)

    training_data['DistrictNumber'] = training_data['PdDistrict'].apply(
        districtNumber)
    # X is longitude, Y is latitude; set points outside the city to medians
    training_data.loc[training_data.X > -122.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.X < -123.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.Y < 37.0, 'Y'] = training_data.Y.median()
    training_data.loc[training_data.Y > 38.0, 'Y'] = training_data.Y.median()
    return (training_data)
def train(args):
    """Train the FM model from parsed args and report the best validation epoch."""
    # Data loading
    data = DATA.LoadData(args.path, args.dataset)
    if args.verbose > 0:
        print(
            "FM: dataset=%s, factors=%d, #epoch=%d, batch=%d, lr=%.4f, lambda=%.1e, keep=%.2f, optimizer=%s, batch_norm=%d"
            % (args.dataset, args.hidden_factor, args.epoch, args.batch_size,
               args.lr, args.lamda, args.keep, args.optimizer,
               args.batch_norm))
    # Training
    t1 = time()
    model = FM(data.features_M, args.pretrain, make_save_file(args),
               args.hidden_factor, args.epoch, args.batch_size, args.lr,
               args.lamda, args.keep, args.optimizer, args.batch_norm,
               args.verbose, args.mla)
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    # Find the best validation result across iterations.
    best_valid_score = min(model.valid_rmse)
    best_epoch = model.valid_rmse.index(best_valid_score)
    # BUG FIX: the format string had six placeholders for five arguments
    # (a stray "[%.1f s]" after 'valid'), which raises
    # "TypeError: not enough arguments for format string" at runtime.
    print(
        "Best Iter(validation)= %d\t train = %.4f, valid = %.4f, test = %.4f [%.1f s]"
        % (best_epoch + 1, model.train_rmse[best_epoch],
           model.valid_rmse[best_epoch], model.test_rmse[best_epoch],
           time() - t1))
def w2v_ic(word, buckets=20):
    """Return concatenated concreteness + imageability bucket vectors for `word`.

    Imageability is bucketed against a max of 7.0 and concreteness against
    5.0; each half contributes `buckets` slots, so the result has
    2 * buckets entries (concreteness first).  When the word has no rating,
    the nearest word2vec neighbour with one is used instead.

    NOTE(review): if `word` is absent from ic_dict entirely, both halves
    stay all-zero and no neighbour fallback runs -- confirm that asymmetry
    is intended.
    """
    global ic_dict
    # Lazily load the imageability/concreteness lexicon on first use.
    if not ic_dict:
        ic_dict = LoadData.load_ic()
    im = [0]*buckets
    cn = [0]*buckets
    if word in ic_dict:
        if ic_dict[word].IMAGEABILITY != None:
            im = bucket(ic_dict[word].IMAGEABILITY, 7., buckets)
        elif word in model:
            # Borrow the rating of the closest rated word2vec neighbour.
            for w2 in model.most_similar(word, topn=20):
                if w2[0] in ic_dict and ic_dict[w2[0]].IMAGEABILITY != None:
                    im = bucket(ic_dict[w2[0]].IMAGEABILITY, 7., buckets)
                    break
        if ic_dict[word].CONCRETENESS != None:
            cn = bucket(ic_dict[word].CONCRETENESS, 5., buckets)
        elif word in model:
            for w2 in model.most_similar(word, topn=20):
                if w2[0] in ic_dict and ic_dict[w2[0]].CONCRETENESS != None:
                    cn = bucket(ic_dict[w2[0]].CONCRETENESS, 5., buckets)
                    break
    return cn + im
def ElaborateImagesAndMakePredition(self, inp_img):
    """Extract a face from `inp_img`, compare it with the reference image, and
    return (face_found, match_probability).

    Returns (False, 0) when no face could be extracted.
    """
    # crop a good percentage of the image in order to gain performances.
    # found a good tradeoff with those values
    start_time = time.time()
    img_data_pipelined = FaceExtractionPipeline.SingletonPipeline(
    ).FaceExtractionPipelineImage(
        inp_img, math.ceil(np.shape(inp_img)[0] * 20 / 100),
        math.ceil(np.shape(inp_img)[0] * 40 / 100))
    try:
        if img_data_pipelined is None:
            return False, 0
        inp = LoadData.MergeImages(self.ref_img, img_data_pipelined)
        inp = np.expand_dims(inp, axis=0)
        predicted_label = self.model.predict(inp)
        print(('same' if predicted_label[0, 1] > 0.975 else 'wrong') +
              str(predicted_label))
        return True, predicted_label[0, 1]
    finally:
        # BUG FIX: this timing print was originally placed after the return
        # statements and therefore never executed; the finally block runs it
        # on every exit path.
        print("--- %s seconds for a frame---" % (time.time() - start_time))
def prepData():
    """Load the crime training data from disk and derive numeric feature
    columns (date parts, day/category/district ids, clipped coordinates).

    :return: the enriched ``training_data`` DataFrame.
    """
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()
    # BUG FIX: 'Address' 'X' was implicit string concatenation ('AddressX');
    # a comma was missing between the two column names.
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
                   'Resolution', 'Address', 'X', 'Y']

    # break dates into month, day, year, day of week, hour
    # (DatetimeIndex is hoisted: it was rebuilt for every derived column)
    dates = pd.DatetimeIndex(training_data['Dates'])
    training_data['Year'] = dates.year
    training_data['Month'] = dates.month
    training_data['Day'] = dates.day
    training_data['Hour'] = dates.hour
    training_data['Minute'] = dates.minute
    # cast date as unix time (scaled down by 1e10 as in the original)
    training_data['UnixTime'] = dates.astype(np.int64) / 10000000000

    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        return sorted_days.index(d)

    training_data['DayNumber'] = training_data['DayOfWeek'].apply(dayOfWeekNumber)

    # set up an id number for each category from the alphabetical list
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)

    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()

    def districtNumber(district):
        return sorted_districts.index(district)

    training_data['DistrictNumber'] = training_data['PdDistrict'].apply(districtNumber)

    # X is longitude, Y is latitude; replace points outside the city
    # with the median values.
    training_data.loc[training_data.X > -122.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.X < -123.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.Y < 37.0, 'Y'] = training_data.Y.median()
    training_data.loc[training_data.Y > 38.0, 'Y'] = training_data.Y.median()
    return (training_data)
def test() :
    """Smoke-test the kNN classifier: train on the full CSV training set and
    classify every row of the test set.

    NOTE(review): the predicted ``label`` is discarded -- the print statements
    are commented out, so this function currently produces no output.
    """
    import LoadData
    # TRAIN_FILE / TEST_FILE, array and classify_kNN are module-level names.
    trainMat, classLabelVector = LoadData.loadTrainDataFromCSV(TRAIN_FILE)
    trainMat = array(trainMat)
    testMat = LoadData.loadTestDataFromCSV(TEST_FILE)
    k = 3  # number of nearest neighbours
    # Earlier hold-out variant, kept for reference:
    # testMat = trainMat[0:50]
    # i = 0
    # for testData in testMat :
    #     label = classify_kNN(testData, trainMat[50:], classLabelVector[50:], k)
    #     print "the real answer is ", classLabelVector[i], "the label is ", label
    #     i += 1
    for testData in testMat :
        label = classify_kNN(testData, trainMat, classLabelVector, k)
def theanoScatterPCA(path, dataset):
    """Draw a 10x10 grid of pairwise 2-D PCA scatter plots, one per pair of
    class labels, and save it to ``scatter/<dataset>.png``.

    :param path: dataset location passed to the LoadData loaders.
    :param dataset: 'mnist' or 'cifar' (any other value raises NameError
                    later because imageData is never assigned).
    """
    if dataset == 'mnist':
        print('Loading Mnist Data')
        (imageData, imageLabels) = LoadData.loadMNISTUnSplit(path, shared=False)
        print(imageData.shape)
    elif dataset == 'cifar':
        print('Loading Cifar Data')
        (imageData, imageLabels) = LoadData.loadCIFAR10UnSplit(path, shared=False)
        imageData = imageData / 255.  # scale pixel values to [0, 1]
    print('Loaded')
    print("Computing Scatter Plot")
    # Group sample indices by (stringified) class label.
    labelIds = dict()
    for idx in range(len(imageLabels)):
        if str(imageLabels[idx]) not in labelIds:
            labelIds[str(imageLabels[idx])] = []
        labelIds[str(imageLabels[idx])].append(idx)
    fig, plots = plt.subplots(10, 10)
    fig.set_size_inches(50, 50)
    plt.prism()
    # Only the upper triangle (i <= j) is computed; the lower mirror reuses
    # the same projection. NOTE: xrange implies Python 2.
    for i, j in product(xrange(10), repeat=2):
        if i > j:
            continue
        idx = labelIds[str(i)] + labelIds[str(j)]
        print('\tCalculating PCA For Classes %d And %d' %(i,j))
        # Project just the samples of classes i and j onto 2 components.
        X_transformed = runPCA(data=imageData, elems=idx, components=2)
        Y_ = imageLabels[labelIds[str(i)] + labelIds[str(j)]]
        plots[i, j].scatter(X_transformed[:, 0], X_transformed[:, 1], c=Y_)
        plots[i, j].set_xticks(())
        plots[i, j].set_yticks(())
        # Mirror the plot below the diagonal.
        plots[j, i].scatter(X_transformed[:, 0], X_transformed[:, 1], c=Y_)
        plots[j, i].set_xticks(())
        plots[j, i].set_yticks(())
        if i == 0:
            plots[i, j].set_title(j)
            plots[j, i].set_ylabel(j)
    plt.tight_layout()
    plt.savefig('scatter/' + dataset + ".png")
    print("Computing Scatter Plot Finished")
def UpdateStocks(getSyms=False, getQuotes=False):
    """Optionally refresh symbol and quote downloads, then reload the
    module-level SYMBOLS cache.

    :param getSyms: when exactly True, re-download the symbol list.
    :param getQuotes: when exactly True, re-download NASDAQ and NYSE quotes.
    """
    global SYMBOLS
    if getSyms is True:
        LoadData.downloadSymbols()
    if getQuotes is True:
        for exchange in ('nasdaq', 'nyse'):
            LoadData._downloadStocks(exchange)
    # Always refresh the in-memory symbol cache.
    SYMBOLS = LoadData._getSymbols()
def AS(sym=None):
    """Analyse stocks: test a single symbol when *sym* is given, otherwise
    iterate over a slice of the cached SYMBOLS list.

    NOTE(review): the else branch only prints the symbol names and never
    constructs/runs a tester -- presumably the Test/parse calls are missing.
    Also SYMBOLS[1:20] skips the first entry; confirm both are intended.
    """
    global SYMBOLS
    UpdateStocks()  # called with defaults, so no downloads -- cache refresh only
    if len(SYMBOLS) < 1 and sym is None:
        print("Not enough stocks loaded.")
        return
    # If a single stock symbol has been passed, only test that symbol.
    if isinstance(sym, str):
        print("Will test: " + sym)
        tester = PatProcess.Test(LoadData.historicalData(sym))
        tester.parse()
    # Otherwise, test them all.
    else:
        for s in SYMBOLS[1:20]:
            print("Will test:" + s[0])
def ic(word, buckets=5):
    """Return concreteness + imageability bucket vectors for *word*.

    :param word: the word to rate; a (word, ...) tuple is unpacked first.
    :param buckets: number of buckets per rating dimension.
    :return: concreteness buckets followed by imageability buckets,
             2*buckets entries in total (zeros when the word is unrated).
    """
    global ic_dict
    if type(word) == tuple:
        word = word[0]
    if not ic_dict:
        # Lazily populate the shared ratings dictionary on first use.
        ic_dict = LoadData.load_ic()
    imageability = [0] * buckets
    concreteness = [0] * buckets
    if word in ic_dict:
        entry = ic_dict[word]
        if entry.IMAGEABILITY != None:
            imageability = bucket(entry.IMAGEABILITY, 7., buckets)
        if entry.CONCRETENESS != None:
            concreteness = bucket(entry.CONCRETENESS, 5., buckets)
    return concreteness + imageability
def predict():
    """CLI entry point: load a previously trained logistic-regression model,
    classify the MNIST test set, print the accuracy and write the predicted
    labels to ``<output>/predictions.txt``.
    """
    parser = argparse.ArgumentParser(prog='Logistic Regression', conflict_handler='resolve', description='''\
This scripts predicts the classes according to a previously saved model of the provided dataset and saves it to the given output folder ''')
    parser.add_argument('-o', '--output', type=str, default="out", required=False,
                        help='Path To The Output Folder')
    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-m', '--model', type=str, required=True,
                               help='The Previously Trained Model')
    # NOTE(review): -d sits in the "Required Arguments" group but is declared
    # required=False; left unchanged to preserve the CLI contract.
    requiredNamed.add_argument('-d', '--dataset', type=str, required=False,
                               help='Path To The Dataset [MNIST]')
    parsed = parser.parse_args()

    if not os.path.exists(parsed.output):
        os.makedirs(parsed.output)

    # params[0] = weights, params[1] = bias of the saved model.
    params = loadParams(parsed.model)
    (train_images, train_labels), (validation_images, validation_labels), \
        (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)

    regressor = LogisticRegressor(input=test_images, labels=None,
                                  weights=params[0], bias=params[1])
    # FIX: the compiled function was named `predict`, shadowing this function.
    predict_fn = theano.function(
        inputs=[],
        outputs=regressor.predictions
    )
    predictions = predict_fn()
    hits = (predictions == test_labels.eval()).sum()
    accuracy = float(hits) / len(predictions)
    print('Num Predictions:\t%d' %(len(predictions)))
    print('Num Hits:\t\t%d' %(hits))
    print('Accuracy:\t\t%f' %(accuracy))

    # Serialize predictions tab-separated, with a newline inserted when
    # idx % 10 == 0 (NOTE(review): this also fires at idx == 0 -- confirm
    # the intended grouping; behavior kept as-is).
    out = ''
    for idx in range(len(predictions)):
        out = out + str(predictions[idx]) + '\t'
        if (idx % 10) == 0:
            out = out[:-1] + '\n'
    # FIX: removed the redundant explicit close() inside the with-block;
    # the context manager already closes the file.
    with open(parsed.output + '/predictions.txt', 'w') as outf:
        outf.write(out)
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist', batch_size=20, n_hidden=500):
    """Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron (Theano, Python 2).

    :type learning_rate: float
    :param learning_rate: learning rate (factor for the stochastic gradient)
    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (regularization)
    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (regularization)
    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    :type dataset: string
    :param dataset: 'mnist' or 'emotion' -- selects image size/labels
    :type batch_size: int
    :param batch_size: number of examples per minibatch
    :type n_hidden: int
    :param n_hidden: number of hidden-layer units
    """
    datasets = Ld.load_share(dataset)
    # Per-dataset image geometry and early-stopping constants.
    # NOTE(review): patience/patience_increase/improvement_threshold set here
    # are overwritten unconditionally before the training loop below.
    if dataset=="mnist":
        im_sz = [28,28]
        num_label = 10
        patience = 5000                 # look at this many examples regardless
        patience_increase = 2           # wait this much longer when a new best is found
        improvement_threshold = 0.995   # relative improvement considered significant
    elif dataset=="emotion":
        im_sz = [48,48]
        num_label = 10
        patience = 5000
        patience_increase = 2
        improvement_threshold = 0.995
    test_set_x, test_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    train_set_x, train_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # (Python 2 integer division).
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images
    y = T.ivector('y')     # the labels are presented as a 1D vector of [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=im_sz[0] * im_sz[1],
                     n_hidden=n_hidden, n_out=num_label)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2), expressed symbolically
    cost = classifier.negative_log_likelihood(y) \
        + L1_reg * classifier.L1 \
        + L2_reg * classifier.L2_sqr

    # Theano functions computing the minibatch error on test/validation data.
    test_model = theano.function(inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # Gradient of the cost with respect to each parameter.
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # (parameter, update expression) pairs for plain SGD.
    updates = []
    for param, gparam in zip(classifier.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    # `train_model` returns the cost and simultaneously applies `updates`.
    train_model = theano.function(inputs=[index], outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters (these override the per-dataset values above)
    patience = 10000                # look at this many examples regardless
    patience_increase = 2           # wait this much longer when a new best is found
    improvement_threshold = 0.995   # relative improvement considered significant
    validation_frequency = min(n_train_batches, patience / 2)
        # go through this many minibatches before checking the network on the
        # validation set; here that means checking every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number (counts minibatches across epochs)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                     (epoch, minibatch_index + 1, n_train_batches,
                      this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # evaluate on the test set at the new best
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def train_autoencoder():
    """CLI entry point: train a sparse auto-encoder on MNIST (Theano) with
    either an L1 or a Kullback-Leibler sparsity penalty, early stopping on
    validation loss, then save a 10x10 grid of reconstructed test images.
    """
    ## parses the provided parameters according to the command line input
    parser = argparse.ArgumentParser(prog='AutoEncoder', conflict_handler='resolve', description='''\
This script should enable the user to train his AutoEncoder according to the input parameters ''')
    parser.add_argument('-l', '--learningrate', type=float, default=0.025, required=False,
                        help='The Learning Rate')
    parser.add_argument('-b', '--batchsize', type=int, default=20, required=False,
                        help='Batch Size For Training')
    # NOTE(review): '-h' collides with argparse's built-in help flag;
    # conflict_handler='resolve' silently reassigns it -- confirm intended.
    parser.add_argument('-h', '--reducedUnits', type=int, default=30, required=False,
                        help='Number of Reduced Layer Units')
    parser.add_argument('-o', '--output', type=str, default="out", required=False,
                        help='Path To The Output Folder')
    parser.add_argument('-1', '--l1reg', type=float, default=0.1, required=False,
                        help='Value For L1 Regularisaion')
    # NOTE(review): '-k' is declared twice; with conflict_handler='resolve'
    # the later declaration takes the short flag -- confirm intended.
    parser.add_argument('-k', '--kul_leib_penalty', type=float, default=0.04, required=False,
                        help='Value For Kullback Leiber Divergence Penalty')
    parser.add_argument('-k', '--kul_leib_beta', type=float, default=1.0, required=False,
                        help='Controls The Weight Of The Sparsity Penalty Term')
    parser.add_argument('-s', '--sparsity', type=str, default='l1reg',
                        choices=['l1reg', 'kul_leib'], required=False,
                        help='Choose Which Penalty Should Be Used')
    parser.add_argument('-e', '--epochs', type=int, default=500, required=False,
                        help='Number Of Epochs')
    parser.add_argument('-m', '--momentum', type=float, default=0.9, required=False,
                        help='The Momentum Rate')
    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-d', '--dataset', type=str, required=True,
                               help='Path To The Training Set (MNIST)')
    parsed = parser.parse_args()

    # Output directory depends on the chosen sparsity penalty.
    if parsed.sparsity == 'kul_leib':
        assert parsed.kul_leib_penalty < 0.05
        outpath_raw = parsed.output + "/kul_leib"
    else:
        outpath_raw = parsed.output + "/l1reg"
    if not os.path.exists(outpath_raw):
        os.makedirs(outpath_raw)

    (train_images, train_labels), (validation_images, validation_labels), \
        (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)#, shuffle=True)

    number_train_images_batches = train_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_test_images_batches = test_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_validation_images_batches = validation_images.get_value(borrow=True).shape[0] // parsed.batchsize

    index = T.lscalar()
    imageData = T.matrix('imageData')
    rng = np.random.RandomState(1234)  ## numpy random range generator

    autoencoder = AutoEncoder(
        input=imageData,
        rng=rng,
        n_input=28*28,  ## image 28x28
        n_reduced=parsed.reducedUnits,
        sparsity_param=parsed.kul_leib_penalty,
        beta=parsed.kul_leib_beta,
        n_reconstructed=28*28
    )

    # Sparsity penalty: L1 on the reduced layer's weights, or the
    # auto-encoder's own KL-divergence term.
    if parsed.sparsity == 'l1reg':
        cost_sparse = (
            autoencoder.cost + parsed.l1reg * abs(autoencoder.reducedLayer.weights).sum()
        )
    else:
        cost_sparse = (
            autoencoder.cost + autoencoder.kul_leib
        )

    updates = (
        gradient_updates_momentum(cost_sparse, autoencoder.params,
                                  parsed.learningrate, parsed.momentum)
    )

    # Train on one minibatch, applying the momentum updates.
    trainBatchGivenIndex = theano.function(
        inputs=[index],
        outputs= cost_sparse,
        updates= updates,
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )
    # Validation cost for one minibatch (no updates).
    validateBatchGivenIndex = theano.function(
        inputs=[index],
        outputs= cost_sparse,
        givens={
            imageData: validation_images[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    # Early-stopping bookkeeping.
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    best_validation_loss = np.inf
    best_validation_epoch = 0
    val_freq = min(number_train_images_batches, patience // 2)
    epoch = 0

    encoder_name = None
    if parsed.sparsity == 'l1reg':
        encoder_name = 'encoder_' + str(parsed.l1reg) + '_l1'
    else:
        encoder_name = 'encoder_' + str(parsed.kul_leib_beta) + '_kul_leib'

    done_looping = False
    while (epoch < parsed.epochs) and not (done_looping):
        epoch = epoch + 1
        for minibatch_index in range(number_train_images_batches):
            minibatch_squared_error_loss = trainBatchGivenIndex(minibatch_index)
            idx = (epoch - 1) * number_train_images_batches + minibatch_index
            if (idx + 1) % val_freq == 0:
                validation_losses = [validateBatchGivenIndex(currentValidationBatch)
                                     for currentValidationBatch in range(number_validation_images_batches)]
                this_validation_loss = np.mean(validation_losses)
                print("Epoch %d, Batch Index: %d / %d, Accuracy On Validation Samples: %f" \
                      % (epoch, minibatch_index, number_train_images_batches, this_validation_loss))
                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, idx * patience_increase)
                    best_validation_epoch = epoch
                    # Persist the best model seen so far.
                    autoencoder.save(outpath_raw, encoder_name)
                    lowest_cost = this_validation_loss
                    best_validation_loss = this_validation_loss
                    best_epoch = epoch
                    best_minibatch = minibatch_index
            if patience <= idx:
                done_looping = True
                break

    # NOTE(review): best_epoch/best_minibatch/lowest_cost are only bound
    # inside the improvement branch; if no validation step ever ran this
    # print raises NameError.
    print('Saved Model With Respect To Epoch %d , Minibatch %d And Cost Of %f' % \
          (best_epoch, best_minibatch, lowest_cost))

    # Reconstruct the first 100 test images with the trained model.
    reconstruct_images = theano.function(
        inputs=[],
        outputs=autoencoder.reconstruction,
        givens={
            imageData: test_images[:100]
        }
    )
    reconstructed_images = reconstruct_images()
    # NOTE(review): reshape returns a new array that is discarded here; the
    # result is not assigned back -- confirm arraysToImgs handles the flat shape.
    reconstructed_images.reshape(100,28,28)# * 255

    outpath = None
    if parsed.sparsity == 'l1reg':
        outpath = outpath_raw + '/reconstruct_' + str(parsed.l1reg) + '_l1.png'
    else:
        outpath = outpath_raw + '/reconstruct_' + str(parsed.kul_leib_beta) + '_kul_leib.png'
    arraysToImgs(rows=10,colums=10,arr=reconstructed_images,path=outpath,out_shape=(28,28))
def prepData():
    """Load the crime data, derive numeric features for both the training and
    Kaggle frames, and split into train/validation/test/kaggle sets.

    :return: four ``(X, y)`` tuples: training, validation, test, kaggle.
    """
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()
    # BUG FIX: 'Address' 'X' was implicit string concatenation ('AddressX');
    # a comma was missing between the two column names.
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
                   'Resolution', 'Address', 'X', 'Y']

    # break dates into month, day, year, day of week, hour
    # (DatetimeIndex hoisted per frame: it was rebuilt for every column)
    train_dates = pd.DatetimeIndex(training_data['Dates'])
    kaggle_dates = pd.DatetimeIndex(kaggle_data['Dates'])
    training_data['Year'] = train_dates.year - 2000   # small year offset for scaling
    training_data['Month'] = train_dates.month
    training_data['Day'] = train_dates.day
    training_data['Hour'] = train_dates.hour
    training_data['Minute'] = train_dates.minute
    kaggle_data['Year'] = kaggle_dates.year - 2000
    kaggle_data['Month'] = kaggle_dates.month
    kaggle_data['Day'] = kaggle_dates.day
    kaggle_data['Hour'] = kaggle_dates.hour
    kaggle_data['Minute'] = kaggle_dates.minute
    # cast date as unix time (scaled down by 1e10 as in the original)
    training_data['UnixTime'] = train_dates.astype(np.int64) / 10000000000
    kaggle_data['UnixTime'] = kaggle_dates.astype(np.int64) / 10000000000

    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                   'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        return sorted_days.index(d)

    training_data['DayNumber'] = (training_data['DayOfWeek'].apply(dayOfWeekNumber))
    kaggle_data['DayNumber'] = (kaggle_data['DayOfWeek'].apply(dayOfWeekNumber))

    # set up an id number for each category from the alphabetical list
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)
    # no categories for validation data, that's what we're trying to figure out;
    # add an output array for the validation set just for convenience
    kaggle_data['CategoryNumber'] = 0

    # scale lat and long to small offsets around the city
    def scaleLat(lat):
        return lat - 37.0

    training_data['ScaledLatitude'] = training_data['Y'].apply(scaleLat)
    kaggle_data['ScaledLatitude'] = kaggle_data['Y'].apply(scaleLat)

    def scaleLong(long):
        return long + 122.0

    training_data['ScaledLongitude'] = training_data['X'].apply(scaleLong)
    kaggle_data['ScaledLongitude'] = kaggle_data['X'].apply(scaleLong)

    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()

    def districtNumber(district):
        return sorted_districts.index(district)

    # district id scaled to [0, 1] by dividing by 9
    training_data['DistrictNumber'] = (training_data['PdDistrict'].apply(districtNumber)) / 9.
    kaggle_data['DistrictNumber'] = (kaggle_data['PdDistrict'].apply(districtNumber)) / 9.

    # split inputs from outputs
    features = ['Year', 'Month', 'Day', 'Hour', 'DayNumber', 'DistrictNumber',
                'ScaledLatitude', 'ScaledLongitude']
    training_x = training_data[features]
    training_y = training_data['CategoryNumber']
    kaggle_x = kaggle_data[features]
    kaggle_y = kaggle_data['CategoryNumber']

    # create a testing and validation set from the training_data (80/10/10)
    x_train, x_split, y_train, y_split = cross_validation.train_test_split(
        training_x, training_y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = cross_validation.train_test_split(
        x_split, y_split, test_size=0.5)

    # convert from dataframe to arrays of arrays
    train_x = x_train.as_matrix()
    test_x = x_test.as_matrix()
    validate_x = x_validate.as_matrix()
    x_kaggle = kaggle_x.as_matrix()
    y_train = y_train.as_matrix()
    y_test = y_test.as_matrix()
    y_validate = y_validate.as_matrix()
    kaggle_y = kaggle_y.as_matrix()

    # package them up
    training_set = (train_x, y_train)
    validation_set = (validate_x, y_validate)
    test_set = (x_test, y_test)
    kaggle_set = (x_kaggle, kaggle_y)
    print (training_x.head())
    print(len(kaggle_y))
    return training_set, validation_set, test_set, kaggle_set
def test_cnn(dataset_matrix_r, label_vector_r, learning_rate=0.1, n_epochs=120,
             nkerns=[30, 90], batch_size=250):
    """Train a two-conv-layer CNN + hidden layer + logistic output (9 classes)
    with SGD and early stopping (Theano, Python 2).

    :param dataset_matrix_r: raw feature matrix passed to the data loader.
    :param label_vector_r: raw label vector passed to the data loader.
    :param learning_rate: SGD step size.
    :param n_epochs: maximal number of training epochs.
    :param nkerns: number of kernels per conv layer.
                   NOTE(review): mutable default argument; harmless here since
                   it is only read, never mutated.
    :param batch_size: minibatch size.
    :return: flat list of all model parameters after training.
    """
    # Load dataset (train/validation/test shared-variable splits).
    datasets = LoadData.load_data_multi(dataset_matrix_r, label_vector_r)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    # Minibatch counts (Python 2 integer division).
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Construct the model
    print "... building the model"
    index = T.lscalar()
    x = T.matrix("x")
    y = T.ivector("y")
    rng = np.random.RandomState(1234)
    # Reshape flat rows to (batch, 5, 5, 10) then move channels first.
    layer0_input = x.reshape((batch_size, 5, 5, 10))
    layer0_input = layer0_input.dimshuffle(0, 3, 1, 2)
    layer0 = ConvPoolLayer.ConvPoolLayer(
        rng, layer0_input, filter_shape=(nkerns[0], 10, 3, 3),
        image_shape=(batch_size, 10, 5, 5)
    )
    layer1 = ConvPoolLayer.ConvPoolLayer(
        rng, layer0.output, filter_shape=(nkerns[1], nkerns[0], 3, 3),
        image_shape=(batch_size, nkerns[0], 3, 3)
    )
    # Fully connected hidden layer on flattened conv output, then 9-way softmax.
    layer3 = MultiLayerPerceptron.HiddenLayer(rng, layer1.output.flatten(2),
                                              nkerns[1], 120, activation=T.tanh)
    layer5 = LogisticLayer.LogisticLayer(layer3.output, 120, 9)
    cost = layer5.negative_log_likelihood(y)

    # Function to train the model (plain SGD updates on all parameters).
    params = layer5.params + layer3.params + layer1.params + layer0.params
    gparams = T.grad(cost, params)
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(params, gparams)]
    train_model = theano.function(
        inputs=[index],
        outputs=[cost],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )
    # Functions to test and validate the model (error rate per minibatch).
    valid_model = theano.function(
        inputs=[index],
        outputs=[layer5.errors(y)],
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )
    test_model = theano.function(
        inputs=[index],
        outputs=[layer5.errors(y)],
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    print "... training the model"
    # Early-stopping parameters.
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.0
    start_time = time.clock()
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print "training @ iter = ", iter
            train_model(minibatch_index)
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.0)
                )
                if this_validation_loss < best_validation_loss:
                    # Extend patience when the improvement is significant.
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # Evaluate the new best model on the test set.
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print (
                        ("     epoch %i, minibatch %i/%i, test error of "
                         "best model %f %%")
                        % (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.0)
                    )
            if patience <= iter:
                done_looping = True
                break
    end_time = time.clock()
    print ("Optimization complete.")
    print (
        "Best validation score of %f %% obtained at iteration %i, "
        "with test performance %f %%"
        % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file "
        + os.path.split(__file__)[1]
        + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )
    return params
best_validation_loss = this_validation_loss best_iter = iter test_losses = [test_model(i) for i in xrange(n_test_batches)] test_score = np.mean(test_losses) print ( (" epoch %i, minibatch %i/%i, test error of " "best model %f %%") % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0) ) if patience <= iter: done_looping = True break end_time = time.clock() print ("Optimization complete.") print ( "Best validation score of %f %% obtained at iteration %i, " "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0) ) print >> sys.stderr, ( "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0) ) return params if __name__ == "__main__": dataset_matrix, label_vector, dataset_matrix_r, label_vector_r = LoadData.preprocess_data() params = test_cnn(dataset_matrix_r, label_vector_r)
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """Train an MLP on MNIST with SGD, L1/L2 regularization and early
    stopping (Theano, Python 2).

    :param learning_rate: SGD step size.
    :param L1_reg: weight of the L1 penalty added to the cost.
    :param L2_reg: weight of the L2 penalty added to the cost.
    :param n_epochs: maximal number of training epochs.
    :param dataset: path to the pickled MNIST dataset.
    :param batch_size: minibatch size.
    :param n_hidden: number of hidden-layer units.
    """
    # Load dataset (train/validation/test shared-variable splits).
    datasets = LoadData.load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    # Minibatch counts (Python 2 integer division).
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Construct the model
    print '... building the model'
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    rng = np.random.RandomState(1234)
    classifier = MultiLayerPerceptron.MLP(rng, x, 28*28, n_hidden, 10)
    # Negative log-likelihood plus L1/L2 regularization terms.
    cost = classifier.negative_log_likelihood(y) + L1_reg*classifier.L1 + L2_reg*classifier.L2

    # Function to train the model (plain SGD on all parameters).
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate*gparam)
               for param, gparam in zip(classifier.params, gparams)]
    train_model = theano.function(inputs=[index], outputs=[cost], updates=updates,
                                  givens={x:train_set_x[index*batch_size: (index+1) * batch_size],
                                          y:train_set_y[index*batch_size: (index+1) * batch_size]})

    # Functions to test and validate the model (error rate per minibatch).
    valid_model = theano.function(inputs=[index], outputs=[classifier.errors(y)],
                                  givens={x:valid_set_x[index * batch_size: (index+1) * batch_size],
                                          y:valid_set_y[index * batch_size: (index+1) * batch_size]})
    test_model = theano.function(inputs=[index], outputs=[classifier.errors(y)],
                                 givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                         y:test_set_y[index * batch_size: (index+1) * batch_size]})

    # Train the model
    print 'Training the model ...'
    # Early-stopping parameters.
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)
            # Global iteration number across epochs.
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.)
                      )
                if this_validation_loss < best_validation_loss:
                    # Extend patience when the improvement is significant.
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # Evaluate the new best model on the test set.
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
            if patience <= iter:
                done_looping = True
                break
    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Logistic-regression style evaluation on the horse-colic dataset:
# fit weights with gradient descent on the training file, then report
# the fraction of test rows whose thresholded score matches the label.
__author__ = 'computer'

import sys

import numpy as np

import SGD
import LoadData

# Fit weights on the training split.
dataDic, labelDic = LoadData.loadData("horseColicTraining.txt")
w = SGD.gd(dataDic, labelDic, alpha=0.001, epochs=1000)
# w = SGD.sgd(dataDic, labelDic, alpha=0.001, epochs=500)

# Score the held-out test split.
dataDic, labelDic = LoadData.loadData("horseColicTest.txt")
h = np.mat(dataDic).dot(w)
cnt = 0
for i in range(len(labelDic)):
    # Count a hit when the thresholded score agrees with the label.
    if h[i] >= 0.5 and labelDic[i] >= 0.5:
        cnt += 1
    elif h[i] < 0.5 and labelDic[i] <= 0.5:
        cnt += 1
# BUG FIX: under Python 2, cnt / len(labelDic) is integer (floor) division
# and always prints 0; float() makes the accuracy correct on both 2 and 3.
print(cnt / float(len(labelDic)))
# Kaggle MNIST submissions # started with .... # http://neuralnetworksanddeeplearning.com/chap1.html source code from "Neural Networks and Deep Learning" Nielsen ################################################################################## ################################################################################## # to do # improve network # maybe better stats and some graphical output # read in the data files and format as needed import LoadData training_data, validation_data, test_data, kaggle_data = LoadData.load_data_wrapper() ########## multi layer network ###################################################### #~ 96.8% accurate first pass import MultiLayer # create the network net = MultiLayer.Network([784, 120, 60, 10]) # layer sizes ( input, hidden, output ) epochs = 30 # number of passes through full data set batch_size = 5 # size of batches, network updated once per batch alpha = 1.2 # learning step lmbda = 0.00005 # regularization net.sgd(training_data, epochs, batch_size, alpha, lmbda, test_data=test_data) # train epochs, batch size, alpha
""" Created on Dec 5, 2015 @author: Joe """ import LoadData as ld import numpy as np import pandas as pd import HFModel as hf from sklearn.cross_validation import train_test_split from sklearn import metrics if __name__ == "__main__": # H3_Test = ld.loadData('data/H3/Testing_01_21_1358755201.mat') H3 = ld.loadData("data/H3/Tagged_Training_07_30_1343631601.mat") hf.dataPrep(H3.HF, np.array(H3.tagInfo)) X = H3.HF.drop(["Timestamp", "Back Porch Lights"], axis=1) Y = H3.HF["Back Porch Lights"] # Set randomness so that we all get the same answer np.random.seed(841) # Split the data into train and test pieces for both X and Y X_train, X_test, Y_train, Y_test = train_test_split(X.head(2000), Y.head(2000), train_size=0.80) model = hf.HFModel(X_train, Y_train) print "Accuracy on test = %.3f" % metrics.accuracy_score(model.predict(X_test), Y_test) # print(H3.L1.head(5)) # print(ld.getApplianceData(H3.HF, H3.tagInfo).head(1))
# -*- coding: utf-8 -*- import LoadData dataset_matrix, label_vector, dataset_matrix_r, label_vector_r = LoadData.preprocess_data() datasets = LoadData.load_data_multi(dataset_matrix_r, label_vector_r)
def trainRegressor():
    """Train a Theano logistic regressor on MNIST with early stopping.

    Parses command-line arguments (learning rate, batch size, output folder,
    epoch cap, plot flag, dataset path), builds minibatch train/validation/
    test Theano functions, and runs SGD with a patience-based early-stopping
    loop.  The best model (by test error) is saved to the output folder.

    NOTE(review): '-p/--plot' uses type=bool, so any non-empty string on the
    command line (including "False") parses as True -- confirm intended.
    """
    parser = argparse.ArgumentParser(prog='Logistic Regression', conflict_handler='resolve', description='''\
This script should enable the user to train his Logistic Regression Model according to the input parameters
''')
    parser.add_argument('-l', '--learningrate', type=float, default=0.01, required=False, help='The Learning Rate')
    parser.add_argument('-b', '--batchsize', type=int, default=20, required=False, help='The Batch Size')
    parser.add_argument('-o', '--output', type=str, default="out", required=False, help='Path To The Output Folder')
    parser.add_argument('-e', '--epochs', type=int, default=200, required=False, help='Maximum Number Of Epochs')
    parser.add_argument('-p', '--plot', type=bool, default=False, required=False, help='Set To True In Order To Plot Error Curves')
    requiredNamed = parser.add_argument_group('Required Arguments')
    requiredNamed.add_argument('-d', '--dataset', type=str, required=True, help='Path To The Training Set')
    parsed = parser.parse_args()

    # Make sure the output folder exists before anything gets saved there.
    if not os.path.exists(parsed.output):
        os.makedirs(parsed.output)

    # Load the three MNIST splits as Theano shared variables.
    (train_images, train_labels), (validation_images, validation_labels), \
        (test_images, test_labels) = LoadData.loadMNIST(parsed.dataset)

    # Number of whole minibatches per split (floor division drops the remainder).
    number_train_images_batches = train_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_validation_images_batches = validation_images.get_value(borrow=True).shape[0] // parsed.batchsize
    number_test_images_batches = test_images.get_value(borrow=True).shape[0] // parsed.batchsize

    # Symbolic inputs: minibatch index, image matrix, integer label vector.
    index = T.lscalar()
    imageData = T.matrix('imageData')
    imageLabels = T.ivector('imageLabels')

    # 28x28 input pixels, 10 digit classes.
    regressor = LogisticRegressor(input=imageData, labels=imageLabels, n_in=28 * 28, n_out=10)

    # One SGD step on minibatch `index`: returns the cost and updates
    # weights/bias by plain gradient descent.
    trainBatchGivenIndex = theano.function(
        inputs=[index],
        outputs=[regressor.cost],
        updates=[(regressor.weights, regressor.weights - parsed.learningrate * T.grad(cost=regressor.cost, wrt=regressor.weights)),
                 (regressor.bias, regressor.bias - parsed.learningrate * T.grad(cost=regressor.cost, wrt=regressor.bias))],
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: train_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    # Misclassification rate on a training minibatch (used only for plotting).
    trainAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: train_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: train_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    # Misclassification rate on a validation minibatch.
    # (Name keeps the original's "valdiation" spelling to stay byte-identical.)
    valdiationAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: validation_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: validation_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    # Misclassification rate on a test minibatch.
    testAccuracyGivenIndex = theano.function(
        inputs=[index],
        outputs=regressor.missclassified,
        givens={
            imageData: test_images[index * parsed.batchsize: (index + 1) * parsed.batchsize],
            imageLabels: test_labels[index * parsed.batchsize: (index + 1) * parsed.batchsize]
        }
    )

    # Early-stopping bookkeeping: look at at least `patience` minibatches,
    # extending the budget whenever validation improves significantly.
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    best_validation_loss = np.inf
    best_validation_epoch = 0
    best_testing_loss = np.inf
    best_testing_epoch = 0
    test_score = 0.

    # Per-epoch accuracy curves ([epochs], [values]) collected only if plotting.
    if parsed.plot:
        trainRes = [[], []]
        valRes = [[], []]
        testRes = [[], []]

    done_looping = False
    # Validate at least once per epoch, more often while patience is small.
    val_freq = min(number_train_images_batches, patience // 2)
    epoch = 0
    while epoch < parsed.epochs and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(number_train_images_batches):
            minibatch_avg_cost = trainBatchGivenIndex(minibatch_index)
            # Global minibatch counter across epochs.
            idx = (epoch - 1) * number_train_images_batches + minibatch_index
            if (idx + 1) % val_freq == 0:
                validation_losses = [valdiationAccuracyGivenIndex(currentValidationBatch)
                                     for currentValidationBatch in range(number_validation_images_batches)]
                this_validation_loss = np.mean(validation_losses)
                print("Epoch %d, Batch Index: %d / %d, Accuracy On Validation Samples: %f" \
                      % (epoch, minibatch_index, number_train_images_batches, (100 - this_validation_loss * 100)))
                if this_validation_loss < best_validation_loss:
                    # A significant improvement buys more patience.
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, idx * patience_increase)
                    best_validation_epoch = epoch
                    best_validation_loss = this_validation_loss
                    # Evaluate on the test set whenever validation improves.
                    test_losses = [testAccuracyGivenIndex(currentTestBatch)
                                   for currentTestBatch in range(number_test_images_batches)]
                    test_score = np.mean(test_losses)
                    print("\tEpoch %d, Batch Index: %d / %d, Accuracy On Test Samples: %f" \
                          % (epoch, minibatch_index, number_train_images_batches, (100 - test_score * 100)))
                    if test_score < best_testing_loss:
                        print('\t\tNew Best Test Score\n\t\tSaving Network')
                        best_testing_loss = test_score
                        best_testing_epoch = epoch
                        # Persist the current (best-on-test) parameters.
                        regressor.saveRegressor(parsed.output)
            if patience <= idx:
                done_looping = True
                break
        if parsed.plot:
            # Record full-split accuracies (in percent error) once per epoch.
            print("Collecting Accuracy After Epoch %d" % (epoch))
            trainRes[1].append(np.mean([trainAccuracyGivenIndex(currentTrainBatch) \
                                        for currentTrainBatch in range(number_train_images_batches)]) * 100)
            valRes[1].append(np.mean([valdiationAccuracyGivenIndex(currentValidationBatch) \
                                      for currentValidationBatch in range(number_validation_images_batches)]) * 100)
            testRes[1].append(np.mean([testAccuracyGivenIndex(currentTestBatch) \
                                       for currentTestBatch in range(number_test_images_batches)]) * 100)
            trainRes[0].append(epoch)
            valRes[0].append(epoch)
            testRes[0].append(epoch)

    print('Optimization complete with best test score of %f %%,' % (100 - best_testing_loss * 100.))
    if parsed.plot:
        plotError(trainRes, valRes, testRes, parsed.output, 'error.png')
# Loads the SF-crime training and Kaggle CSVs; the location-clustering helper
# below has been disabled by wrapping it in a (still open) triple-quoted
# string.  NOTE(review): the closing quotes lie beyond this fragment.
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from sklearn import cross_validation
from sklearn.cluster import Birch, KMeans

############################################################################
# read in csv files
import LoadData

# load up files from disk
training_data, kaggle_data = LoadData.load_data()

############################################################################
"""
# cluster locations
def clusterLocations():
    t_location = training_data[['X', 'Y']]
    k_location = kaggle_data[['X', 'Y']]
    clf = KMeans(n_clusters=640)  # 23,104 unique addresses in training set
    clf.fit(t_location)
    training_data['Location'] = clf.predict(t_location)
    kaggle_data['Location'] = clf.predict(k_location)
def main():
    """Segment-by-segment cricket run prediction (HR + non-HR models).

    Starting from CURRENT_SEGMENT, predicts home runs (HR) and non-home runs
    (NHR) for each remaining segment of the given inning, feeding each
    segment's predictions back into the test features for the next one.
    Per-segment predictions are saved as CSV and MAE statistics are appended
    to a summary file.  Python 2 script; gl/pd/np and the model modules are
    project/third-party imports resolved elsewhere in the file.
    """
    #---------------------------PUT THE SEGMENT YOU WANT TO START PREDICTING FROM----------------------
    CURRENT_SEGMENT=1
    inning=1
    #-----------------------------------------------------
    train_data=LoadData.getTraindata(inning)
    test_data=LoadData.getTestData(CURRENT_SEGMENT, inning)
    # Separate model families for home runs and non-home runs.
    HRMods= HRModel.getHRModels(train_data)
    NHRMods= NHRModel.getNHRModels(train_data)
    #Start predicting for each segment
    fileFolder="../Predict EOI/Results/Final PPT/Inn"+str(inning)+"-StartSeg"+str(CURRENT_SEGMENT)+"-HR_NHR-"
    maeFile=fileFolder+"MAE.csv"
    fwriter=open(maeFile,"w")
    fwriter.write("Segment,HR MAE in Segment,NHR MAE in Segment,HR MAE till Segment,NHR MAE till Segment,Total MAE")
    fwriter.write("\n")
    # Seed the running-total prediction columns with zeros (one row per test row).
    df = pd.DataFrame(0, index=np.arange(len(test_data)), columns=['Predicted Total HR till Segment'])
    hr_runs = gl.SFrame(data=df)
    df = pd.DataFrame(0, index=np.arange(len(test_data)), columns=['Predicted Total NHR till Segment'])
    nhr_runs = gl.SFrame(data=df)
    test_data['Predicted Total HR till Segment']=hr_runs['Predicted Total HR till Segment']
    test_data['Predicted Total NHR till Segment']=nhr_runs['Predicted Total NHR till Segment']
    # Roll forward through segments; each iteration's predictions become
    # features for the next via updateHRFeatures/updateNHRFeatures.
    while CURRENT_SEGMENT<=10:
        print "--------------Segment "+ str(CURRENT_SEGMENT)+" Started-----------------"
        predict_HR= HRModel.getPredictedHomeRun(HRMods, train_data, CURRENT_SEGMENT, test_data)
        predict_NHR= NHRModel.getPredictedNHR(NHRMods, CURRENT_SEGMENT, test_data)
        test_data=updateHRFeatures(test_data, predict_HR)
        test_data=updateNHRFeatures(test_data, predict_NHR)
        #write to FILE
        filename=fileFolder+str(CURRENT_SEGMENT)+".csv"
        # Full column selection kept in its original order for the saved CSVs.
        keylist=['Team','Total Matches Played','Runs Scored','Wickets Lost','Got All Out','Runs Conceded','Opponent Wickets Taken','Opponent All Out','Match Index','Batsman','Player Total Runs','Player Home Runs','Player Non Home Runs','Balls Faced','R0','W0','R1','W1','Current_HR','Current_NHR','Target','Final Runs Made','Extras','Home','Segment','Home Run Hitting Ability','Milestone Reaching Ability','Batting Average','Strike Rate','Matches Played','ClusterID','Predicted Total HR till Segment','Predicted Total NHR till Segment','Actual HR in Segment','Predicted HR in Segment','Actual NHR in Segment','Predicted NHR in Segment','Predicted Total runs till Segment']
        test_data=test_data.select_columns(keylist)
        test_data.save(filename, format='csv')
        # Per-segment and cumulative mean-absolute-error summary line.
        mae_string=getStatistics(train_data, test_data, CURRENT_SEGMENT)
        print ""
        print "MAE = ", mae_string
        fwriter.write(str(CURRENT_SEGMENT)+","+mae_string)
        fwriter.write("\n")
        CURRENT_SEGMENT=CURRENT_SEGMENT+1
    print ""
    print "-----------Prediction Done!---------------"
    return
#!/python import numpy import pandas as pd import matplotlib.pyplot as plt import random import LoadData training_data, validation_data = LoadData.load_data() print("Training examples", len(training_data)) print("Validation examples", len(validation_data)) print(training_data.columns) features = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address' 'X', 'Y'] ################################################################################################################### # map data #################################################################################################################### # http://stackoverflow.com/questions/14329691/covert-latitude-longitude-point-to-a-pixels-x-y-on-mercator-projection def mercator_projection(latitude, longitude): scale = 100.0 x = (longitude + 180.0) * (scale / 360.0) latitude_radians = latitude * numpy.pi/180.0 y3 = numpy.log(numpy.tan(numpy.pi/4.0 + latitude_radians/2.0))
def predict_cnn(nkerns=[20, 40, 60], batch_size=200):
    """Run a pre-trained 3-conv-layer CNN on the held-out test split.

    Loads layer weights from 'weights20' (.mat), rebuilds the network
    symbolically, and prints the mean test error over whole minibatches.
    Python 2 / Theano code; ConvPoolLayer, MultiLayerPerceptron and
    LogisticLayer are project modules.

    NOTE(review): mutable default argument `nkerns=[...]` is shared across
    calls; harmless here since it is never mutated.
    """
    # Load dataset
    datasets = LoadData.load_predict('VisionFeatures/dct12')

    #train_set_x, train_set_y = datasets[0]
    #valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    #n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    # Py2 integer division: remainder examples are dropped.
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Saved weights come back as 2-D arrays; biases are flattened to vectors.
    weights = sio.loadmat('weights20')
    layer0_W = weights['layer0_W']
    layer0_b = weights['layer0_b']
    layer0_b = np.reshape(layer0_b, (layer0_b.shape[1],))
    layer1_W = weights['layer1_W']
    layer1_b = weights['layer1_b']
    layer1_b = np.reshape(layer1_b, (layer1_b.shape[1],))
    layer2_W = weights['layer2_W']
    layer2_b = weights['layer2_b']
    layer2_b = np.reshape(layer2_b, (layer2_b.shape[1],))
    layer3_W = weights['layer3_W']
    layer3_b = weights['layer3_b']
    layer3_b = np.reshape(layer3_b, (layer3_b.shape[1],))
    layer5_W = weights['layer5_W']
    layer5_b = weights['layer5_b']
    layer5_b = np.reshape(layer5_b, (layer5_b.shape[1],))

    # Construct the model
    print '... building the model'
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    rng = np.random.RandomState(1234)

    # Input arrives as rows of 72x88x1 images; move channels to dim 1
    # (batch, channel, height, width) as the conv layers expect.
    layer0_input = x.reshape((batch_size, 72, 88, 1))
    layer0_input = layer0_input.dimshuffle(0, 3, 1, 2)

    layer0 = ConvPoolLayer.ConvPoolLayer(rng, layer0_input, filter_shape=(nkerns[0], 1, 9, 9),
                                         image_shape=(batch_size, 1, 72, 88), W=layer0_W, b=layer0_b)
    layer1 = ConvPoolLayer.ConvPoolLayer(rng, layer0.output, filter_shape=(nkerns[1], nkerns[0], 9, 9),
                                         image_shape=(batch_size, nkerns[0], 32, 40), W=layer1_W, b=layer1_b)
    layer2 = ConvPoolLayer.ConvPoolLayer(rng, layer1.output, filter_shape=(nkerns[2], nkerns[1], 5, 5),
                                         image_shape=(batch_size, nkerns[1], 12, 16), W=layer2_W, b=layer2_b)
    # Fully-connected hidden layer on the flattened conv features, then a
    # 6-way logistic output layer.
    layer3 = MultiLayerPerceptron.HiddenLayer(rng, layer2.output.flatten(2), nkerns[2] * 4 * 6, 600,
                                              W=layer3_W, b=layer3_b, activation=T.tanh)
    layer5 = LogisticLayer.LogisticLayer(layer3.output, 600, 6, W=layer5_W, b=layer5_b)

    # Error rate for the minibatch at `index`.
    predict_model = theano.function(inputs=[index], outputs=[layer5.errors(y)],
                                    givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                            y:test_set_y[index * batch_size: (index+1) * batch_size]})

    prediction_losses = [predict_model(i) for i in xrange(n_test_batches)]
    this_prediction_loss = np.mean(prediction_losses)
    print this_prediction_loss
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600):
    """ Stochastic Gradient Descent for logistic Regression

    Trains a single logistic-regression layer on MNIST with minibatch SGD
    and patience-based early stopping, printing validation/test error as it
    goes.  Python 2 / Theano code; LogisticLayer is a project module.

    :param learning_rate: SGD step size
    :param n_epochs: maximum number of passes over the training set
    :param dataset: path to the pickled, gzipped MNIST file
    :param batch_size: examples per minibatch (remainder examples dropped)
    """
    # Load dataset and create batches
    datasets = LoadData.load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Py2 integer division: partial final batches are discarded.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # Construct the model
    print 'Building the model ... '
    index = T.iscalar('index')
    x = T.dmatrix('x')
    y = T.ivector('y')
    classifier = LogisticLayer.LogisticLayer(x, 28*28, 10)
    cost = classifier.negative_log_likelihood(y)

    # Function to train the model: one plain gradient-descent step per call.
    gW = T.grad(cost, classifier.W)
    gb = T.grad(cost, classifier.b)
    updates = [(classifier.W, classifier.W - learning_rate * gW),
               (classifier.b, classifier.b - learning_rate * gb)]
    train_model = theano.function(inputs=[index], outputs=[cost], updates=updates,
                                  givens={x:train_set_x[index * batch_size: (index + 1) * batch_size],
                                          y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    # Functions to test and validate the model (zero-one error per batch).
    valid_model = theano.function(inputs=[index], outputs=[classifier.errors(y)],
                                  givens={x:valid_set_x[index * batch_size: (index+1) * batch_size],
                                          y:valid_set_y[index * batch_size: (index+1) * batch_size]})
    test_model = theano.function(inputs=[index], outputs=[classifier.errors(y)],
                                 givens={x:test_set_x[index * batch_size: (index+1) * batch_size],
                                         y:test_set_y[index * batch_size: (index+1) * batch_size]})

    # Train the model
    print 'Training the model ...'
    # Early stopping: examine at least `patience` minibatches, extending the
    # budget whenever validation improves by more than improvement_threshold.
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = np.inf
    test_score = 0.
    start_time = time.clock()
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)
            # Global iteration counter across epochs.
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)
                      )
                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    # Report test error only when validation improves.
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best model %f %%') %
                        (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
            if patience <= iter:
                done_looping = True
                break
    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
# Fit the training data to the Survived labels and create the decision trees forest = forest.fit(train_x,train_y) #find training and cv error trainpred = forest.predict(train_x).astype(int) cvpred = forest.predict(cv_x).astype(int) terr = 1-np.sum(trainpred == train_y)/trainpred.shape[0] cverr = 1-np.sum(cvpred == cv_y)/cvpred.shape[0] # Take the same decision trees and run it on the test data output = forest.predict(test_x).astype(int) return terr,cverr,output #load data and seperate into train and cv data_x, data_y, test_x, headings, submission = LoadData.loadcleandata() fraction = 0.66 ###MAKE PREDICTIONS FOR SUBMISSION############### ##nummodels = 100 ##predictions = np.zeros((test_x.shape[0],nummodels)) ##for i in range(nummodels): ## rseed = np.random.randint(1) ## train_x,cv_x,train_y,cv_y = sklearn.cross_validation.train_test_split(data_x,data_y,train_size=int(fraction*data_x.shape[0]),random_state=rseed) ## #select important features using randomized logreg #### rlrtrain_x,rlrcv_x,rlrtest_x = randomlr(train_x,train_y,cv_x,test_x,regp=1,alpha=0.5) #### terr,cverr,testpred = forestit(rlrtrain_x,train_y,rlrcv_x,cv_y,rlrtest_x,n_est=50) ## #train and predict ## terr,cverr,testpred = forestit(train_x,train_y,cv_x,cv_y,test_x,n_est=100) ## predictions[:,i] = testpred
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset="mnist.pkl.gz", nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    NOTE(review): Python 2 / Theano tutorial code (xrange, print statements,
    time.clock); `/=` below is integer division only under Python 2.
    """
    rng = numpy.random.RandomState(23455)

    datasets = LoadData.load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print "... building the model"

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticLayer.LogisticLayer(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print "... training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            # global iteration counter across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print "training @ iter = ", iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0)
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print (
                        ("     epoch %i, minibatch %i/%i, test error of "
                         "best model %f %%")
                        % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0)
                    )

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print ("Optimization complete.")
    print (
        "Best validation score of %f %% obtained at iteration %i, "
        "with test performance %f %%"
        % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file "
        + os.path.split(__file__)[1]
        + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )
# Builds a labelled tweet-sentiment training set: loads positive/negative
# tweet files, equalizes set sizes, strips stopwords/emoticons via
# FormatData.featureReduction, and pairs each tweet with its polarity label
# ('1' = positive).  NOTE(review): the fragment is truncated mid-way through
# the negative-tweet labelling loop.
import FormatData as FD
import LoadData as dataLoader
from random import shuffle

#Get the data files
print("Getting data..")
positiveTweets = dataLoader.readFile('tweets/pos_tweets.txt')
negativeTweets = dataLoader.readFile('tweets/neg_tweets.txt')

#Make tweetsets the same size
positiveSameL, negativeSameL = FD.sameSize(positiveTweets,negativeTweets)

#Apply feature reduction so things like stopwords and emoticons are removed
print("Reducing features..")
reducedPos = []
for sentence in positiveSameL:
    reducedPos.append(FD.featureReduction(sentence))
reducedNeg = []
for sentence in negativeSameL:
    reducedNeg.append(FD.featureReduction(sentence))

# Pair each reduced tweet with its polarity: [tweet, label].
trainData = []
for tweet in reducedPos:
    polarity = []
    polarity.append(tweet)
    polarity.append('1')
    trainData.append(polarity)
for tweet in reducedNeg:
    polarity = []
def prepData():
    """Engineer SF-crime features and dump libsvm train/test/kaggle files.

    Loads the raw training and Kaggle frames via the project LoadData module,
    derives date parts, day-of-week / category / district id numbers, splits
    the training frame into disjoint train and test pieces, and writes
    'train.svm', 'test.svm' and 'kaggle.svm' with dump_svmlight_file.

    Returns None; all output is file side effects.

    Fixes over the previous revision:
    - features_in had a missing comma ('Address' 'X' concatenated to 'AddressX').
    - the train/test split drew two independent random samples from the same
      frame, so rows could appear in both sets; test rows are now removed
      from the training frame before dumping.
    """
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()

    # Raw column names of the loaded CSVs (informational).
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
                   'Resolution', 'Address', 'X', 'Y']

    # break dates into month, day, year, day of week, hour
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long(x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year)
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month)
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day)
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour)
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute)

    kaggle_data['Year'] = (pd.DatetimeIndex(kaggle_data['Dates']).year)
    kaggle_data['Month'] = (pd.DatetimeIndex(kaggle_data['Dates']).month)
    kaggle_data['Day'] = (pd.DatetimeIndex(kaggle_data['Dates']).day)
    kaggle_data['Hour'] = (pd.DatetimeIndex(kaggle_data['Dates']).hour)
    kaggle_data['Minute'] = (pd.DatetimeIndex(kaggle_data['Dates']).minute)

    # cast date as unix time (coarse resolution: nanoseconds / 1e10)
    training_data['UnixTime'] = (pd.DatetimeIndex(training_data['Dates'])).astype(np.int64) / 10000000000
    kaggle_data['UnixTime'] = (pd.DatetimeIndex(kaggle_data['Dates'])).astype(np.int64) / 10000000000

    # day of week to number (Sunday=0 ... Saturday=6)
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        # Map a weekday name to its index in sorted_days.
        return sorted_days.index(d)

    training_data['DayNumber'] = (training_data['DayOfWeek'].apply(dayOfWeekNumber))
    kaggle_data['DayNumber'] = (kaggle_data['DayOfWeek'].apply(dayOfWeekNumber))

    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        # Map a category name to its alphabetical index.
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)

    # no categories for validation data, that's what we're trying to figure out
    # add output array for validation set just for convience
    kaggle_data['CategoryNumber'] = 0
    print("min/max category", min(training_data['CategoryNumber']), max(training_data['CategoryNumber']))

    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()

    def districtNumber(district):
        # Map a police-district name to its alphabetical index.
        return sorted_districts.index(district)

    training_data['DistrictNumber'] = (training_data['PdDistrict'].apply(districtNumber))
    kaggle_data['DistrictNumber'] = (kaggle_data['PdDistrict'].apply(districtNumber))

    # split inputs from outputs
    features = ['Year', 'Month', 'Day', 'Hour', 'X', 'Y', 'DayNumber', 'DistrictNumber', 'CategoryNumber']
    training_data = training_data[features]
    print("pre split ", len(training_data))

    # split training and testing: draw 20% as the test set, then remove those
    # rows from the training frame so the two sets are disjoint (previously
    # both sets were sampled independently and could share rows).
    testing_data = training_data.sample(frac=0.2, replace=False)
    training_data = training_data.drop(testing_data.index)
    print("post split", len(training_data))
    print("test", len(testing_data))

    # columns 0..7 are inputs, column 8 (CategoryNumber) is the label
    data = np.array(training_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'train.svm')

    data = np.array(testing_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'test.svm')

    # kaggle labels are the placeholder zeros set above
    kaggle_data = kaggle_data[features]
    data = np.array(kaggle_data)
    x = data[:, 0:8]
    y = data[:, 8]
    dump_svmlight_file(x, y, 'kaggle.svm')

    # sanity check data
    print(training_data.head())
    print(x[0])
    print(y[0])