def clean_household(cnxn):
    """Load the household table, normalise its State column and attach FIPS codes.

    Args:
        cnxn: connection object handed straight to ``load_data``.

    Returns:
        The household frame left-joined with ``FipsstateMap`` on ``State``.
    """
    household_query = "select distinct HouseholdID,CountyCode,FTract,BlockGroup,City,UPPER(replace(State,'[^\W ]','')) as State,PostalCode from Household_20200501"
    print("Query to get the hh data in progress....")
    households = load_data(cnxn, household_query)
    # Upper-casing of State is already done inside the query.
    print("Distinct hh:", len(set(households['HouseholdID'].values)))

    # Map full state names to state codes (slow; candidate for a merge later).
    state_lookup = map_state_to_Code(revert=True)
    households = households.replace({"State": state_lookup})

    # Three households are hard-coded to move from OH to KY. Their rows are:
    #   38663830 21 21015 21015980100 Cincinnati OH 45275
    #   49727069 21 21015 21015980100 Cincinnati OH 45275
    #   52218051 21 21015 21015980100 Cincinnati OH 45275
    kentucky_ids = [38663830, 49727069, 52218051]
    print("Before substitute: ",
          households.loc[households['HouseholdID'].isin(kentucky_ids)])
    households.loc[households['HouseholdID'].isin(kentucky_ids), 'State'] = 'KY'
    print("After substitute: ",
          households.loc[households['HouseholdID'].isin(kentucky_ids)])

    # Attach the FIPS state code for every household.
    fips = load_data(cnxn, "select Fipsstatecode,State from FipsstateMap")
    print("Joining hh with fipsstatecode")
    joined = pd.merge(households, fips,
                      left_on='State', right_on='State', how='left')
    print("Distinct hhfips hh:", len(set(joined['HouseholdID'].values)))
    return joined
def comb_hhdata_org(cnxn, addgenre=False, addsec_no=False):
    """Combine TRGMap with Organization, optionally adding genre and sec_no.

    Args:
        cnxn: connection object handed straight to ``load_data``.
        addgenre: when true, left-join one genre per organisation (the first
            ``Genre`` in ascending order) as column ``TRG_Genre``.
        addsec_no: when true, left-join ``sec_no`` from ``orgmap`` by matching
            ``OrgID`` against ``TRGI``; the helper ``TRGI`` column is dropped.

    Returns:
        The merged frame.
    """
    trgmap = load_data(cnxn, "select * from TRGMap")
    print("TRG MAP orgids: ", len(set(trgmap['OrgID'].values)))

    sqlOrg = "SELECT OrgID,AnnualRevenue,AnnualRevenueYear,PostalCode from Organization"
    organization = load_data(cnxn, sqlOrg)
    # Each diagnostic now counts the frame it names; previously all three
    # counted trgmap, so the numbers could never differ.
    print("organization orgids: ", len(set(organization['OrgID'].values)))

    trg_org = pd.merge(trgmap, organization,
                       left_on='OrgID', right_on='OrgID', how='inner')
    print("trg_org orgids: ", len(set(trg_org['OrgID'].values)))

    if addgenre:
        print("add Org Genre")
        sqlGen = "select distinct OrgID, first_value(Genre) over(partition by OrgId order by Genre ASC) as TRG_Genre from OrgGenre"
        orggen = load_data(cnxn, sqlGen)
        print("Genre available for Orgs: ", len(set(orggen['OrgID'].values)))
        trg_org = pd.merge(trg_org, orggen,
                           left_on='OrgID', right_on='OrgID', how='left')

    if addsec_no:
        print("add sec_no from Orgmap")
        sqlorgmap = "select distinct cast(TRGI as int) as TRGI, sec_no from orgmap where TRGI is not NULL"
        orgmap = load_data(cnxn, sqlorgmap)
        print("TRGI sec_no given for ", len(set(orgmap['TRGI'].values)))
        trg_org = pd.merge(trg_org, orgmap,
                           left_on='OrgID', right_on='TRGI', how='left')
        # TRGI only served as the join key.
        trg_org.drop('TRGI', axis=1, inplace=True)

    print("Shape of final TRG ORG integ: ", trg_org.shape)
    return trg_org
def model_train_validation(ins_file, oos_file, classifier, var_list_filename, result_dir, output_suffix): """ train model evaluate on the train and validation data evaluate the model performance on the train and validation data """ #################### Load train and validation data #################### print 'Loading data for modeling starts ...' t0 = time.time() target_name = 'target' X, y = load_data(ins_file, var_list_filename, target_name) Xv, yv = load_data(oos_file, var_list_filename, target_name) print "Loading data done, taking ", time.time() - t0, "secs" # Train Model print '\nModel training starts...' t0 = time.time() model = classifier model.fit(X, y) print "Model training done, taking ", time.time() - t0, "secs" pickle.dump(model, open(result_dir + "model.p", 'wb')) # save model to disk # Predict Train y_pred = model.predict(X) p_pred = model.predict_proba(X) p_pred = p_pred[:, 1] # Predict Validation yv_pred = model.predict(Xv) pv_pred = model.predict_proba(Xv) pv_pred = pv_pred[:, 1] # Performance Evaluation: Train and Validation performance_eval_train_validation(y, p_pred, yv, pv_pred, result_dir, output_suffix) #################### Random Forest Feature Importance ###################### try: varlist_file = open(var_list_filename, 'rU') varlist_csv = csv.reader(varlist_file) var_list = [] for row in varlist_csv: var_list.append(row[0]) out_feat_import = open( result_dir + 'feature_import_' + str(output_suffix) + '.csv', 'wb') feat_import_csv = csv.writer(out_feat_import) var_import = zip(range(len(var_list)), var_list, model.feature_importances_) feat_import_csv.writerow(['var seq num', 'var name', 'importance']) print "RandomForest classifier, var importance was output" for row in var_import: feat_import_csv.writerow(row) except: print "Not RandomForest classifier, var importance not created"
def main(user_location, destination):
    """
    Returns polyline of safest route

    :param user_location: list or tuple of latitude, longitude coordinates (lat, lng)
    :param destination: list or tuple of latitude, longitude coordinates (lat, lng)
    :return: the ``overview_polyline`` points of the best-scoring route
    """
    import os  # local import so the block does not depend on file-level imports

    utcrime = load_data()
    crime_weights = generate_crime_weights(utcrime.df)
    subregion_weights = generate_subregion_weights(utcrime, crime_weights)

    # SECURITY: a credential is committed in source. It remains only as a
    # backward-compatible fallback; set GOOGLE_MAPS_API_KEY to override it,
    # and rotate/remove the literal.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY',
                             'AIzaSyDmKbjLrlWQowWVzzTy_AAWsFQO4Hdbeko')
    cli = client.Client(key=api_key)

    gmap_routes = route_generator(user_location, destination, cli)
    point_routes = convert_to_point_routes(gmap_routes)
    scores = score_routes(point_routes, gmap_routes, subregion_weights)
    safe_route = safest_route(scores)
    route = gmap_routes[safe_route]

    if isinstance(route, dict):
        # alternative route, indexes differently than waypoint route
        return route['overview_polyline']['points']
    # waypoint route, indexes differently than alternative route
    return route[0]['overview_polyline']['points']
def my_predict():
    """Show every test image titled with its top predicted class and confidence.

    Loads ``model.h5``, iterates the generator returned by
    ``load_data(test_dir, 1)`` and, for each batch, plots the first image with
    a title of the form ``'<pct>% is a <label>.'``.
    """
    model = load_model("model.h5")
    test_generator = load_data(test_dir, 1)
    for test_image, test_labels in test_generator:
        prediction = model.predict(test_image)
        # Determine the class. Indexing by the winning position replaces five
        # identical branches and also covers label sets larger than five, where
        # `label` used to be unbound or left over from the previous image.
        # NOTE(review): assumes one image per batch, so the flat argmax equals
        # the class index — confirm what load_data's second argument means.
        max_index = np.argmax(prediction)
        label = ('%.2f%% ' % (prediction[0][max_index] * 100)
                 + 'is a ' + str(my_labels[max_index]) + '.')
        plt.imshow(test_image[0])
        plt.title(label)
        plt.show()
def load_test_dataset(dataset_dir, tokenizer):
    """Load the test set and return its tokenized form together with its labels.

    Args:
        dataset_dir: location passed to ``load_data``.
        tokenizer: tokenizer passed to ``tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)`` where the labels are the ``.values``
        of the ``'label'`` entry of the loaded data.
    """
    frame = load_data(dataset_dir)
    labels = frame['label'].values
    # Tokenize after the labels have been pulled out, as before.
    encoded = tokenized_dataset(frame, tokenizer)
    return encoded, labels
def train():
    """Train a 42-label BERT sequence classifier on the training TSV.

    The model name comes from ``args.model_name``. No evaluation set is used;
    checkpoints go to ``./results/<model name>`` and logs to ``./logs``.
    """
    # NOTE(review): `args` is not created here (the unused ArgumentParser that
    # used to be built in this function never parsed anything), so it must be
    # supplied at module level — confirm against the caller.
    MODEL_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load the dataset and pull out its labels.
    train_dataset = load_data("../input/data/train/train.tsv")
    train_label = train_dataset['label'].values

    # Tokenize, then wrap for PyTorch.
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Model hyperparameters. The model is constructed from the config object.
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = BertForSequenceClassification(bert_config)
    model.to(device)

    # Many more options exist than the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    # Evaluation-related options are intentionally left unset.
    training_args = TrainingArguments(
        output_dir=f'./results/{MODEL_NAME}',  # output directory
        save_total_limit=3,                    # number of saved checkpoints kept
        save_steps=500,                        # checkpoint interval in steps
        num_train_epochs=5,                    # total number of training epochs
        learning_rate=5e-5,                    # learning rate
        per_device_train_batch_size=16,        # batch size per device during training
        warmup_steps=500,                      # warmup steps for the LR scheduler
        weight_decay=0.01,                     # strength of weight decay
        logging_dir='./logs',                  # directory for storing logs
        logging_steps=100,                     # logging interval in steps
    )
    trainer = Trainer(
        model=model,                     # the instantiated Transformers model
        args=training_args,              # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
    )

    # train model
    trainer.train()
def test_dA(learning_rate=0.1, training_epochs=5, batch_size=1, output_folder='dA_plots'): datasets = load_data() train_set_c, train_set_x = datasets[0] n_train_batches = train_set_c.get_value(borrow=True).shape[0] / batch_size index = T.lscalar() x = T.matrix('x') c = T.matrix('c') if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) rng = numpy.random.RandomState(123) da = denoising_layer( numpy_rng=rng, corrupted_input=c, input=x, n_visible=800 * 600, n_hidden=200 ) cost, updates = da.get_cost_updates(learning_rate) train_da = theano.function( [index], cost, updates=updates, givens={ c: train_set_c[index * batch_size: (index + 1) * batch_size], x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) start_time = timeit.default_timer() for epoch in xrange(training_epochs): c = [] for batch_index in xrange(n_train_batches): j = train_da(batch_index) c.append(j) print 'Training epoch %d, cost ' % epoch, numpy.mean(c) end_time = timeit.default_timer() training_time = (end_time - start_time) print "training time: " + str(training_time) image = Image.fromarray(tile_raster_images( X=da.W.get_value(borrow=True).T, img_shape=(600, 800), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters') os.chdir('../')
def run_ML_leave_one_subject_out(config, filename, question, clf, cols,
                                 return_arr=None, return_index=-1):
    """Score ``clf`` and two dummy baselines with leave-one-subject-out splits.

    Args:
        config: mapping whose ``'DATA_DIRECTORY'`` entry is the working directory.
        filename, cols, question: forwarded to ``load_data``.
        clf: estimator exposing ``fit`` and ``score``.
        return_arr: container receiving the result when ``return_index`` is set.
        return_index: ``-1`` to return the result; otherwise the slot of
            ``return_arr`` to store it in (nothing is returned then).

    Returns:
        ``(score, score_dummy_mf, score_dummy_sf)``, each the mean over all
        splits rounded to two decimals, when ``return_index == -1``.
    """
    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')

    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0
    # `strategy` is passed by keyword: current scikit-learn releases no longer
    # accept it positionally.
    dummy_clf_mf = DummyClassifier(strategy='most_frequent')
    dummy_clf_sf = DummyClassifier(strategy='stratified')

    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        score += clf.score(testing_X, testing_y).mean()
        score_dummy_mf += dummy_clf_mf.score(testing_X, testing_y).mean()
        score_dummy_sf += dummy_clf_sf.score(testing_X, testing_y).mean()

    n_splits = len(data)
    result = (
        round(float(score / n_splits), 2),
        round(float(score_dummy_mf / n_splits), 2),
        round(float(score_dummy_sf / n_splits), 2),
    )

    if return_index == -1:
        return result
    # Caller supplied a slot (e.g. for collecting results from workers).
    return_arr[return_index] = result
def run(args):
    """Read the optional YAML config, build the training objects and train.

    Args:
        args: namespace providing ``config``, ``output``,
            ``disable_progressbar`` and ``start_from_scratch``.
    """
    hyper_params = {}
    if args.config is not None:
        with open(args.config, 'r') as stream:
            hyper_params = load(stream, Loader=yaml.FullLoader)

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    # Must happen as early as possible, otherwise mlflow will not log under
    # the proper experiment name.
    if 'exp_name' in hyper_params:
        mlflow.set_experiment(hyper_params['exp_name'])

    # __TODO__ change the hparam that are used from the training algorithm
    # (and NOT the model - these will be specified in the model itself)
    required_hp = ['batch_size', 'optimizer', 'patience', 'architecture',
                   'max_epoch', 'exp_name']
    check_and_log_hp(required_hp, hyper_params)

    train_loader, dev_loader = load_data(args, hyper_params)
    model = load_model(hyper_params)
    optimizer = load_optimizer(hyper_params, model)
    loss_fun = load_loss(hyper_params)

    show_progress = not args.disable_progressbar
    train(model, optimizer, loss_fun, train_loader, dev_loader,
          hyper_params['patience'], args.output,
          max_epoch=hyper_params['max_epoch'],
          use_progress_bar=show_progress,
          start_from_scratch=args.start_from_scratch)
def callingBatchGD(player):
    '''
    Load a player's data, fit batch gradient descent on the train split,
    predict on the test split and report the RMSE.

    Factored out because the same sequence is needed in several places.

    :param player: string name of player, input from keyboard
    :return: RMSE metrics for given player
    '''
    csv_path = 'dataset/' + player + '.csv'
    train_data, test_data = load_data(csv_path)

    train_x, train_y = collect_attributes(train_data)
    # B, recommended_alpha and recommended_iteration_number are defined
    # outside this function.
    fitted_b, _cost_history = batch_gradient_descent(
        train_x, train_y, B, recommended_alpha, recommended_iteration_number)

    test_x, test_y = collect_attributes(test_data)
    predictions = test_x.dot(fitted_b)

    rmse = calculate_rmse(np.array(predictions), test_y)
    print("\nRMSE for player " + player + " is: " + str(rmse) + "\n")
    return rmse
def my_predict():
    """Predict labels for one batch of test images, print them and plot the batch.

    Loads ``model.pt``, takes the first batch from the test loader, prints the
    predicted and the real labels, and displays the batch as an image grid.
    """
    on_gpu = torch.cuda.is_available()
    loader = load_data(data_dir_test, image_size=image_size,
                       batch_size=batch_size)
    X_test, y_test = next(iter(loader))

    model = torch.load('model.pt')
    if on_gpu:
        model = model.cuda()
    images = Variable(X_test.cuda() if on_gpu else X_test)

    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    print("Predict Label is: ", predicted.data)
    print("Real Label is :", y_test.data)

    grid = torchvision.utils.make_grid(X_test)
    # Convert to numpy, then transpose the axes with order [1, 2, 0].
    grid = grid.numpy().transpose([1, 2, 0])
    plt.imshow(grid)
    plt.show()
def load_test_dataset(root, tokenizer):
    """Load ``<root>/input/data/test/test.tsv`` and tokenize it.

    Args:
        root: base directory; also passed to ``load_data`` as second argument.
        tokenizer: tokenizer passed to ``tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)``.
    """
    test_path = root + "/input/data/test/test.tsv"
    frame = load_data(test_path, root)
    labels = frame['label'].values
    encoded = tokenized_dataset(frame, tokenizer)
    return encoded, labels
def predict(): # load the saved model classifier = cPickle.load(open('best_model.pkl')) # compile a predictor function predict_model = theano.function( inputs=[classifier.input], outputs=classifier.y_pred ) # We can test it on some examples from test test dataset='mnist.pkl.gz' datasets = load_data(dataset) test_set_x, test_set_y = datasets[2] test_set_x = test_set_x.get_value() # test_set_y = test_set_y.get_value() predicted_values = predict_model(test_set_x[:1000]) print ("Predicted values for the examples in test set:") print predicted_values error_num = 0 for x in range(1000): if test_set_y.eval()[x] != predicted_values[x]: error_num += 1 # print '%d: %d & %d' %(x, test_set_y.eval()[x], predicted_values[x]) print 'error num: %d, test precision: %f %%' %(error_num, (1.0*error_num/1000)*100)
def train(starting_epoch=0):
    """Create the model and train it in chunks of ``JUMP`` epochs.

    Args:
        starting_epoch: epoch to start from (integer). When non-zero, weights
            are first restored from ``model_weights.pkl``.

    Returns:
        None
    """
    model = create_model()
    if starting_epoch:
        # Resuming: pick up the previously stored weights.
        model = load_model_weight(model, 'model_weights.pkl')

    (x_train, y_train, x_valid, y_valid, x_test, y_test) = load_data()
    print ("Training Data Shape: ", x_train.shape)
    print ("Testing Data Shape: ", x_test.shape)

    # JUMP is a module-level step size. The paper trained to 300000.
    for first_epoch in range(starting_epoch, 300000, JUMP):
        model = trainer(model, x_train, y_train, x_valid, y_valid,
                        initial_epoch=first_epoch)
        # Saving the weights after each chunk is currently disabled.
        evaluate(model=model, x_test=x_test, y_test=y_test)
def run(args, hyper_params):
    """Setup and run the dataloaders, training loops, etc.

    Args:
        args (list): arguments passed from the cli
        hyper_params (dict): hyper parameters from the config file
    """
    log_exp_details(os.path.realpath(__file__), args)

    output_dir = args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # __TODO__ change the hparam that are used from the training algorithm
    # (and NOT the model - these will be specified in the model itself)
    logger.info('List of hyper-parameters:')
    required_hp = ['batch_size', 'optimizer', 'patience', 'architecture',
                   'max_epoch', 'exp_name']
    check_and_log_hp(required_hp, hyper_params)

    train_loader, dev_loader = load_data(args, hyper_params)
    model = load_model(hyper_params)
    optimizer = load_optimizer(hyper_params, model)
    loss_fun = load_loss(hyper_params)

    train(model, optimizer, loss_fun, train_loader, dev_loader,
          hyper_params['patience'], args.output,
          max_epoch=hyper_params['max_epoch'],
          use_progress_bar=not args.disable_progressbar,
          start_from_scratch=args.start_from_scratch)
def random_forest_cv(folds):
    """Cross-validate a 500-tree random forest for each fold count in *folds*.

    Args:
        folds: iterable of ``cv`` values handed to ``cross_val_score``.
    """
    X, Y = load_data()

    # Create train and test data
    train_x, test_x = get_train_test_data(X)
    shape_msg_x = 'Train and test data for X matrix created with dimensions {} and {} respectively'
    print(shape_msg_x.format(train_x.shape, test_x.shape))

    train_y, test_y = get_train_test_data(Y)
    shape_msg_y = 'Train and test data for Y matrix created with dimensions {} and {} respectively'
    print(shape_msg_y.format(train_y.shape, test_y.shape))

    # The estimator's fit method expects a 1d array rather than a
    # column-vector, so the training labels are reshaped to (n_samples, ).
    train_y = reshape_label_matrix(train_y)

    get_start_time()
    forest = RandomForestClassifier(n_estimators=500)

    # Run cross-validation once per requested fold count.
    for n_folds in folds:
        print('Performing cross validation with {} folds'.format(n_folds))
        fold_scores = cross_val_score(forest, train_x, train_y, cv=n_folds)
        print('The final accuracy scores are {}'.format(fold_scores))
        print('Mean accuracy score for {} folds is {}'.format(
            n_folds, fold_scores.mean()))

    get_stop_time()
def main():
    """Train the LSTM on the climate CSV and save it under a per-column folder.

    The model is saved only when the column name returned by ``load_data`` is
    ``"TT-Avg(℃)"`` or ``"MT-Avg(g)"``; for any other name nothing is saved.
    """
    csv_file = "TY_climate_2017_2018.csv"
    tensorboard_call_back = TensorBoard(log_dir="./log", histogram_freq=1,
                                        write_grads=True)

    # column_name is one of: TT-Avg(℃), MT-Avg(g)
    train_data, test_data, column_name = load_data(csv_file)
    train_data, _ = data_preprocessing(train_data)
    # The test split is preprocessed but not used further in this function.
    test_data, _ = data_preprocessing(test_data)

    # Build the training arrays and give both a trailing (1, 1) shape.
    x_train, y_train = create_dataset(train_data)
    n_inputs = x_train.shape[0]
    x_train = x_train.reshape(n_inputs, 1, 1)
    n_targets = y_train.shape[0]
    y_train = y_train.reshape(n_targets, 1, 1)

    # load model
    lstm_model = training_model()
    print(lstm_model.summary())

    # start training
    lstm_model.compile(loss="mean_squared_error", optimizer="adam")
    lstm_model.fit(x_train, y_train, epochs=50, batch_size=32,
                   callbacks=[tensorboard_call_back])

    # save model
    if column_name == "TT-Avg(℃)":
        print(column_name)
        lstm_model.save(f"saved_models_tt_avg/{build_name(column_name)}")
    elif column_name == "MT-Avg(g)":
        print(column_name)
        lstm_model.save(f"saved_models_mt_avg/{build_name(column_name)}")
def Sa_train_test(model):
    """Build, train and test the Sa model, saving weights and predicted images.

    Args:
        model: ignored — it is replaced by a fresh ``Sa_build_model()`` result
            before use.

    Returns:
        The trained model.
    """
    # process data
    data = load_data(download())
    # shape=(, 10, 64, 64), just for test. Modify (2950, 3000) to about 10,000
    # in practice, such as (9800, 10000)
    train_x, train_y, test_x, test_y = split(data, 2950, 3000)

    # build model
    model = Sa_build_model()
    epochs = 80
    model.fit(
        train_x,
        train_y,
        batch_size=8,
        epochs=epochs,
        verbose=2,
        validation_split=0.1,
    )

    # save trained weight
    model.save_weights('sa_saved_weight/')

    # make prediction
    predicted = model.predict(test_x)
    # Drop axis 4 ([64, 64, 1] img -> [64, 64] img); otherwise plotting may
    # raise an error. Resulting shape = [batch_size, 10, 64, 64]
    predicted = np.squeeze(predicted, 4)

    # save result as photos (saving test_y as reference images is disabled)
    save_as_image(predicted, -1)
    return model
def load_test_dataset(dataset_dir, tokenizer):
    """Return the tokenized test set and its labels.

    Args:
        dataset_dir: location passed to ``load_data``.
        tokenizer: tokenizer passed to ``tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)``.
    """
    loaded = load_data(dataset_dir)
    label_values = loaded['label'].values
    # tokenize dataset
    tokens = tokenized_dataset(loaded, tokenizer)
    return tokens, label_values
def run_knn():
    """Fit a 2-nearest-neighbour classifier and print its confusion matrix."""
    data = load_data()
    features, labels = data[0], data[1]

    # Create train and test data
    train_x, test_x = get_train_test_data(features)
    shape_msg_x = 'Train and test data for X matrix created with dimensions {} and {} respectively'
    print(shape_msg_x.format(train_x.shape, test_x.shape))

    train_y, test_y = get_train_test_data(labels)
    shape_msg_y = 'Train and test data for Y matrix created with dimensions {} and {} respectively'
    print(shape_msg_y.format(train_y.shape, test_y.shape))

    # The estimator's fit method expects a 1d array rather than a
    # column-vector, so the training labels are reshaped to (n_samples, ).
    train_y = reshape_label_matrix(train_y)

    get_start_time()
    model = neighbors.KNeighborsClassifier(n_neighbors=2)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)

    matrix = confusion_matrix(test_y, predicted)
    print('Confusion matrix : \n {}'.format(matrix))
    get_stop_time()
def generate_encodings(dataset):
    """Create encodings of *dataset* for every configured pretrained model.

    Args:
        dataset: key into ``TRAIN_PATHS`` and ``TEST_PATHS``.
    """
    data = load_data(TRAIN_PATHS[dataset], TEST_PATHS[dataset])
    for model_config in GENSIM_PRETRAINED_MODELS:
        create_encodings(dataset, model_config, data)
def __init__(self, source_vocab_size, target_vocab_size, SIGMA, LAMBDA, is_training):
    """Store the settings, set up the input tensors and create the model.

    When training, the inputs are shuffled batches drawn from the training
    file; otherwise they are int32 placeholders of width ``hp.maxlen``.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.SIGMA = SIGMA
    self.LAMBDA = LAMBDA
    self.is_training = is_training

    if not self.is_training:
        # inference
        self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
        self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
    else:
        sources, targets, _, _ = load_data(hp.train_file, hp.maxlen)

        # calc total batch count
        self.num_batch = len(sources) // hp.batch_size

        # Convert to tensor
        sources = tf.convert_to_tensor(sources, tf.int32)
        targets = tf.convert_to_tensor(targets, tf.int32)

        # Create Queues
        input_queues = tf.train.slice_input_producer([sources, targets])

        # create batch queues
        self.x, self.y = tf.train.shuffle_batch(
            input_queues,
            num_threads=8,
            batch_size=hp.batch_size,
            capacity=hp.batch_size*64,
            min_after_dequeue=hp.batch_size*32,
            allow_smaller_final_batch=False)

    self._creat_model()
def main(argv):
    """Compare the rank-k SVD of a dataset with that of its perturbed version.

    Args:
        argv: argument list whose first element is the dataset name.

    Prints the RMSE between both decompositions; returns ``None``.
    """
    if len(argv) < 1:
        print("Please mention a dataset (movielens or jester)")
        return

    k = 5
    num_perturbs = 20
    dataset_name = argv[0]
    # These datasets are handled in masked mode.
    mask = dataset_name in ("movielens", "jester", "modcloth")

    adj_matrix = load_data(dataset_name)
    perturbed_matrix = perturb_matrix(adj_matrix, num_perturbs, k, mask)

    # In masked mode NaN entries are counted, otherwise zero entries.
    if mask:
        omega_c = np.count_nonzero(np.isnan(adj_matrix))
    else:
        omega_c = np.count_nonzero(adj_matrix == 0)

    orig_svd = svd(adj_matrix, k, mask)
    perturbed_svd = svd(perturbed_matrix, k, mask)
    error = evaluate_error(orig_svd, perturbed_svd, adj_matrix, omega_c)
    print("RMSE Error:", error)
def test_dA(learning_rate=0.1, training_epochs=5, batch_size=1, output_folder='dA_plots'): datasets = load_data() train_set_c, train_set_x = datasets[0] n_train_batches = train_set_c.get_value(borrow=True).shape[0] / batch_size index = T.lscalar() x = T.matrix('x') c = T.matrix('c') if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) rng = numpy.random.RandomState(123) da = denoising_layer(numpy_rng=rng, corrupted_input=c, input=x, n_visible=800 * 600, n_hidden=200) cost, updates = da.get_cost_updates(learning_rate) train_da = theano.function( [index], cost, updates=updates, givens={ c: train_set_c[index * batch_size:(index + 1) * batch_size], x: train_set_x[index * batch_size:(index + 1) * batch_size] }) start_time = timeit.default_timer() for epoch in xrange(training_epochs): c = [] for batch_index in xrange(n_train_batches): j = train_da(batch_index) c.append(j) print 'Training epoch %d, cost ' % epoch, numpy.mean(c) end_time = timeit.default_timer() training_time = (end_time - start_time) print "training time: " + str(training_time) image = Image.fromarray( tile_raster_images(X=da.W.get_value(borrow=True).T, img_shape=(600, 800), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('filters') os.chdir('../')
def main():
    """Restore the stored weights and evaluate the model on the test split."""
    model = load_model_weight(create_model(), "model_weights.pkl")
    x_train, y_train, _, _, x_test, y_test = load_data()
    for split_features in (x_train, x_test):
        print(split_features.shape)
    evaluate(model, x_test, y_test)
def __init__(self, quote):
    ''' hand in quote as a string '''
    # load_data yields, in order: neg_data, pos_data, pos_prob, neg_prob.
    loaded = load_data(quote)
    (self.neg_data, self.pos_data,
     self.pos_prob, self.neg_prob) = loaded
    self.news_data = []
    self.news_features = []
    self.quote = quote
    self.date = ''
def load_test_dataset(dataset_dir, tokenizer, model_arc='Electra'):
    """Load the test set and tokenize it with the architecture's tokenizer helper.

    Args:
        dataset_dir: location passed to ``load_data``.
        tokenizer: tokenizer passed to the tokenizing helper.
        model_arc: ``'Electra'`` selects ``tokenized_dataset``; any other
            value selects ``roberta_tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)``.
    """
    frame = load_data(dataset_dir)
    labels = frame['label'].values
    # tokenizing dataset
    tokenize = (tokenized_dataset if model_arc == 'Electra'
                else roberta_tokenized_dataset)
    return tokenize(frame, tokenizer), labels
def prep_data(file_path, percentage_train):
    """Split the data and turn both splits into model inputs and targets.

    Args:
        file_path: data file passed to ``load_data``.
        percentage_train: split parameter passed to ``load_data``.

    Returns:
        ``(train_X, test_X, train_y, test_y)``.
    """
    train_rows, test_rows = load_data(file_path, percentage_train)
    glove = load_word_vectors("../data/glove.6B.100d.txt", GLOVE_DIMENSION)

    train_X, train_y = process_rows(train_rows, glove)
    test_X, test_y = process_rows(test_rows, glove)

    # Debug output: the training data itself plus its outer and inner lengths.
    for part in (train_X, train_y):
        print(part, len(part), len(part[0]))
    return train_X, test_X, train_y, test_y
def __init__(self):
    """Load the exercise workbook, run the analysis and print the elapsed time."""
    started = time.time()
    self.ld = load_data('Data/DQHI Data Scientist Exercise Data.xlsx')
    self.inap_sno = []
    self.inap_orderno = []
    self.run_analysis()
    # Statement order is kept as-is: these three lists are (re)set only after
    # run_analysis() has finished.
    self.ldata = []
    self.sdata = []
    self.cdata = []
    elapsed = time.time() - started
    print("Running time : " + str(elapsed))
def main():
    """Download the data, train the model and run prediction on the test split."""
    print('convSLTM training')
    data = load_data(download())
    # shape(, 10, 64, 64), just for test. Should be close to 10000 in
    # practice, such as (9800, 10000)
    train_x, train_y, test_x, test_y = split(data, 2950, 3000)
    model = train(build_model(), train_x, train_y)
    predict(model, test_x, test_y)
def load_test_dataset(dataset_dir, tokenizer):
    """Load the test set and tokenize it with help of a Pororo NER tagger.

    Args:
        dataset_dir: location passed to ``load_data``.
        tokenizer: tokenizer passed to ``tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)``.
    """
    frame = load_data(dataset_dir)
    labels = frame['label'].values
    # pororo ner
    ner_tagger = Pororo(task="ner", lang="ko")
    # tokenizing dataset
    encoded = tokenized_dataset(frame, tokenizer, ner_tagger)
    return encoded, labels
def load_test_dataset(dataset_dir, tokenizer):
    """Load the test set (``dev=False``), tokenize it and echo one sample.

    Args:
        dataset_dir: location passed to ``load_data``.
        tokenizer: tokenizer passed to ``tokenized_dataset``.

    Returns:
        ``(tokenized_test, test_label)``.
    """
    frame = load_data(dataset_dir, dev=False)
    labels = frame['label'].values
    # tokenizing dataset
    encoded = tokenized_dataset(frame, tokenizer)
    # Show the tokens of the entry at index 2 as a sanity check.
    sample_ids = encoded['input_ids'][2]
    print(tokenizer.convert_ids_to_tokens(sample_ids))
    return encoded, labels
def callingSVR(player):
    """Fit the SVR helper for *player* and return the RMSE on the test split.

    Args:
        player: player name; the data is read from ``dataset/<player>.csv``.

    Returns:
        The RMSE between the predictions and the test targets.
    """
    csv_path = 'dataset/' + player + '.csv'
    train_data, test_data = load_data(csv_path)

    train_x, train_y = collect_attributes(train_data)
    test_x, test_y = collect_attributes(test_data)

    predicted = svr(train_x, train_y, test_x)
    print(predicted)

    rmse = calculate_rmse(np.array(predicted), test_y)
    print("\n[SVR] RMSE for player " + player + " is: " + str(rmse) + "\n")
    return rmse
def dt(working_directory, filename, columns, question, name, max_depth=None,
       cross_validation_folds=5, render_tree=False, classify_maybe_as=None):
    """Fit a decision tree and return its cross-validation scores.

    Args:
        working_directory, filename, columns, question, classify_maybe_as:
            forwarded to ``load_data``.
        name: label passed to ``cross_validation`` and ``plot_tree``.
        max_depth: depth limit; a falsy value leaves the tree unrestricted.
        cross_validation_folds: fold count passed to ``cross_validation``.
        render_tree: when true, the fitted tree is plotted.

    Returns:
        Whatever ``cross_validation`` returns.
    """
    data_X, data_y = load_data(working_directory, filename, columns, question,
                               classify_maybe_as=classify_maybe_as)

    classifier = (tree.DecisionTreeClassifier(max_depth=max_depth)
                  if max_depth else tree.DecisionTreeClassifier())
    classifier = classifier.fit(data_X, data_y)

    scores = cross_validation(classifier, data_X, data_y,
                              cross_validation_folds, name)
    if render_tree:
        plot_tree(classifier, name)
    return scores
def askopenfilename(self):
    """Ask for a data file, reload all data from it and refresh the views.

    If the dialog is cancelled nothing is changed: the previously loaded
    data, the tree views and ``self.filename`` are left untouched.
    """
    import os  # local import so the block does not depend on file-level imports

    chosen = tkinter.filedialog.askopenfilename()
    if not chosen:
        # Cancelled: the dialog yields an empty value. Bail out before wiping
        # the current data and before load_data is handed an empty path.
        return
    self.filename = chosen
    print ("open filename : %s" %self.filename)

    # clean all data
    del orig_data[:]
    del group_data[:]
    del rule[:]
    for view in (self.boy_tree, self.girl_tree, self.teacher_tree):
        for item in view.get_children():
            view.delete(item)

    load_data(self.filename, orig_data, rule)

    # The school name is whatever follows the last digit run in the path.
    tmp = self.filename.replace(".xlsx", "")
    match = re.search(r'.*\d+(.*)$', tmp)
    # Without any digit the pattern cannot match; fall back to the bare file
    # name instead of failing on None.
    schoolname = match.group(1) if match else os.path.basename(tmp)

    self.grouping_status.set("讀取"+ schoolname +"新生資料")
    self._status_school_update()
    self._load_boy_data()
    self._load_girl_data()
    self._load_teacher_data()
def train(): (train, y_train, test, y_test) = load_data() pipe = pipeline.Pipeline( [('csp', CSP()), ('chan_var', ChanVar()), ('svm', svm.SVC(kernel='linear'))]) # train model pipe.fit(train, y_train) # make predictions on unseen test data y_pred = pipe.predict(test) print metrics.classification_report(y_test, y_pred)
def svc(working_directory, filename, columns, question, name,
        cross_valudation_folds=5, C=1, kernel='rbf', classify_maybe_as=None):
    """Load the data for an SVC evaluation; the evaluation itself is disabled.

    The function has always returned ``None`` right after loading: everything
    that followed the early return (debug printing, fitting, cross-validation)
    was unreachable and referenced an undefined ``data_X1``. That dead code is
    removed here; the observable behaviour is unchanged.

    Args:
        working_directory, filename, columns, question, classify_maybe_as:
            forwarded to ``load_data``.
        name, cross_valudation_folds, C, kernel: currently unused; kept so
            existing callers keep working.

    Returns:
        None.
    """
    # Still performed so any loading error surfaces exactly as before.
    load_data(working_directory, filename, columns, question,
              classify_maybe_as=classify_maybe_as)
    # TODO(review): re-enable fitting and cross-validation once the expected
    # layout of the loaded data for svm.SVC is settled.
    return None
def load_data(self, batch_size): datasets = load_data() self.train_set_x, self.train_set_y = datasets[0] self.valid_set_x, self.valid_set_y = datasets[1] #self.test_set_x, self.test_set_y = datasets[2] self.n_train_batches = self.train_set_x.get_value(borrow=True).shape[0] / batch_size self.n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0] / batch_size #self.n_test_batches = self.test_set_x.get_value(borrow=True).shape[0] / batch_size self.batch_size = batch_size print 'train_x: ', self.train_set_x.get_value(borrow=True).shape print 'train_y: ', self.train_set_y.shape.eval() print 'valid_x: ', self.valid_set_x.get_value(borrow=True).shape print 'valid_y: ', self.valid_set_y.shape.eval()
# Script: cluster the loaded data with MeanShift after reducing it to two
# principal components.
from load_data import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle

###############################################################################
# Preprocessing data
raw = load_data()
# Standardise, project onto two principal components, then standardise the
# projection again.
data = scale(raw)
reduced_data = PCA(n_components=2).fit_transform(data)
reduced_data = scale(reduced_data)

###############################################################################
# Compute clustering with MeanShift

# The bandwidth is estimated automatically from the reduced data.
bandwidth = estimate_bandwidth(reduced_data)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(reduced_data)

# Per-sample cluster labels, the cluster centres and the distinct labels.
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
def rf(working_directory, filename, columns, question, name,
       cross_validation_folds=5, n_estimators=10, max_depth=5,
       min_samples_split=2, random_state=0, classify_maybe_as=None):
    """Fit a random forest and return its cross-validation scores.

    Args:
        working_directory, filename, columns, question, classify_maybe_as:
            forwarded to ``load_data``.
        name: label passed to ``cross_validation``.
        cross_validation_folds: fold count passed to ``cross_validation``.
        n_estimators, max_depth, min_samples_split, random_state: forwarded
            to ``RandomForestClassifier``.

    Returns:
        Whatever ``cross_validation`` returns.
    """
    data_X, data_y = load_data(working_directory, filename, columns, question,
                               classify_maybe_as=classify_maybe_as)

    forest_options = dict(n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_samples_split=min_samples_split,
                          random_state=random_state)
    forest = RandomForestClassifier(**forest_options)
    forest.fit(data_X, data_y)

    return cross_validation(forest, data_X, data_y,
                            cross_validation_folds, name)
# Script: load the training and test sets and flatten them; a disabled
# parameter sweep for the kernel classifier is kept below for reference.
import numpy as np
from load_data import *
from combined_classifier import combined_classifier
from sklearn.cross_validation import train_test_split
from sklearn import svm
from KernelRidge import *
from kNN import kNN
#from sklearn.kernel_ridge import KernelRidge
#from sklearn.grid_search import GridSearchCV

# Training inputs/labels and unlabelled test inputs.
xtr, ytr = load_data()
xte = load_data_test()
# Both input sets go through the same flatten helper.
xte = flatten(xte)
xtr = flatten(xtr)

### parameter tuning (gaussian kernel + svm) -- disabled
# Sweeps lmb over [0.3, 0.5, 0.7] and sigma over [0.9, 1.0, 1.1, 1.2] on an
# 80% subset, scoring each pair on a further held-out 20% split.
#xtr1, xtr2, ytr1, ytr2 = train_test_split(xtr, ytr, test_size=0.2)
###find the best lmd and sigma with xtr1 and ytr1
#for i in [0.3,0.5,0.7]:
#    for j in [0.9,1.0,1.1,1.2]:
#        clf = KernelRidge(lmb=i, kernel = 'rbf', sigma=j)
#        clf_combined = combined_classifier(clf)
#        x_train, x_test, y_train, y_test = train_test_split(xtr1, ytr1, test_size=0.2)
#        clf_combined.fit(x_train,y_train)
#        scores = clf_combined.score(x_test,y_test)
#        print 'lmd:',i
#        print 'sigma:',j
#        print scores
def test_SdA(finetune_lr=0.1, pretraining_epochs=1,
             pretrain_lr=0.001, training_epochs=1,
             b_patch_filename = 'b_10_Training_patches_norm.npy',
             b_groundtruth_filename = 'b_Training_labels_norm.npy',
             b_valid_filename = 'b_10_Validation_patches_norm.npy',
             b_validtruth_filename = 'b_Validation_labels_norm.npy',
             u_patch_filename = 'u_10_Training_patches_norm.npy',
             u_groundtruth_filename = 'u_Training_labels_norm.npy',
             u_valid_filename = 'u_10_Validation_patches_norm.npy',
             u_validtruth_filename = 'u_Validation_labels_norm.npy',
             batch_size=100, n_ins = 605, n_outs = 2,
             hidden_layers_sizes = [1000,1000,1000], prefix = '11_11_3_G4_',
             corruption_levels=[0.2,0.2,0.2]):
    """
    Pre-train and fine-tune a stacked denoising autoencoder (SdA).

    Pre-training runs layer by layer on the data set loaded from the ``b_*``
    files; fine-tuning runs on the data set loaded from the ``u_*`` files.
    Progress is checkpointed to ``<prefix>flag.pkl`` (1 = pre-training,
    2 = fine-tuning), ``<prefix>pre_training.pkl`` and
    ``<prefix>fine_tuning.pkl``.

    Command-line options read from ``sys.argv``:
      ``-r``        resume from the checkpoints written by an earlier run
      ``-p VALUE``  override ``prefix``

    :type finetune_lr: float
    :param finetune_lr: learning rate used to build the fine-tuning functions

    :type pretraining_epochs: int
    :param pretraining_epochs: number of pre-training epochs per layer

    :type pretrain_lr: float
    :param pretrain_lr: learning rate passed to the pre-training functions

    :type training_epochs: int
    :param training_epochs: maximal number of fine-tuning epochs

    :param b_patch_filename, b_groundtruth_filename, b_valid_filename,
        b_validtruth_filename: forwarded to ``load_data`` for pre-training

    :param u_patch_filename, u_groundtruth_filename, u_valid_filename,
        u_validtruth_filename: forwarded to ``load_data`` for fine-tuning

    :type batch_size: int
    :param batch_size: minibatch size for both stages

    :param n_ins, n_outs, hidden_layers_sizes: forwarded to ``SdA``

    :type prefix: string
    :param prefix: prefix of every checkpoint / pre-training log file

    :param corruption_levels: per-layer corruption level for pre-training

    NOTE: checkpoints are read with ``cPickle.load``; only resume from files
    you trust.
    """
    print('###########################')
    print('Pretraining epochs:  %s' % pretraining_epochs)
    print('Finetuning epochs:  %s' % training_epochs)
    print('###########################')

    #########################################################
    # Command line handling and resume state
    #########################################################
    resumeTraining = False
    opts, arg = getopt.getopt(sys.argv[1:], "rp:")
    for opt, arg in opts:
        if opt == '-r':
            resumeTraining = True  # resume training from the saved model
        elif opt == '-p':
            prefix = arg

    # Defaults for a fresh run.  They are bound unconditionally so that every
    # later branch can read them, whichever checkpoint (if any) was found.
    flagValue = 1
    layer_number = 0
    epochs_done_preTraining = 0
    epochs_done_fineTuning = 0
    epoch_flag = 0
    epochFlag_fineTuning = 0
    W = None
    b = None

    if resumeTraining:
        with open(prefix + 'flag.pkl', 'rb') as flag_file:
            try:
                flagValue = cPickle.load(flag_file)
            except Exception:
                # Best effort: an unreadable flag means "still pre-training".
                sys.stderr.write('could not read %sflag.pkl, assuming '
                                 'pre-training stage\n' % prefix)

        W = []
        b = []
        with open(prefix + 'pre_training.pkl', 'rb') as saved_pretraining:
            (layer_number, epochs_done_preTraining,
             _mean_cost, pretrain_lr) = cPickle.load(saved_pretraining)
            epoch_flag = 1
            print('Inside resumeTraining!!!!!!!!!!!!!!!!!!')

            # One (W, b) pair per hidden layer plus one for the output layer.
            for _ in xrange(len(hidden_layers_sizes) + 1):
                try:
                    layer_W = cPickle.load(saved_pretraining)
                    layer_b = cPickle.load(saved_pretraining)
                except Exception:
                    # Missing or truncated entry: let SdA initialise the
                    # layer.  Both values are reset together so that W and b
                    # stay aligned.
                    layer_W = None
                    layer_b = None
                W.append(layer_W)
                b.append(layer_b)

        if flagValue == 2:
            epochFlag_fineTuning = 1
            with open(prefix + 'fine_tuning.pkl', 'rb') as saved_finetuning:
                hidden_layers_sizes = cPickle.load(saved_finetuning)
                # The saved best loss, patience and iteration count are
                # re-initialised before fine-tuning starts, so only the epoch
                # count and the learning rate are restored here.
                (epochs_done_fineTuning, _saved_best_loss, finetune_lr,
                 _saved_patience, _saved_iters) = cPickle.load(saved_finetuning)

    ##############################################################
    # Pre-training data
    ##############################################################
    datasets = load_data(b_patch_filename, b_groundtruth_filename,
                         b_valid_filename, b_validtruth_filename)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')

    ################################################################
    ################CONSTRUCTION OF SdA CLASS#######################
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=n_ins,
        hidden_layers_sizes=hidden_layers_sizes,
        n_outs=n_outs, W = W, b=b)

    print('SdA constructed')

    if flagValue == 1:
        #########################
        # PRETRAINING THE MODEL #
        #########################
        with open(prefix + 'flag.pkl', 'wb') as flag:
            cPickle.dump(1, flag, protocol = cPickle.HIGHEST_PROTOCOL)

        print('... getting the pretraining functions')
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size)
        print('Length of pretraining function:  %s' % len(pretraining_fns))

        print('... pre-training the model')
        start_time = time.clock()
        ## Pre-train layer-wise
        log_pretrain_cost = []
        for i in xrange(sda.n_layers):
            # NOTE(review): on resume this re-targets the earlier iterations
            # at the checkpointed layer instead of skipping them, so that
            # layer is trained once per skipped index -- confirm whether
            # `continue` was intended.
            if i < layer_number:
                i = layer_number

            # go through pretraining epochs
            for epoch in xrange(pretraining_epochs):
                # NOTE(review): this only relabels the first resumed epoch;
                # the loop still runs `pretraining_epochs` times -- confirm.
                if epoch_flag == 1 and epoch < epochs_done_preTraining:
                    epoch = epochs_done_preTraining
                    epoch_flag = 0

                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i],
                                                lr=pretrain_lr))
                mean_cost = numpy.mean(c)
                print('Pre-training layer %i, epoch %d, cost  %s'
                      % (i, epoch, mean_cost))
                log_pretrain_cost.append(mean_cost)

                # Checkpoint after every epoch: progress header first, then
                # every model parameter in order.
                with open(prefix + 'pre_training.pkl', 'wb') as save_valid:
                    genVariables = [i, epoch, mean_cost, pretrain_lr]
                    cPickle.dump(genVariables, save_valid,
                                 protocol = cPickle.HIGHEST_PROTOCOL)
                    for j in xrange(len(sda.params)):
                        cPickle.dump(sda.params[j].get_value(borrow=True),
                                     save_valid,
                                     protocol = cPickle.HIGHEST_PROTOCOL)

        with open(prefix + 'log_pretrain_cost.txt', "a") as pretrain_log_file:
            for l in log_pretrain_cost:
                pretrain_log_file.write("%f\n" % l)

        end_time = time.clock()

        sys.stderr.write(('The pretraining code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                         + '\n')

    ########################
    # FINETUNING THE MODEL #
    ########################
    datasets = load_data(u_patch_filename, u_groundtruth_filename,
                         u_valid_filename, u_validtruth_filename)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size

    # get the training, validation and testing function for the model.
    # The caller's batch size and learning rate are used, so the compiled
    # functions agree with n_train_batches computed above.
    print('... getting the finetuning functions')
    train_fn, validate_model, test_model = sda.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

    print('... finetunning the model')
    # early-stopping parameters
    patience = 10 * n_train_batches  # look as this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0  # stays 0 if validation never improves
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    with open(prefix + 'flag.pkl', 'wb') as flag:
        cPickle.dump(2, flag, protocol = cPickle.HIGHEST_PROTOCOL)

    log_valid_cost = []

    while (epoch < training_epochs) and (not done_looping):
        # On resume, jump straight to the first epoch that was not finished.
        if epochFlag_fineTuning == 1 and epoch < epochs_done_fineTuning:
            epoch = epochs_done_fineTuning
            epochFlag_fineTuning = 0

        epoch = epoch + 1

        for minibatch_index in xrange(n_train_batches):
            train_fn(minibatch_index)
            # Global minibatch counter, always derived from the epoch number
            # (which already accounts for resumed epochs).
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                log_valid_cost.append(this_validation_loss)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    print('Saving the best validation network')
                    genVariables = [epoch, best_validation_loss, finetune_lr,
                                    patience, iteration]
                    with open(prefix + 'fine_tuning.pkl', 'wb') as save_file:
                        cPickle.dump(hidden_layers_sizes, save_file)
                        cPickle.dump(genVariables, save_file)
                        for j in xrange(len(sda.params)):
                            cPickle.dump(sda.params[j].get_value(borrow=True),
                                         save_file,
                                         protocol = cPickle.HIGHEST_PROTOCOL)

                    # Flush the validation costs gathered since the last
                    # improvement to the log.
                    with open('log_valid_cost.txt', "a") as valid_file:
                        for l in log_valid_cost:
                            valid_file.write("%f\n" % l)
                    log_valid_cost = []

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
                else:
                    print('validation loss not decreasing, hence reducing lr')
                    # NOTE(review): train_fn is only called with the
                    # minibatch index, so this decay presumably affects just
                    # the value stored in later checkpoints -- confirm.
                    finetune_lr = 0.8 * finetune_lr

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'on iteration %i, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., best_iter + 1, test_score * 100.)
    )
    sys.stderr.write(('The training code for file ' +
                      os.path.split(__file__)[1] +
                      ' ran for %.2fm' % ((end_time - start_time) / 60.))
                     + '\n')
def evaluate_lenet3(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50], batch_size=500):
    '''
    Train a small convolutional network with plain minibatch gradient descent
    and print the training / validation error after every epoch.

    layer0: convpool layer
    layer1: convpool layer
    layer2: hidden layer
    layer3: logistic layer
    '''
    splits = load_data(dataset)
    train_set_x, train_set_y = splits[0]
    valid_set_x, valid_set_y = splits[1]

    # Only whole minibatches are used; a trailing partial batch is dropped.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    image_shape = (batch_size, 1, 28, 28)
    rng = numpy.random.RandomState(1234)

    print('building the model ...')

    # The layers are built in this order because they share one rng.
    layer0 = LeNetConvPoolLayer(
        rng,
        input = x.reshape(image_shape),
        image_shape = image_shape,
        filter_shape = (nkerns[0], 1, 5, 5),
        poolsize = (2, 2),
        activation = relu)
    layer1 = LeNetConvPoolLayer(
        rng,
        input = layer0.output,
        image_shape = (batch_size, nkerns[0], 12, 12),
        filter_shape = (nkerns[1], nkerns[0], 5, 5),
        poolsize = (2, 2),
        activation = relu)
    layer2 = HiddenLayer(
        rng,
        input = layer1.output.flatten(2),
        n_in = nkerns[1] * 4 * 4,
        n_out = 500,
        activation = relu)
    layer3 = LogisticRegression(input = layer2.output, n_in = 500, n_out = 10)

    cost = layer3.negative_log_likelihood(y)

    # Symbolic bounds of minibatch number `index`.
    batch_start = index * batch_size
    batch_stop = (index + 1) * batch_size

    def minibatch_of(set_x, set_y):
        # Substitutions feeding one minibatch of the given split into x and y.
        return {x: set_x[batch_start:batch_stop],
                y: set_y[batch_start:batch_stop]}

    test_valid_model = theano.function(
        inputs=[index],
        outputs=layer3.errors(y),
        givens=minibatch_of(valid_set_x, valid_set_y))
    test_train_model = theano.function(
        inputs=[index],
        outputs=layer3.errors(y),
        givens=minibatch_of(train_set_x, train_set_y))

    # One gradient-descent step per parameter, output layer first.
    params = layer3.params + layer2.params + layer1.params + layer0.params
    updates = [(param, param - learning_rate * T.grad(cost, param))
               for param in params]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=minibatch_of(train_set_x, train_set_y))

    print('Train the model ...')

    epoch = 0
    while epoch < n_epochs:
        epoch += 1

        for minibatch_index in xrange(n_train_batches):
            minibatch_cost = train_model(minibatch_index)
            print('\tepoch %i, minibatch_index %i/%i, minibatch_cost %f'
                  % (epoch, minibatch_index, n_train_batches, minibatch_cost))

        # Mean per-minibatch error rate over each split.
        train_score = numpy.mean(
            [test_train_model(i) for i in xrange(n_train_batches)])
        valid_score = numpy.mean(
            [test_valid_model(i) for i in xrange(n_valid_batches)])
        print('epoch %i, train_score %f, valid_score %f'
              % (epoch, train_score, valid_score))
def main(entries=10000):
    """Fine-tune the pretrained model and write its test-set predictions.

    Steps, in order:
      1. load and preprocess the train / validation / test data;
      2. cache the preprocessed arrays in an HDF5 file;
      3. compile and fit the pretrained model, then save its weights;
      4. write the raw class predictions for the test set;
      5. print the training and validation scores;
      6. write a ``<image name>\\t<wnid>`` file for the first ``entries``
         test images.

    :param entries: number of test images written to the formatted
        predictions file (default 10000, the previously hard-coded value).
        An IndexError is raised if fewer files or predictions exist.

    The directories, the labels file and ``sample`` are read from the
    module-level names ``training_dir``, ``valid_dir``, ``test_dir``,
    ``labels`` and ``sample``.
    """
    raw_predictions_path = ('assignment3_class_predictions_nodropout_'
                            'noregularization_augmenteddata.3epochs.'
                            'contrast.tint.txt')
    formatted_predictions_path = ('assignment3_class_predictions_nodropout_'
                                  'noregularization_augmenteddata.formatted.'
                                  '3epochs.contrast.tint.txt')

    # load data
    X_train, Y_train, X_valid, Y_valid, X_test = load_data(
        training_dir, valid_dir, test_dir, labels, sample)
    # preprocess data by mean subtraction and normalization
    X_train, X_valid, X_test = preprocess(X_train, X_valid, X_test)

    # OPTIONAL: save loaded/pre-processed data to an hdf5 file to save time in
    # the future (the arrays can be read back with h5py.File(path, 'r') and
    # np.asarray(data['X_train']) etc. instead of running the two steps above).
    with h5py.File('imagenet.transpose.individually.augment.contrast.tint.hdf5',
                   'w') as f:
        f.create_dataset("X_train", data=X_train)
        f.create_dataset("Y_train", data=Y_train)
        f.create_dataset("X_valid", data=X_valid)
        f.create_dataset("Y_valid", data=Y_valid)
        f.create_dataset("X_test", data=X_test)
        f.flush()

    pretrained_model = pretrained('pretrained_model.h5', False)
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    pretrained_model.compile(optimizer=sgd, loss='categorical_crossentropy',
                             trainLayersIndividually=0)

    # do some training!
    print("compilation finished, fitting model")
    print("pretrained_model.trainLayersIndividually:"
          + str(pretrained_model.trainLayersIndividually))
    # Both training modes currently use the same number of epochs.
    train_epochs = 5
    pretrained_model.fit(X_train, Y_train, 128, train_epochs,
                         validation_data=(X_valid, Y_valid),
                         verbose=1, show_accuracy=True)
    pretrained_model.save_weights(
        "assignment3_weights_nodropout_noregularization_augmenteddata."
        "3epochs.contrast.tint.hdf5",
        overwrite=True)

    class_predictions = pretrained_model.predict_classes(X_test)
    np.savetxt(raw_predictions_path, class_predictions, fmt='%i',
               delimiter='\t')

    train_scores = pretrained_evaluate(pretrained_model, X_train, Y_train)
    print("pretrained model training scores:" + str(train_scores))
    valid_scores = pretrained_evaluate(pretrained_model, X_valid, Y_valid)
    print("pretrained validation scores:" + str(valid_scores))

    print("writing out the predictions file")
    # Read both files back as lists of non-empty lines.
    with open(raw_predictions_path, 'r') as predictions_file:
        predictions = [line for line in predictions_file.read().split('\n')
                       if line != '']
    with open(labels, 'r') as labels_file:
        wnids = [line for line in labels_file.read().split('\n')
                 if line != '']

    cur_dir = test_dir + "images/"
    # NOTE(review): prediction i is paired with the i-th entry of listdir();
    # presumably load_data reads the test images in the same order -- verify.
    onlyfiles = [name for name in listdir(cur_dir)
                 if isfile(join(cur_dir, name))]

    with open(formatted_predictions_path, 'w') as outf:
        for i in range(entries):
            image_name = onlyfiles[i]
            wnid = wnids[int(predictions[i])]
            outf.write(image_name + '\t' + str(wnid) + '\n')
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    '''
    Train a one-hidden-layer perceptron with plain minibatch gradient descent
    and print the training / validation error after every epoch.

    The cost is the negative log-likelihood plus L1_reg times the classifier's
    L1 term and L2_reg times its squared L2 term.
    '''
    splits = load_data(dataset)
    train_set_x, train_set_y = splits[0]
    valid_set_x, valid_set_y = splits[1]

    # Only whole minibatches are used; a trailing partial batch is dropped.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    rng = numpy.random.RandomState(1234)
    classifier = MLP(rng=rng, input=x, n_in=28*28, n_hidden=n_hidden, n_out=10)

    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    # Symbolic bounds of minibatch number `index`.
    batch_start = index * batch_size
    batch_stop = (index + 1) * batch_size

    def minibatch_of(set_x, set_y):
        # Substitutions feeding one minibatch of the given split into x and y.
        return {x: set_x[batch_start:batch_stop],
                y: set_y[batch_start:batch_stop]}

    test_valid_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_of(valid_set_x, valid_set_y))
    test_train_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_of(train_set_x, train_set_y))

    # One gradient-descent step per parameter.
    updates = [(param, param - learning_rate * T.grad(cost, param))
               for param in classifier.params]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=minibatch_of(train_set_x, train_set_y))

    print('Train the model ...')

    epoch = 0
    while epoch < n_epochs:
        epoch += 1

        for minibatch_index in xrange(n_train_batches):
            train_model(minibatch_index)

        # Mean per-minibatch error rate over each split.
        train_score = numpy.mean(
            [test_train_model(i) for i in xrange(n_train_batches)])
        valid_score = numpy.mean(
            [test_valid_model(i) for i in xrange(n_valid_batches)])
        print('epoch %i, train_score %f, valid_score %f'
              % (epoch, train_score, valid_score))
"""'Load merged data and do build new features such as types of stores, months with high sales, ...""" train, test = rawProcess() all_data = {"rawtrain": train, "rawtest": test} print "Saving dataset." pickle.dump(all_data, gzip.open("dataMerged00.pickle.gz", "w"), protocol=pickle.HIGHEST_PROTOCOL) if __name__ == "__main__": t0 = time.clock() seed = 2014 rawtrain, rawtest = load_data("dataMerged00.pickle.gz", shuffle_train=seed) rawtrain = rawtrain.set_index("Id") rawtest = rawtest.set_index("Id") rawtrain["predWeekly_Sales"] = 0.0 rawtest["Weekly_Sales"] = 0.0 w = rawtrain["IsHoliday"].values globalDept_Weight_Dict, globalDept_std_Dict = globalDeptWeight(rawtrain) global_dept_month_WeightDict, global_dept_month_stdDict = globalDeptWeight_by_Month(rawtrain) for dept in np.sort(rawtest["Dept"].unique()): finetrain, finetest = fineProcess( rawtrain, rawtest,
method = 'pca' # ['pca', 'lsh', 'itq'] aver_neighbors = 50 # the number of neighbors to obtain the ground true manhattan_hash = True # whether to use the manhattan hashing manhattan_bit = 2 # map each dimension to `manhattan_bit` bits print 'Parameters:' print '==========' print 'database :', db print 'nbits :', nbits print 'method :', method print 'use manhattan hash:', 'Yes' if manhattan_hash else 'No' if manhattan_hash: print 'manhattan bit :', manhattan_bit print [feats, train, test] = load_data(db, f_feats, f_train, f_test); rdm = random.sample(range(len(feats)), len(feats)) # Get test data test_idx = rdm[0:ntest] # ntest x #(dimension of feature), for GIST descriptor, the second dimension # is 512 x_test = [] for idx in test_idx: x_test.append(feats[idx - 1][:]) # Get train data train_idx = rdm[ntest:] x_train = [] for idx in train_idx: x_train.append(feats[idx - 1][:])
def test_mlp(
        initial_learning_rate,
        learning_rate_decay,
        squared_filter_length_limit,
        n_epochs,
        batch_size,
        mom_params,
        activations,
        dropout,
        dropout_rates,
        layer_sizes,
        dataset,
        use_bias,
        W = None,
        b = None,
        random_seed=1234,
        prefix = ''):
    """
    Train an MLP (optionally with dropout) and checkpoint the best network.

    :param initial_learning_rate: learning rate of the first epoch; afterwards
        the rate is ``initial_learning_rate / (1 + 0.01 * epoch)``
    :param n_epochs: number of training epochs
    :param batch_size: minibatch size
    :param activations, dropout_rates, layer_sizes, use_bias, W, b:
        forwarded to ``MLP``; ``len(layer_sizes) - 1`` must equal
        ``len(dropout_rates)``
    :param dropout: when true, the dropout cost is optimised instead of the
        plain cost and checkpoints go to ``<prefix>dropout_fine_tuning.pkl``
        instead of ``<prefix>fine_tuning.pkl``
    :param dataset: sequence ``(train_patch, train_label, valid_patch,
        valid_label)`` forwarded to ``load_data``
    :param random_seed: seed of the numpy random generator given to ``MLP``
    :param prefix: prefix of every file written by this function
    :param learning_rate_decay, squared_filter_length_limit, mom_params:
        currently unused -- the momentum / norm-constraint update rule that
        consumed them was replaced by the ``sgd`` updates below; they are kept
        so existing callers keep working
    """
    print(len(layer_sizes))
    print(len(dropout_rates))
    assert len(layer_sizes) - 1 == len(dropout_rates)

    train_patch, train_label, valid_patch, valid_label = dataset
    datasets = load_data(train_patch, train_label, valid_patch, valid_label)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of (whole) minibatches for training, validation, testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')      # the data is presented as rasterized images
    y = T.ivector('y')     # the labels are presented as 1D vector of
                           # [int] labels
    learning_rate = T.scalar('lr')

    rng = np.random.RandomState(random_seed)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x,
                     layer_sizes=layer_sizes,
                     dropout_rates=dropout_rates,
                     activations=activations,
                     W = W, b = b,
                     use_bias=use_bias)
    print('#############################')
    print(classifier.params)
    print('#############################')

    # Build the expression for the cost function (with L2 and L1 penalties).
    cost = (classifier.negative_log_likelihood(y)
            + 0.0001 * classifier.L2_sqr + 0.0001 * classifier.L1_sqr)
    dropout_cost = (classifier.dropout_negative_log_likelihood(y)
                    + 0.0001 * classifier.L2_sqr + 0.0001 * classifier.L1_sqr)
    # Use the right cost function to train with or without dropout.
    training_cost = dropout_cost if dropout else cost

    # Compile theano function for testing.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    # Compile theano function for validation.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    def valid_score():
        # Error of every validation minibatch.
        return [validate_model(i) for i in xrange(n_valid_batches)]

    def test_score():
        # Error of every test minibatch.
        return [test_model(i) for i in xrange(n_test_batches)]

    def get_prediction(set_x, size):
        # Compile a function returning the predicted labels of one minibatch.
        return theano.function(
            inputs=[index],
            outputs=classifier.pred,
            givens={x: set_x[index * size:(index + 1) * size]})

    def collect_predictions(predict_fn, n_batches):
        # Predicted labels of the first n_batches minibatches, concatenated
        # once.  The leading empty array keeps the float dtype (and the empty
        # result for n_batches == 0) of accumulating from numpy.array([]).
        parts = [numpy.array([])]
        parts.extend(numpy.array(predict_fn(i)) for i in xrange(n_batches))
        return numpy.concatenate(parts)

    def print_confusion_matrix(title, truth, predicted):
        print(title)
        print('')
        print(confusion_matrix(truth, predicted))
        print('')

    # Placeholder second record of the checkpoint files; bound here so the
    # periodic checkpoint works even before the first validation improvement.
    genVariables = 'gen'

    def save_checkpoint(path):
        # Same record layout as the best-model checkpoint: layer sizes,
        # placeholder header, then every parameter in order.
        # NOTE(review): the layer sizes are hard-coded to [1000, 1000, 1000]
        # rather than taken from layer_sizes -- confirm what readers expect.
        with open(path, 'wb') as save_file:
            cPickle.dump([1000, 1000, 1000], save_file)
            cPickle.dump(genVariables, save_file)
            for param in classifier.params:
                cPickle.dump(param.get_value(borrow=True), save_file,
                             protocol = cPickle.HIGHEST_PROTOCOL)

    updates = sgd(training_cost, classifier.params,
                  learning_rate = learning_rate)

    # Compile theano function for training.  This returns the training cost
    # and updates the model parameters.
    train_model = theano.function(
        inputs=[epoch, index, theano.Param(learning_rate, default=0.1)],
        outputs=training_cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]},
        on_unused_input = 'ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # Confusion-matrix set-up: predictors plus ground truth trimmed to whole
    # minibatches, for the training and the validation split.
    prediction = get_prediction(train_set_x, batch_size)
    y_truth = train_set_y.eval()
    y_truth = y_truth[0:(len(y_truth) - (len(y_truth) % batch_size))]
    cnf_freq = 1      # epochs between training confusion matrices
    cnf_freq_v = 5    # epochs between validation confusion matrices

    prediction_v = get_prediction(valid_set_x, batch_size)
    y_truth_v = valid_set_y.eval()
    y_truth_v = y_truth_v[0:(len(y_truth_v) - (len(y_truth_v) % batch_size))]

    patience = 40 * n_train_batches  # look as this many examples regardless
    patience_increase = 10.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_scores = 0.
    epoch_counter = 0
    start_time = time.clock()

    # Tile shape used when rendering the weights of each layer as an image.
    shapeimg = [(42,42),(50,50), (25,40), (50,10)]
    adaptive_lr = initial_learning_rate

    while epoch_counter < n_epochs:
        # Train this epoch
        epoch_counter = epoch_counter + 1

        if epoch_counter % cnf_freq == 0:
            print_confusion_matrix(
                'Training confusion matrix', y_truth,
                collect_predictions(prediction, n_train_batches))

        if epoch_counter % cnf_freq_v == 0:
            # Use the validation-set predictor here; the training-set one
            # would silently report training predictions against validation
            # labels.
            print_confusion_matrix(
                'validation confusion_matrix', y_truth_v,
                collect_predictions(prediction_v, n_valid_batches))

        c = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index,
                                             adaptive_lr)
            c.append(minibatch_avg_cost)

            iteration = (epoch_counter - 1) * n_train_batches + minibatch_index
            if (iteration + 1) % validation_frequency == 0:
                validation_losses = valid_score()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch_counter, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                print('Training cost:  %s' % np.mean(c))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    print('Saving the best validation network')
                    if dropout:
                        save_checkpoint(prefix + 'dropout_fine_tuning.pkl')
                    else:
                        save_checkpoint(prefix + 'fine_tuning.pkl')

                    # test it on the test set
                    test_losses = test_score()
                    test_scores = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch_counter, minibatch_index + 1,
                           n_train_batches, test_scores * 100.))

        # Inverse-time decay of the learning rate, applied once per epoch.
        adaptive_lr = initial_learning_rate / (1 + 0.01 * epoch_counter)
        if epoch_counter % 5 == 0:
            print('current learning rate:- %s' % adaptive_lr)

        # Every 10th epoch (and at epochs 199 and 399) render the weights of
        # every layer but the last as an image and save the latest parameters.
        if ((epoch_counter % 10 == 0 and epoch_counter != 0)
                or epoch_counter == 399 or epoch_counter == 199):
            for i in xrange(len(classifier.params) // 2 - 1):
                image = Image.fromarray(tile_raster_images(
                    X=classifier.params[2 * i].get_value(borrow=True).T,
                    img_shape=shapeimg[i],
                    tile_shape=(40, layer_sizes[i + 1] // 20),
                    tile_spacing=(1, 1)))
                image.save(prefix + str(i) + '_' + str(epoch_counter) + '.png')

            save_checkpoint(prefix + 'latest_fine_tuning2.pkl')

    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'on iteration %i, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., best_iter + 1, test_scores * 100.)
    )
    sys.stderr.write(('The training code for file ' +
                      os.path.split(__file__)[1] +
                      ' ran for %.2fm' % ((end_time - start_time) / 60.))
                     + '\n')
def evaluate_lenet5(dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500,
                    params_path='model/lenet5_params.pkl'):
    """ Evaluates a pre-trained LeNet-style network on the test split

    The network (two ConvPoolLayers, one tanh HiddenLayer and a
    LogisticRegression output) is rebuilt symbolically, its parameters are
    loaded from ``params_path`` and the mean per-minibatch error on the test
    split is printed. No training takes place.

    :type dataset: string
    :param dataset: path to the dataset used for testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (only read, never mutated)

    :type batch_size: int
    :param batch_size: number of examples per minibatch; the graph is built
                       for exactly this size, so trailing examples that do
                       not fill a whole minibatch are not evaluated

    :type params_path: string
    :param params_path: joblib file holding a mapping with the keys 'conv1',
                        'conv2', 'fc1' and 'log1'
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)
    # only the test split is needed: this function evaluates, it never trains
    test_set_x, test_set_y = datasets[2]

    # number of whole minibatches available for testing
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our ConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # First convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    layer0 = ConvPoolLayer(
        rng,
        input=layer0_input,
        input_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Second convolutional pooling layer:
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    layer1 = ConvPoolLayer(
        rng,
        input=layer0.output,
        input_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # The HiddenLayer is fully connected, so the 4D feature maps are
    # flattened to a matrix of shape (batch_size, nkerns[1] * 4 * 4).
    layer2_input = layer1.output.flatten(2)

    # fully-connected layer with tanh activation
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    print('... loading saved model params')
    # NOTE: joblib.load unpickles its input; only use trusted files here.
    saved_params = joblib.load(params_path)
    layer0.load_params(saved_params['conv1'])
    layer1.load_params(saved_params['conv2'])
    layer2.load_params(saved_params['fc1'])
    layer3.load_params(saved_params['log1'])

    # function computing the mistakes made by the model on one minibatch
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    print('... testing')
    test_losses = [test_model(i) for i in xrange(n_test_batches)]
    test_score = numpy.mean(test_losses)
    # two spaces after the colon reproduce the old comma-separated print
    print('test error:  %s %%' % (test_score * 100))
# Command-line driver: load student data, split it into classes, write it out.
#
# Usage: <script> TOTAL_CLASS BOY_CLASS GIRL_CLASS FILENAME
import copy  # explicit: copy.deepcopy below must not depend on a star import
import sys

from group import *
from load_data import *
from write_data import *

orig_data = []
group_data = []
rule = []

if len(sys.argv) != 5:
    # wrong number of arguments: report how many were received and do nothing
    print("%d\n" % len(sys.argv))
else:
    total_class = int(sys.argv[1])
    boy_class = int(sys.argv[2])
    girl_class = int(sys.argv[3])
    filename = sys.argv[4]

    # load_data is called for its effect on the two lists passed in;
    # presumably it fills orig_data and rule in place -- TODO confirm
    load_data(filename, orig_data, rule)
    group_data = grouping(orig_data, total_class, boy_class, girl_class, rule)

    # write a deep copy so group_data itself is not the object handed to writefile
    data = copy.deepcopy(group_data)
    writefile(data, filename, total_class, boy_class, girl_class)
# Trains a two-layer dense autoencoder with the legacy Keras API and saves
# one prediction as an image.
import os

import numpy as np
from keras.layers import containers
from keras.layers.core import Dense, Dropout, Activation, AutoEncoder
from keras.models import Sequential
from keras.optimizers import SGD
from scipy import misc

import load_data

# Encoder halves each image dimension twice; the decoder mirrors it back.
encoder = containers.Sequential([Dense(540*420, 270*210), Dense(270*210, 135*105)])
decoder = containers.Sequential([Dense(135*105, 270*210), Dense(270*210, 540*420)])

autoencoder = Sequential()
autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=True))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.0, nesterov=True)
autoencoder.compile(loss='categorical_crossentropy', optimizer=sgd)

batch_size = 12
nb_epoch = 20

# `load_data` is imported as a module, so the loader has to be reached through
# it; calling the module object itself raises TypeError.
# NOTE(review): assumes the module exposes a function also named `load_data`
# -- confirm against load_data.py.
data, X_test = load_data.load_data()

X_train, Y_train = data[0][:140,:], data[1][:140,:]
# NOTE(review): this overwrites the X_test returned above with the single row
# 141 (row 140 is used nowhere), and Y_test is taken from rows [:141] and then
# overwritten by the prediction below. The intended split is unclear, so the
# slicing is left as written -- confirm.
X_test, Y_test = data[0][141,:], data[1][:141,:]

autoencoder.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)

Y_test = autoencoder.predict_classes(X_test, batch_size=1, verbose=True)
Y_test = Y_test.reshape((420,540))
print(Y_test.tolist())
misc.imsave('nudie.png', Y_test)
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=300):
    """ Trains a logistic-regression classifier with minibatch SGD

    After every epoch the mean per-minibatch error on the training and
    validation splits is printed.

    :type learning_rate: float
    :param learning_rate: step size used for the first 50 epochs; from
                          epoch 51 onwards the step size is fixed at 0.1

    :type n_epochs: int
    :param n_epochs: number of passes over the training split

    :type dataset: string
    :param dataset: path handed to load_data

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    splits = load_data(dataset)
    train_set_x, train_set_y = splits[0]
    valid_set_x, valid_set_y = splits[1]

    train_sample_num = train_set_x.get_value(borrow=True).shape[0]
    valid_sample_num = valid_set_x.get_value(borrow=True).shape[0]
    n_train_batches = train_sample_num / batch_size
    n_valid_batches = valid_sample_num / batch_size

    # symbolic inputs: minibatch index, data, labels and the step size
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    step_rate = T.dscalar()

    classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)
    cost = classifier.negative_log_likelihood(y)

    def minibatch_of(shared_x, shared_y):
        # substitute the index-th minibatch of a split for x and y
        start = index * batch_size
        stop = (index+1) * batch_size
        return {x: shared_x[start : stop], y: shared_y[start : stop]}

    test_valid_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_of(valid_set_x, valid_set_y))

    test_train_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=minibatch_of(train_set_x, train_set_y))

    # plain gradient descent on both parameters of the classifier
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)
    updates = [
        (classifier.W, classifier.W - step_rate * g_W),
        (classifier.b, classifier.b - step_rate * g_b),
    ]

    train_model = theano.function(
        inputs=[index, step_rate],
        outputs=cost,
        updates=updates,
        givens=minibatch_of(train_set_x, train_set_y))

    print('Train the model ...')

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        if epoch > 50:
            learning_rate = 0.1

        for batch in xrange(n_train_batches):
            train_model(batch, learning_rate)

        # Scores are means over minibatches. An alternative normalisation
        # sums the losses and divides by train_sample_num / valid_sample_num.
        train_score = numpy.mean([test_train_model(b) for b in xrange(n_train_batches)])
        valid_score = numpy.mean([test_valid_model(b) for b in xrange(n_valid_batches)])
        print('epoch %i, train_score %f, valid_score %f' % (epoch, train_score, valid_score))
num_estimators = 10 # estimators inside the random forest num_iters = 1000 # num of training iterations num_users = sys.maxint # max num of users(=trajectories) (use sys.maxint for unlimited case) num_users_ratio = 0.7 # % of users(=trajectories) for training demography = False target_action = "Q235" # every trajectory should end with this action # target_action = "Q315" # every trajectory should end with this action feat_path = "../../data/lectures+demography/feats.csv" result_dir = "../results/demo"+("1" if demography else '0')+"-d"+str(discount)+("-un"+str(num_users) if num_users != sys.maxint else "")+("-ur"+str(num_users_ratio) if num_users_ratio != 1 else "")+"-e"+str(num_estimators)+"-i"+str(num_iters)+"-t"+target_action approximator_path = result_dir + "/approximator/random_forest_regressor.model" # path to save the trained approximator debug_action_cnt = False debug_q0 = True # Load data cur_states, actions, rewards, next_states, users, action_index, user_index, valid_feats = load_data(feat_path, target_action, num_users=num_users, num_users_ratio=num_users_ratio, demography=demography) print cur_states.shape, next_states.shape, actions.shape, rewards.shape, users.shape, len(action_index), len(user_index), sum(valid_feats) # dim: cur_states,next_states = num_instances x num_features # dim: actions,rewards = num_instances num_feats = cur_states.shape[1] num_actions = len(action_index) action_list = [ 0 for x in range(num_actions) ] for a,i in action_index.iteritems(): action_list[i] = a s0 = np.zeros((1,num_feats)) approximator = RandomForest(num_estimators=num_estimators, num_actions=num_actions) approximator.train(cur_states, actions, rewards) for iter in range(num_iters): print "---------------------------------------------------------------\nIteration", iter
def evaluate_srcnn(learning_rate=0.1, n_epochs=200, nkerns=[20, 50], batch_size=500):
    """ Builds and trains a four-layer convolutional network on 800x600 images

    The layers are chained (each consumes the previous layer's output) and
    trained with plain minibatch SGD on a sum-of-squares cost, with
    patience-based early stopping on the validation error.

    :type learning_rate: float
    :param learning_rate: SGD step size

    :type n_epochs: int
    :param n_epochs: maximal number of passes over the training split

    :type nkerns: list of ints
    :param nkerns: number of kernels; nkerns[0] is used by the first layer
                   and nkerns[1] by the three following ones (only read,
                   never mutated)

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    rng = numpy.random.RandomState(123)

    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # number of whole minibatches in each split
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')
    y = T.matrix('y')

    print('... building the model')

    layer0_input = x.reshape((batch_size, 1, 800, 600))

    # 121x1 filter: (800, 600) -> (680, 600)
    layer0 = conv_layer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 800, 600),
        filter_shape=(nkerns[0], 1, 121, 1)
    )

    # 1x121 filter: (680, 600) -> (680, 480)
    layer1 = conv_layer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 680, 600),
        filter_shape=(nkerns[1], nkerns[0], 1, 121)
    )

    # 16x16 filter: (680, 480) -> (665, 465). Fed from layer1, whose output
    # shape is what image_shape declares here.
    layer2 = conv_layer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[1], 680, 480),
        filter_shape=(nkerns[1], nkerns[1], 16, 16)
    )

    # 8x8 filter on layer2's output. layer2 emits nkerns[1] feature maps, so
    # that is the input-channel count here (nkerns has no third entry).
    layer3 = conv_layer(
        rng,
        input=layer2.output,
        image_shape=(batch_size, nkerns[1], 665, 465),
        filter_shape=(nkerns[1], nkerns[1], 8, 8)
    )

    # Sum of squared differences; `**` is the power operator (`^` is XOR) and
    # T.sum reduces the symbolic tensor to a scalar.
    # NOTE(review): by the shapes declared above, layer3.output is smaller
    # than layer0_input and has nkerns[1] channels, so this subtraction looks
    # like it cannot broadcast; `y` is also never part of the cost. The
    # intended target needs to be confirmed.
    cost = T.sum((layer3.output - layer0_input) ** 2)

    def batch_givens(set_x, set_y):
        # substitute the index-th minibatch of a split for x and y
        return {
            x: set_x[index * batch_size: (index + 1) * batch_size],
            y: set_y[index * batch_size: (index + 1) * batch_size]
        }

    # NOTE(review): assumes conv_layer provides an errors(y) method -- confirm.
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens=batch_givens(test_set_x, test_set_y)
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens=batch_givens(valid_set_x, valid_set_y)
    )

    # plain SGD over the parameters of all four layers
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens=batch_givens(train_set_x, train_set_y)
    )

    print('... training')

    # early-stopping parameters
    patience = 10000               # look at this many minibatches regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    # validate once per epoch, or sooner if patience is very small
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter =  %s' % iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
# Fits a PCA on the loaded data and shows the explained variance ratio (EVR)
# of every principal component as a bar plot.
from load_data import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

n_components = 9  # number of principal components that are fitted and plotted

data = load_data()
# data = scale(raw)
pca = PCA(n_components=n_components)
pca.fit(data)
print(type(pca.explained_variance_ratio_))

# Running total of the EVR over the first 8 components.
cumulative_evr = np.cumsum(pca.explained_variance_ratio_[:8])

plt.figure(1)
index = np.arange(n_components) + 1
plt.bar(index, pca.explained_variance_ratio_, 0.35, color="g")
plt.ylim(0, 1)
# Half a unit of margin on both sides: with limits of exactly (1, 9) the
# outermost bars are cut off, whichever way the bars are aligned on the ticks.
plt.xlim(0.5, n_components + 0.5)
plt.title("Bar Plot for the Explained Variance Ratio of Each Dimension")
plt.xlabel("dimension")
plt.ylabel("EVR")
plt.show()
def test_SdA(finetune_lr=0.1, pretraining_epochs=1,
             pretrain_lr=0.001, training_epochs=1,
             b_patch_filename = 'b_Training_patches_norm.npy', b_groundtruth_filename = 'b_Training_labels_norm.npy',
             b_valid_filename = 'b_Validation_patches_norm.npy', b_validtruth_filename = 'b_Validation_labels_norm.npy',
             u_patch_filename = 'u_Training_patches_norm.npy', u_groundtruth_filename = 'u_Training_labels_norm.npy',
             u_valid_filename = 'u_Validation_patches_norm.npy', u_validtruth_filename = 'u_Validation_labels_norm.npy',
             batch_size=100, n_ins = 605, n_outs = 5, hidden_layers_sizes = [1000,1000,1000],prefix = '11_11_3_G4_',
             corruption_levels=[0.2,0.2,0.2], resumeTraining = False, StopAtPretraining = False):
    """
    Pre-trains and fine-tunes a stacked denoising autoencoder (SdA).

    Without resuming, the "b_" files are loaded, an SdA is built and
    pre-trained layer by layer, and weight images, a parameter pickle and a
    cost log are written under ``prefix``. When resuming, weights are read
    from a hard-coded pickle path and the SdA is built from them on the "u_"
    files instead. Unless ``StopAtPretraining`` is set, the network is then
    fine-tuned; the best parameters and the validation log are written under
    ``prefix`` as well.

    The command line is parsed with getopt: ``-r`` forces ``resumeTraining``
    to True and ``-p VALUE`` replaces ``prefix``.

    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; block boundaries should be checked against
    the original file.

    :type finetune_lr: float
    :param finetune_lr: initial learning rate of the fine-tuning stage

    :type pretraining_epochs: int
    :param pretraining_epochs: number of pre-training epochs per layer

    :type pretrain_lr: float
    :param pretrain_lr: initial learning rate of the pre-training stage
                        (overwritten from the pickle when resuming)

    :type training_epochs: int
    :param training_epochs: maximal number of fine-tuning epochs

    :type b_patch_filename: string
    :param b_patch_filename: with the other three b_* names, passed to
                             load_data when not resuming

    :type u_patch_filename: string
    :param u_patch_filename: with the other three u_* names, passed to
                             load_data when resuming; u_groundtruth_filename
                             is additionally loaded with np.load as the
                             confusion-matrix ground truth

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type n_ins: int
    :param n_ins: forwarded to SdA as n_ins

    :type n_outs: int
    :param n_outs: forwarded to SdA as n_outs

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: forwarded to SdA; also sets the tile layout
                                of the saved weight images

    :type prefix: string
    :param prefix: prepended to the name of every file written

    :type corruption_levels: list of floats
    :param corruption_levels: corruption passed to the pre-training function
                              of each layer

    :type resumeTraining: bool
    :param resumeTraining: load saved weights instead of pre-training

    :type StopAtPretraining: bool
    :param StopAtPretraining: skip the fine-tuning stage
    """
    print '###########################'
    print 'Pretraining epochs: ', pretraining_epochs
    print 'Finetuning epochs: ', training_epochs
    print '###########################'

    # weights and biases handed to SdA when resuming; stay empty otherwise
    W = []
    b = []

    #########################################################
    # @@@@@@@@ Needs to be worked on @@@@@@@@@@@@@@@@@
    # Snippet to resume training if the program crashes halfway through
    #########################################################
    # Command-line switches override the keyword arguments.
    opts, arg = getopt.getopt(sys.argv[1:],"rp:")
    for opt, arg in opts:
        if opt == '-r':
            resumeTraining = True  # make this true to resume training from saved model
        elif opt == '-p':
            prefix = arg

    # flag: 0 = pre-train from scratch, 1 = resume from saved weights.
    # NOTE(review): the name is later rebound to a file object.
    flag = 0

    if(resumeTraining):
        flag = 1

        # NOTE(review): absolute, machine-specific path; cPickle.load must
        # only ever be pointed at trusted files.
        path = '/media/brain/1A34723D34721BC7/BRATS/codes/results/test_255_9x9x3/9x9x3pre_training.pkl'
        savedModel_preTraining = file(path,'rb')
        genVariables_preTraining = cPickle.load(savedModel_preTraining)
        # layer_number, epochs_done_preTraining, mean_cost and epoch_flag are
        # only referenced by commented-out code further down
        layer_number, epochs_done_preTraining, mean_cost , pretrain_lr = genVariables_preTraining
        epoch_flag = 1
        print 'Inside resumeTraining!!!!!!!!!!!!!!!!!!'
        # one (W, b) pair is read per hidden layer plus one extra
        no_of_layers = len(hidden_layers_sizes) + 1

        for i in xrange(no_of_layers):
            W.append(cPickle.load(savedModel_preTraining))
            b.append(cPickle.load(savedModel_preTraining))

    ##############################################################

    if flag == 0:

        datasets = load_data(b_patch_filename,b_groundtruth_filename,b_valid_filename,b_validtruth_filename)

        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]

        # compute number of minibatches for training
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(89677)
        print '... building the model'
        # print 'W: ', W
        # print 'b: ', b

        ################CONSTRUCTION OF SdA CLASS#######################
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=n_ins,
            hidden_layers_sizes=hidden_layers_sizes,
            n_outs=n_outs)

        print 'SdA constructed'

        #########################
        # PRETRAINING THE MODEL #
        #########################

        # progress marker on disk: 1 is written when pre-training starts
        flag = open(prefix+'flag.pkl','wb')
        cPickle.dump(1,flag, protocol = cPickle.HIGHEST_PROTOCOL)
        flag.close()

        print '... getting the pretraining functions'
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,batch_size=batch_size)
        print 'Length of pretraining function: ', len(pretraining_fns)

        print '... pre-training the model'
        start_time = time.clock()
        ## Pre-train layer-wise
        log_pretrain_cost = []
        # tile shape used when layer i's weights are rendered as an image
        shapeimg = [(33,44),(50,60), (25,40), (50,10)]
        #corruption_levels = [.001, .001, .001]
        for i in xrange(sda.n_layers):
            # if i < layer_number:
            #     i = layer_number
            #print i
            # go through pretraining epochs
            best_cost = numpy.inf
            adapt_counter = 0
            learning_rate = pretrain_lr
            # both branches currently assign the same epoch count
            if i==0:
                num_of_epochs = pretraining_epochs
            else:
                num_of_epochs = pretraining_epochs
            for epoch in xrange(num_of_epochs):
                ##########################################
                # if epoch_flag is 1 and epoch < epochs_done_preTraining:
                #     epoch = epochs_done_preTraining
                #     epoch_flag = 0
                ##########################################
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    #sprint batch_index
                    c.append(pretraining_fns[i](index=batch_index,
                             corruption=corruption_levels[i],
                             lr=learning_rate))
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)

                current_cost = numpy.mean(c)
                log_pretrain_cost.append(numpy.mean(c))

                if current_cost < best_cost:
                    best_cost = current_cost

                if current_cost > best_cost :
                    adapt_counter = adapt_counter+1

                # NOTE(review): the guard below is commented out; the decay is
                # placed at epoch level here, which is a reconstruction.
                # if adapt_counter>25:
                itr = epoch + 1
                learning_rate = learning_rate / ( 1 + itr * 5e-05)
                # print 'Reducing learning rate', learning_rate
                adapt_counter = 0

                # previous_cost is assigned but not read anywhere in this function
                previous_cost = current_cost

                # periodic checkpoint: weight image plus a pickle of all parameters
                if epoch%50 == 0 and epoch!=0 or epoch == 399 or epoch == 199:
                    image = Image.fromarray(tile_raster_images(
                        X=sda.params[2*i].get_value(borrow=True).T,
                        img_shape=shapeimg[i], tile_shape=(40,hidden_layers_sizes[i]/20),
                        tile_spacing=(1, 1)))
                    image.save(prefix+str(i) + '_' + str(epoch)+'.png')

                    save_valid = open(prefix+'pre_training.pkl', 'wb')
                    genVariables = ['gen']
                    cPickle.dump(genVariables,save_valid,protocol = cPickle.HIGHEST_PROTOCOL)
                    for j in xrange(len(sda.params)):
                        cPickle.dump(sda.params[j].get_value(borrow=True), save_valid, protocol = cPickle.HIGHEST_PROTOCOL)
                    save_valid.close()

        # append the mean cost of every pre-training epoch to the log file
        pretrain_log_file = open(prefix + 'log_pretrain_cost.txt', "a")
        for l in log_pretrain_cost:
            pretrain_log_file.write("%f\n"%l)
        pretrain_log_file.close()

        # for k in [0,2,4,6]:
        #     print k
        #     image = Image.fromarray(tile_raster_images(
        #         X=sda.params[k].get_value(borrow=True).T,
        #         img_shape=shapeimg[k/2], tile_shape=(40,hidden_layers_sizes[k/2]/20),
        #         tile_spacing=(1, 1)))
        #     image.save(prefix+str(k/2)+'.png')
        #print sda.params[0]

        end_time = time.clock()

        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))

        print '###################'

    ########################
    # FINETUNING THE MODEL #
    ########################

    # Resume path: rebuild the SdA from the weights read above.
    # NOTE(review): after the pre-training path `flag` is a closed file
    # object, so this comparison is then False.
    if flag == 1:

        datasets = load_data(u_patch_filename,u_groundtruth_filename,u_valid_filename,u_validtruth_filename)
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size

        numpy_rng = numpy.random.RandomState(89677)
        print '... building the model'
        # print 'W: ', W
        # print 'b: ', b

        ################CONSTRUCTION OF SdA CLASS#######################
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=n_ins,
            hidden_layers_sizes=hidden_layers_sizes,
            n_outs=n_outs, W = W, b = b)

        print 'SdA constructed'

    if StopAtPretraining == False:
        # get the training, validation and testing function for the model
        print '... getting the finetuning functions'
        train_fn, validate_model, test_model = sda.build_finetune_functions(datasets=datasets,batch_size=batch_size)
        print batch_size

        print '... finetunning the model'
        ########################confusion matrix Block 1##########################
        prediction = sda.get_prediction(train_set_x,batch_size)
        # NOTE(review): the ground truth always comes from the u_ labels file,
        # even when the b_ files were loaded above -- confirm this is intended.
        y_truth = np.load(u_groundtruth_filename)
        # drop the trailing labels that do not fill a whole minibatch
        y_truth = y_truth[0:(len(y_truth)-(len(y_truth)%batch_size))]
        cnf_freq = 1  # print the confusion matrix every cnf_freq epochs
        ##################################################################

        # early-stopping parameters
        patience = 40 * n_train_batches  # look as this many examples regardless
        patience_increase = 10.  # wait this much longer when a new best is
                                 # found
        improvement_threshold = 0.995  # a relative improvement of this much is
                                       # considered significant
        validation_frequency = min(n_train_batches, patience / 2)
                                      # go through this many
                                      # minibatche before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch

        best_validation_loss = numpy.inf
        test_score = 0.
        start_time = time.clock()

        finetune_lr_initial = finetune_lr

        # NOTE(review): the only place that would set done_looping to True is
        # commented out below, so the loop runs for all training_epochs.
        done_looping = False
        epoch = 0

        # progress marker on disk: 2 is written when fine-tuning starts
        flag = open(prefix+'flag.pkl','wb')
        cPickle.dump(2,flag, protocol = cPickle.HIGHEST_PROTOCOL)
        flag.close()

        log_valid_cost=[]
        adapt_counter = 0
        while (epoch < training_epochs) and (not done_looping):
            # if epochFlag_fineTuning is 1 and epoch < epochs_done_fineTuning:
            #     epoch = epochs_done_fineTuning
            #     epochFlag_fineTuning = 0
            epoch = epoch + 1

            ################################confusion matrix block 2#################
            # predictions over every training minibatch, compared with y_truth
            if epoch%cnf_freq==0:
                pred_c = np.array([])
                for minibatch_index in xrange(n_train_batches):
                    pred_c = np.concatenate([pred_c,np.array(prediction(minibatch_index))])
                cnf_matrix = confusion_matrix(y_truth, pred_c)
                print cnf_matrix
            ##########################################################################

            # per-minibatch training costs of this epoch
            c = []
            for minibatch_index in xrange(n_train_batches):
                minibatch_avg_cost = train_fn(index=minibatch_index,lr=finetune_lr)
                c.append(minibatch_avg_cost)
                # if iterFlag is 1 and iter < iters_done:
                #     iter = iters_done
                #     iterFlag = 0
                # global minibatch counter (shadows the builtin `iter`)
                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:
                    validation_losses = validate_model()
                    this_validation_loss = numpy.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    log_valid_cost.append(this_validation_loss)

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:

                        #improve patience if loss improvement is good enough
                        if (
                            this_validation_loss < best_validation_loss *
                            improvement_threshold
                        ):
                            patience = max(patience, iter * patience_increase)

                        # save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        # NOTE(review): best_iter is only bound here, yet it
                        # is read unconditionally in the summary print below.
                        best_iter = iter

                        print 'Saving the best validation network'
                        genVariables = [epoch,best_validation_loss,finetune_lr,patience,iter]
                        save_file = open(prefix+'fine_tuning.pkl','wb')
                        cPickle.dump(hidden_layers_sizes, save_file)
                        cPickle.dump(genVariables, save_file)
                        for j in xrange(len(sda.params)):
                            cPickle.dump(sda.params[j].get_value(borrow=True), save_file, protocol = cPickle.HIGHEST_PROTOCOL)
                        save_file.close()

                        # test it on the test set
                        test_losses = test_model()
                        test_score = numpy.mean(test_losses)
                        print(('     epoch %i, minibatch %i/%i, test error of '
                               'best model %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.))
                        print 'Training cost: ', np.mean(c)

                    else:
                        # No improvement: after more than 20 such validations
                        # the rate is cut by 20%; otherwise it is recomputed
                        # from its initial value with a slow epoch-based decay.
                        adapt_counter = adapt_counter+1
                        if adapt_counter>20:
                            adapt_counter=0
                            finetune_lr = 0.8*finetune_lr
                            print 'Reduced learning rate : ', finetune_lr
                        else:
                            finetune_lr = finetune_lr_initial / (1 + epoch * 5e-05)

                #if patience <= iter:
                #    done_looping = True
                #    break

        end_time = time.clock()
        print(
            (
                'Optimization complete with best validation score of %f %%, '
                'on iteration %i, '
                'with test performance %f %%'
            )
            % (best_validation_loss * 100., best_iter + 1, test_score * 100.)
        )
        print >> sys.stderr, ('The training code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))

        # overwrite the summary file with the best validation/test errors
        valid_file = open(prefix+'log_valid_error.txt', 'w')
        valid_file.write('Best validation error: '+str(best_validation_loss*100))
        valid_file.write('\nBest test error: '+str(test_score*100))
        valid_file.close()

        # append every recorded validation loss to the fine-tuning log
        finetune_log_file = open(prefix + 'log_finetune_cost.txt', "a")
        for l in log_valid_cost:
            finetune_log_file.write("%f\n"%l)
        finetune_log_file.close()