def compute_covariance(self, x, y, diag=False):
    x, y = format_data(x), format_data(y)
    covs = []
    for kernel, idx in zip(self.kernels, self.maps):
        covs.append(kernel.compute_covariance(x[:, idx], y[:, idx], diag=diag))
    return np.prod(covs, axis=0)
def _format_vects(self, x, y, diag=False):
    x, y = format_data(x), format_data(y)
    if not diag:
        # Pair every row of x with every row of y.
        idx1, idx2 = np.meshgrid(np.arange(x.shape[0]), np.arange(y.shape[0]))
        x, y = x[idx1], y[idx2]
    else:
        # Pair rows elementwise (diagonal only).
        x = x.reshape(x.shape + (1,)).swapaxes(1, 2)
        y = y.reshape(y.shape + (1,)).swapaxes(1, 2)
    return x, y
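# Hedged illustration (not from the original source): the reshaping above builds either
# all pairs of rows (full covariance) or elementwise pairs (diagonal only). The tiny
# arrays below only demonstrate the resulting shapes.
import numpy as np

x = np.arange(6.0).reshape(3, 2)   # 3 points in 2 dimensions
y = np.arange(8.0).reshape(4, 2)   # 4 points in 2 dimensions

# Full case: every row of x paired with every row of y.
idx1, idx2 = np.meshgrid(np.arange(x.shape[0]), np.arange(y.shape[0]))
print(x[idx1].shape, y[idx2].shape)   # (4, 3, 2) (4, 3, 2)

# Diagonal case: rows paired elementwise (requires equal lengths).
xd = x.reshape(x.shape + (1,)).swapaxes(1, 2)
print(xd.shape)                       # (3, 1, 2)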
def add_observation(self, X_new, Y_new):
    X_new = format_data(X_new, dim=self._dim)
    Y_new = format_data(Y_new, dim=1)
    # Only add the point if it is not within tolerance of an existing observation.
    if np.sqrt(np.sum((self.X - X_new) ** 2, axis=1)).min() > self.tol:
        self.X = np.concatenate((self.X, X_new), axis=0)
        self.Y = np.concatenate((self.Y, Y_new), axis=0)
        self._nu = npr.randn(self.X.shape[0], 1)
        if self.marginalize and self.sampler is not None:
            _ = self.sampler.sample(self.resample)  # re-sample hyperparameters
        self._recompute = True
def job_details(self, job_name):
    print(blue('Getting details for job %s on server %s ...' % (job_name, self.url)))
    table_data = [['Name', 'Status', 'Url']]
    try:
        job = self.server.get_job(job_name)
        table_data.append([job.name, self.job_status(job), job.url])
        format_data(table_data)
    except custom_exceptions.UnknownJob:
        print('No job found : %s' % job_name)
def job_list(self):
    print(blue('Fetching job list for %s...' % self.url))
    table_data = [['Name', 'Status', 'Url']]
    for job_name, job_instance in self.server.get_jobs():
        table_data.append([
            job_instance.name,
            green('RUNNING') if job_instance.is_running() else blue('STOPPED'),
            job_instance.url,
        ])
    format_data(table_data)
    print("Jobs found: ", len(self.server.get_jobs_list()))
def compute_covariance(self, x, y, diag=False):
    x, y = format_data(x), format_data(y)
    x, y = np.copy(x), np.copy(y)
    assert x.shape[1] == self.N and y.shape[1] == self.N
    # Warp each input dimension through a beta CDF before evaluating the base kernel.
    for i in range(x.shape[1]):
        a = self.parameters[-2 * self.N + 2 * i].value
        b = self.parameters[-2 * self.N + 2 * i + 1].value
        x[:, i] = beta.cdf(x[:, i], a, b)
        y[:, i] = beta.cdf(y[:, i], a, b)
    val = self.kernel.compute_covariance(x, y, diag=diag)
    return val
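# Hedged sketch (not from the original source): illustrates the input-warping idea used
# above. scipy.stats.beta.cdf maps [0, 1] monotonically onto [0, 1], so warping each
# column before the base kernel lets a stationary kernel capture non-stationary behavior.
# The shape parameters here are arbitrary; in the class above they come from self.parameters.
import numpy as np
from scipy.stats import beta

x = np.linspace(0.0, 1.0, 5)
a, b = 2.0, 0.5  # example shape parameters
print(beta.cdf(x, a, b))  # still in [0, 1], but the spacing of the points has changed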
def c_repo_list (self, a, username, **kw) :
    if kw.get("admin") :  # show all repositories
        _repos = self._config_db.repositories
    else :
        _repos = self._config_db.get_user_property(username, "repository", list(), )

    _values = map(
        lambda x : (
            "%s" % (self._config_db.get_repository_property(x, "path"), ),
            "%s%s" % (
                x,
                self._config_db.get_repository_property(x, "description", "").strip()
                    and (" (%s)" % self._config_db.get_repository_property(x, "description", "").strip())
                    or "",
            ),
            self._config_db.is_remote_repository(x) and " O" or " X",
        ),
        _repos,
    )
    #_values.sort()

    return utils.format_data(
        _values,
        width=self._window_size[1],
        captions=("path", "alias", "is remote?", ),
        num_columns=3,
    )
def job_list_active(self):
    print(blue('Fetching job list for %s...' % self.url))
    table_data = [['Name', 'Status', 'Url']]
    count = 0
    for job_name, job_instance in self.server.get_jobs():
        if not job_instance.is_enabled():
            continue
        # Cap the table at the first few enabled jobs.
        count += 1
        if count >= 10:
            break
        table_data.append([
            job_instance.name,
            self.job_status(job_instance),
            job_instance.url,
        ])
    format_data(table_data)
    print("Jobs found: ", len(self.server.get_jobs_list()))
def setUp(self):
    self.ud_obj = dict(
        user_id=1,
        activity_id=1,
        temp=(1, 20, 30),
        wind=(0, 0, 10),
        cloud=(0, 0, 100),
        rain=(0.0, 0.0, 0.5),
        weights=dict(
            wind=1.0,
            rain=0.9,
            temp=0.65,
            cloud=0.1,
        ),
        min_size=1,
    )
    time_ranges = []
    for i in range(5):
        time_ranges.append((i * 24 + 6, i * 24 + 10))
    for i in range(5, 7):
        time_ranges.append((i * 24 + 8, i * 24 + 17))
    print(time_ranges)
    self.ud_obj['time_ranges'] = time_ranges
    with open('tests/data/data.json') as fh:
        self.weather_data_bc = BCMock(format_data(json.loads(fh.read())))
def _gp_posterior(self, x):
    '''
    Using Algorithm 2.1 from Gaussian Processes for Machine Learning.

    Returns: mean, variance
    '''
    X, Y, noise, kernel = self.X, self.Y, self.noise, self.kernel
    if self._has_noise_prior:
        noise = noise.value
    x = format_data(x, dim=self._dim)
    if self._recompute:
        self._compute_aux()
    L, lower, alpha, ll = self._L, self._lower, self._alpha, self._ll
    K_x = kernel.compute_covariance(x, X)
    v = linalg.solve_triangular(L, K_x, lower=lower)
    mean = K_x.transpose().dot(alpha)
    # compute ONLY the diagonal of the predictive covariance
    var = kernel.compute_covariance(x, x, diag=True)[:, 0] - (v * v).sum(axis=0) + noise
    return mean[:, 0], var
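# Hedged sketch (an assumption, not the original _compute_aux): GPML Algorithm 2.1
# precomputes a Cholesky factor of the noisy kernel matrix and the weights alpha, which
# _gp_posterior above then reuses. The attribute names mirror those referenced above;
# the actual implementation in the source may differ.
import numpy as np
from scipy import linalg

def _compute_aux(self):
    n = self.X.shape[0]
    noise = self.noise.value if self._has_noise_prior else self.noise
    K = self.kernel.compute_covariance(self.X, self.X)
    self._L = linalg.cholesky(K + noise * np.eye(n), lower=True)
    self._lower = True
    # alpha = (K + noise * I)^{-1} Y via two triangular solves
    self._alpha = linalg.cho_solve((self._L, True), self.Y)
    # log marginal likelihood, up to the usual constants
    self._ll = (-0.5 * (self.Y.T @ self._alpha).item()
                - np.sum(np.log(np.diag(self._L)))
                - 0.5 * n * np.log(2 * np.pi))
    self._recompute = False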
def city_weather_forecast(api_key, city_name):
    try:
        URL = WEATHER_URL + 'forecast?' + "appid=" + api_key + "&q=" + city_name
        get_data = requests.get(URL, headers=HEADERS, timeout=10).json()
        return format_data(
            json.loads(json.dumps(get_data, default=_json_encode)))
    except requests.exceptions.Timeout:
        return abort(408, {"status": False, "message": "Request timeout"})
def main(sc):
    file_key = "7beac85e-00a5-48ae-af2a-aaf9a332463b"
    weather_data = utils.format_data(utils.get_s3_json_file(file_key))
    weather_data_bc = sc.broadcast(weather_data)
    user_data_rdd = sc.parallelize(user_data)
    results = user_data_rdd.map(
        lambda ud_obj: utils.map_func(weather_data_bc, ud_obj)).collect()
    for result in results:
        print("User: %s" % (result['user_id'], ))
        print(result['wws'])
def c_user_view (self, a, username, **kw) :
    _values = map(
        lambda x : (x, self._config_db.get_user_property(username, x, "", ), ),
        ("realname", "email", "public_key", "admin", ),
    )
    _values.insert(0, (
        "has password?",
        bool(self._config_db.get_user_property(username, "password")) and "yes" or "no",
    ), )
    _values.insert(0, ("username", username, ), )

    _repositories = self._config_db.get_user_property(username, "repository", )
    _values.append(("repository", _repositories and ", ".join(_repositories) or "", ), )

    return utils.format_data(
        _values,
        width=self._window_size[1],
        captions=("key", "value", ),
    )
def print_user_list (self, userlist, ) :
    _values = map(
        lambda x : (
            x,
            self._config_db.get_full_username(x),
            self._config_db.is_admin(x) and "O" or "X",
        ),
        userlist,
    )

    return [
        i for i in utils.format_data(
            _values and _values or (("no users", "", ), ),
            width=self._window_size[1],
            captions=("username", "realname", "is admin?", ),
        )
    ]
def gen_data_page(self, pattern, is_kernel=0):
    self.data_page_str.clear()
    # TO DO: need to embed num_of_kernel_data_pages, num_of_data_pages, etc.
    # in the riscv_core_setting
    page_cnt = 1 if is_kernel else 2
    page_size = 4096
    for section_idx in range(page_cnt):
        if is_kernel:
            self.data_page_str.append("kernel_data_page_{}:".format(section_idx))
        else:
            self.data_page_str.append("data_page_{}:".format(section_idx))
        # TO DO: need to embed data_page_alignment in the core_setting
        self.data_page_str.append(".align 12")
        for i in range(0, page_size, 32):
            tmp_data = self.gen_data(i, pattern, 32)
            tmp_str = ".word {:{}}".format(utils.format_data(tmp_data), utils.length)
            self.data_page_str.append(tmp_str)
def regress(self, x, num_samples=10, marginalize=True):
    x = format_data(x, dim=self._dim)
    if self.marginalize and marginalize:
        # Average the posterior over hyperparameter samples.
        means = np.zeros((x.shape[0], num_samples))
        vars = np.zeros((x.shape[0], num_samples))
        hp_samples = self.sampler.sample(num_samples)
        for i in range(num_samples):
            self.set_kernel_parameters(hp_samples[i])
            mean, var = self._gp_posterior(x)
            means[:, i] = mean
            vars[:, i] = var
        mean = means.mean(axis=1)
        var = vars.mean(axis=1)
        # var = mean**2 - (means**2 + vars).mean(axis=1)
    else:
        mean, var = self._gp_posterior(x)
    return mean, var
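# Hedged usage sketch (illustrative only; the `gp` object and the test grid are
# assumptions, not part of the original source): query the posterior mean and variance
# from a fitted object exposing the regress() method above.
import numpy as np

x_test = np.linspace(0.0, 1.0, 50).reshape(-1, 1)    # one-dimensional test inputs
mean, var = gp.regress(x_test, num_samples=10, marginalize=True)
upper = mean + 2.0 * np.sqrt(var)                    # rough 95% credible band
lower = mean - 2.0 * np.sqrt(var)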
def main(config, data):
    X, y = utils.format_data(config, data)
    print('| Creating log configurations ...')
    config.create_log_configurations()
    print('| Saving log configurations ...')
    config.comments = f'{config.comments}, {str(X.shape)}'
    config.save_config()

    folds = config.data['folds']
    current_best_acc, current_best_mcc = 0, -2
    best_fold = None

    print(f'| Training model with {folds} folds ...')
    folder = KFold(n_splits=folds)
    exp_evaluation = experiment_history.EvaluationHistory(config=config)
    exp_training = experiment_history.TrainingHistory(log_dir=config.log_dir)

    for k, (train_index, test_index) in enumerate(folder.split(X)):
        print(f'| X: {X.shape}\n| y: {y.shape}')
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Build model
        model_handler = model.Model(config=config)
        md = model_handler.build_model(x_train.shape)

        start, used = 0, 0
        try:
            # Train model
            start = time.time()
            history = model_handler.train_model(x_train, y_train)
            used = time.time() - start
            print(f'| K = {k + 1} | Used {used:.2f} seconds')
        except KeyboardInterrupt:
            break

        # Evaluate model
        print('| Evaluating model on test set ...')
        predictions = md.predict(x_test)
        is_football = config.usecase == config.FOOTBALL
        if is_football:
            evaluations = exp_evaluation.custom_evaluate(predictions, y_test, k + 1, data['columns'])
        else:
            evaluations = exp_evaluation.evaluate(predictions, y_test, k + 1)
        exp_training.update_history(history.history, used)

        # Save model
        print('| Saving model ...')
        md.save(f'{config.log_dir}/models/k{k + 1}.h5')

        # Replace current best model if ACC is better
        # (or ACC is similar but MCC is better).
        same_acc = abs(evaluations['ACC'] - current_best_acc) < 0.05
        better_mcc = evaluations['MCC'] > current_best_mcc
        better_acc = evaluations['ACC'] > current_best_acc
        if better_acc or (same_acc and better_mcc):
            print('| Replacing best model ...')
            md.save(f'{config.log_dir}/models/best.h5')
            current_best_mcc = evaluations['MCC']
            current_best_acc = evaluations['ACC']
            best_fold = k + 1

        # Clear model/session
        del md
        del model_handler
        K.clear_session()
        print('=======================')

    # Rename best model for best fold and save evaluations
    if os.path.exists(f'{config.log_dir}/models/best.h5'):
        os.rename(f'{config.log_dir}/models/best.h5',
                  f'{config.log_dir}/models/best_({best_fold}).h5')
    if best_fold or current_best_mcc != -2:
        exp_evaluation.save_history()
        exp_evaluation.save_statistics()
        exp_training.save_history()
        exp_training.save_statistics()
        print(f'| Saved all in: {config.log_dir}')
    else:
        print('| Did not save anything')
        if a != pred:
            incorrect += 1
            activity_error[a] += 1

    for a in test_activites:
        print "Activity " + str(a) + ": " + str(float(activity_error[a]) / activity_count[a])

    utils.show_confusion_matrix(y_test, y_pred)
    return incorrect / total


if __name__ == "__main__":
    X, y = data.load_time()
    activities = utils.format_data(X, y)
    train, test = utils.split_training(activities, .2)
    hmms = train_all_hmm(train)
    error = test_error(test, hmms)
    print error

    #for a in activities:
    #    print "Activity " + str(a)
    #    for array in activities[a]:
    #        print np.shape(array)
    #
    # X = [np.array(x) for x in activities[2]]
    # print len(X), X[:2]
import csv

import numpy as np

import utils
import plot

DATASETS = ["LGA", "SFO", "MDW", "ORD"]

for DATASET in DATASETS:
    with open("data/" + DATASET + ".csv", 'rb') as rawcsv:
        # np.set_printoptions(threshold=np.inf, suppress=True)
        data_orig = csv.reader(rawcsv, delimiter=',')
        data, dates, wind = utils.format_data(data_orig)

        date_vector = dates[:, 0]
        date_vector *= 12.0
        date_vector += 1

        print(data.shape)
        print(date_vector.shape)

        plot.plot2d((date_vector, data[:, 1]), DATASET + "-temp-time",
                    c="black", a=0.2)
        plot.plot2d((date_vector, data[:, 7]), DATASET + "-humidity-time",
                    c="black", a=0.2)
        plot.plot2d((date_vector, wind),
def trainNestedCV(direct, subject, session, filename, hyp_params, parameters):
    subj = load_subject(direct, subject, 1, filename)["subject"]
    # data = subj.data3D.astype(np.float32)  # convert data to 3d for deep learning
    # labels = subj.labels.astype(np.int64)
    # labels[:] = [x - 1 for x in labels]
    data, labels = format_data('words', subject, 4096)

    import random  # just for testing
    labels = []  # just for testing
    for i in range(200):  # just for testing
        labels.append(random.randint(0, 3))  # just for testing
    labels = np.array(labels).astype(np.int64)
    data = data[:200, :, 0:750]

    unique = np.unique(labels, return_counts=False)
    data_params = dict(n_classes=len(unique), n_chans=6, input_time_length=subj.epoch)
    # n_chans = subj.n_chans
    # w = windows(data, subj, 500, 250, 500)
    # fs = subj.sfreq  # list of windows

    num_folds = 2
    skf = StratifiedKFold(n_splits=num_folds, shuffle=False,
                          random_state=10)  # don't randomize trials to preserve structure

    trainsetlist, testsetlist = [], []
    inner_fold_acc, inner_fold_loss, inner_fold_CE = [], [], []

    subj_results = Results(subject, filename, num_folds)  # , class_names=["apple", "orange", "car", "bus"]
    subj_results.change_directory(direct)
    subj_results.get_acc_loss_df(hyp_params, 'Fold')  # empty dataframe headed with each HP set

    clf = Classification(hyp_params, parameters, data_params, "01", "shallow", "words")  # classifier object

    print(f"Inner-fold training for Subject {subject} in progress...")
    for inner_ind, outer_index in skf.split(data, labels):
        inner_fold, outer_fold = data[inner_ind], data[outer_index]
        inner_labels, outer_labels = labels[inner_ind], labels[outer_index]
        subj_results.concat_y_true(outer_labels)
        trainsetlist.append(SignalAndTarget(inner_fold, inner_labels))  # used for outer-fold train/test
        testsetlist.append(SignalAndTarget(outer_fold, outer_labels))
        for train_idx, valid_idx in skf.split(inner_fold, inner_labels):
            X_Train, X_val = inner_fold[train_idx], inner_fold[valid_idx]
            y_train, y_val = inner_labels[train_idx], inner_labels[valid_idx]
            train_set = SignalAndTarget(X_Train, y_train)
            val_set = SignalAndTarget(X_val, y_val)
            hyp_param_acc, hyp_param_loss = [], []
            hyp_param_acc, hyp_param_loss, hyp_param_CE = clf.train_inner(
                train_set, val_set, None, False)
            inner_fold_loss.append(hyp_param_loss)
            inner_fold_acc.append(hyp_param_acc)
            inner_fold_CE.append(hyp_param_CE)

    subj_results.fill_acc_loss_df(inner_fold_acc, inner_fold_loss, inner_fold_CE)
    subj_results.get_hp_means(hyp_params, "accuracy")  # needed to select inter-subject parameters
    subj_results.get_best_params("accuracy")
    clf.best_params = subj_results.best_params
    clf.set_best_params()
    print(f"Best parameters selected: {clf.best_params}")
    print("///////-------------------------------------------------------///////")
    print(f"Outer-fold training and testing for Subject {subject} in progress...")

    # accuracy score for each fold, combined predictions for each fold
    scores, fold_models, predictions, probabilities, outer_cross_entropy = clf.train_outer(
        trainsetlist, testsetlist, False)
    subj_results.outer_fold_accuracies = scores
    subj_results.y_pred = np.array(predictions)
    subj_results.y_probs = np.array(probabilities)
    subj_results.outer_fold_cross_entropies = outer_cross_entropy
    (subj_results.train_loss, subj_results.valid_loss, subj_results.test_loss,
     subj_results.train_acc, subj_results.valid_acc,
     subj_results.test_acc) = get_model_loss_and_acc(fold_models)
    subj_results.save_result()
    subj_results.subject_stats()
    print("")
    print(subj_results.subject_stats_df.head())
model_path = config["Model"]["Path"]
print("All configuration is imported")

cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server +
                      ';PORT=1433;DATABASE=' + database +
                      ';UID=' + username + ';PWD=' + password)
print("Connection to SQL Server is established")

# Load SQL Table into Dataframe
# image_df = utils.load_image_table(cnxn)
df = utils.load_image_table(cnxn)
print("Photo Data is loaded")

# Change Column Name and Data Type
df = utils.format_data(df)
print("Photo Data is formatted")

# Extract insights from existing columns, e.g. weekday from unix epoch
df = utils.extract_data(df)
print("More insight is gained from Photo Data")

# Predict Score
# Apply delay function on Image Post Time and Image Original User Last Post Time
# best_photos_sorted
features = [
    "caption_length",
    "english_content_length",
    "english_content_ratio",
NUM_TRAIN = 20000

np.set_printoptions(formatter={'float': '{:05.2f}'.format})

# This file does the testing without clearly time-dependent variables,
# such as wind direction, temperature, dewpoint, etc.
#
# It would thus not make sense to /try/ to predict temperature, etc.
# from our HMM classes (since we are ignoring them)

warnings.filterwarnings("ignore")

for DATASET in DATASETS:
    with open("data/" + DATASET + ".csv", 'rb') as rawcsv:
        our_csv = csv.reader(rawcsv, delimiter=',')
        data = utils.format_data(our_csv)[0]

    orig_data = data

    print("\n##### " + DATASET + " #####")
    print(data.shape)
    print

    # overall standard deviations of data
    std = np.std(data, axis=0)

    # HMM class estimates
    train = data[0:NUM_TRAIN].astype(int)
    test = data[NUM_TRAIN:].astype(int)

    # naive weather prediction: tomorrow has the same weather as today
    deltas = np.zeros((len(test) - 1, 21))
def objective(trial):
    # Open data files
    f_in = h5py.File(DT_FL_IN, "r")
    dt_in = f_in[DT_DST_IN]
    f_out = h5py.File(DT_FL_OUT, "r")
    dt_out = f_out[DT_DST_OUT]

    WD = 2
    # Dummy y_data
    x_data, _ = format_data(dt_in, wd=WD, get_y=True)
    _, y_data = format_data(dt_out, wd=WD, get_y=True)
    x_data = np.squeeze(x_data)

    # Split data and get slices
    idxs = split(x_data.shape[0], N_TRAIN, N_VALID, test_last=dt_in.attrs["idx"])
    slc_trn, slc_vld, slc_tst = slicer(x_data.shape, idxs)

    # Get data
    x_train = x_data[slc_trn[0]]
    y_train = y_data[slc_trn[0]]
    x_val = x_data[slc_vld[0]]
    y_val = y_data[slc_vld[0]]
    conv_shape = y_train.shape[1:3]

    # Strides cfg
    strd = [2, 2, 5, 5]

    # Limits and options
    epochs = 60
    # Filters
    flt_lm = [[4, 128], [4, 128], [4, 128]]
    d_lm = [1, 50]
    # Kernel
    k_lm = [3, 5]
    # Regularizer
    l2_lm = [1e-7, 1e-3]
    # Activation functions
    act_opts = ["relu", "elu", "tanh", "linear"]
    # Latent space cfg
    lt_sz = [5, 150]
    lt_dv = [0.3, 0.7]
    # Learning rate
    lm_lr = [1e-5, 1e-1]

    # Clear tensorflow session
    tf.keras.backend.clear_session()

    # Input
    inputs = layers.Input(shape=x_train.shape[1:])
    d = inputs

    # Decoder
    n_layers = trial.suggest_int("n_layers", 1, 3)
    flt = trial.suggest_int("nl_flt", d_lm[0], d_lm[1])
    # Reduction from output
    red = np.prod(strd[:n_layers])
    # Decoder first shape
    lt_shp = (np.array(conv_shape) / red).astype(int)
    # Decoder dense size
    n_flat = np.prod(lt_shp) * flt
    # Format stride list
    strd = strd[::-1][-n_layers:]

    # Latent -> Decoder layer
    # Activation
    act_lt = trial.suggest_categorical("lt_activation", act_opts)
    # Regularization
    l2_lt = int(trial.suggest_loguniform("lt_l2", l2_lm[0], l2_lm[1]))
    l2_reg = regularizers.l2(l=l2_lt)
    # Flat input to the decoder
    d = layers.Dense(n_flat, activation=act_lt, kernel_regularizer=l2_reg,
                     name="l1_dense_decoder")(inputs)
    # Reshape to the output of the encoder
    d = layers.Reshape(list(lt_shp) + [flt])(d)

    # Generate the convolutional layers
    for i in range(n_layers):
        # Get number of filters
        flt = trial.suggest_int("n{}_flt".format(i), flt_lm[i][0], flt_lm[i][1])
        # Get the kernel size
        k_sz = trial.suggest_categorical("d{}_kernel_size".format(i), k_lm)
        # Get the activation function
        act = trial.suggest_categorical("d{}_activation".format(i), act_opts)
        # Regularization value
        l2 = trial.suggest_loguniform("d{}_l2".format(i), l2_lm[0], l2_lm[1])
        l2_reg = regularizers.l2(l=l2)
        # Convolutional layer
        d = layers.Conv2DTranspose(
            flt,
            (k_sz, k_sz),
            strides=strd[i],
            activation=act,
            padding="same",
            kernel_regularizer=l2_reg,
            name="{}_decoder".format(i + 1),
        )(d)
        dp = 0
        # Dropout layers
        if dp > 0:
            d = layers.Dropout(dp, name="{}_dropout_decoder".format(i + 1))(d)

    decoded = layers.Conv2DTranspose(
        y_train.shape[3],
        (5, 5),
        activation="linear",
        padding="same",
        name="output_decoder",
    )(d)
    ae = Model(inputs, decoded, name="Decoder_nxt")

    # Early stopping monitoring the loss on the validation dataset
    monitor = "val_loss_norm_error"
    patience = int(epochs * 0.3)
    es = EarlyStopping(monitor=monitor, mode="min", patience=patience,
                       restore_best_weights=True)

    opt = "adam"
    if opt == "adam":
        k_optf = optimizers.Adam
    elif opt == "nadam":
        k_optf = optimizers.Nadam
    elif opt == "adamax":
        k_optf = optimizers.Adamax

    lr = trial.suggest_loguniform("lr", lm_lr[0], lm_lr[1])
    if lr > 0:
        k_opt = k_optf(learning_rate=lr)
    else:
        k_opt = k_optf()

    ae.compile(optimizer=k_opt, loss=loss_norm_error, metrics=["mse", loss_norm_error])

    batch_size = int(trial.suggest_uniform("batch_sz", 2, 32))
    ae.summary()
    hist = ae.fit(
        x_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_data=(x_val, y_val),
        callbacks=[KerasPruningCallback(trial, "val_loss_norm_error"), es],
        verbose=1,
    )

    txt = PREFIX + SUFFIX
    ae.save(txt.format(RUN_VERSION, trial.number))
    return min(hist.history["val_loss_norm_error"])
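# Hedged usage sketch (not part of the original file): an objective like the one above is
# typically handed to an Optuna study; the pruner choice and trial count here are
# illustrative assumptions.
import optuna

study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=50)
print("Best value:", study.best_value)
print("Best params:", study.best_params)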
    prediction = result.mean(axis=0)
    uncertainty = result.std(axis=0)
    return prediction, uncertainty


# Load Data
print("Loading and preprocessing Data...\n")
clean_data = load_preprocess_data()

# Preprocess Data
X = clean_data.drop('state', axis=1)
y = pd.DataFrame(clean_data['state'].values)
y.columns = ['state']
X = X.iloc[:500000]
y = y.iloc[:500000]
X = format_data(X)
del clean_data

# Extract categories
cats = extract_categories(y['state'].values)
cats.sort()
print(type(cats))
NUM_CATS = len(cats)
print("categories: ", cats)
print("number of categories: ", NUM_CATS)

# Scale Data
min_max_scaler = MinMaxScaler()
nsamples, nx, ny = X.shape
d2_X = X.reshape((nsamples, nx * ny))
d2_X_scaled = min_max_scaler.fit_transform(d2_X)
                       dtype=bool)
yvl_missing = np.array(validate_df.loc[:, 'COVAR_y1_MISSING':'COVAR_y3_MISSING'],
                       dtype=bool)

# read data
train_df['train_flag'] = True
validate_df['train_flag'] = False
data = pd.concat((train_df, validate_df))

# remove temporary data
del train_df
del validate_df

# basic formatting
Xtr, ytr, Xvl, yvl = utils.format_data(data, preprocessing=False)
del data

#
# do preprocessing
#
scaler = decomposition.RandomizedPCA()
#scaler = decomposition.SparsePCA(n_components=max_pca_components)
#scaler = decomposition.PCA(n_components='mle')

print 'PCA max features to keep: %d' % (max_pca_components)

# fit only for train data (http://cs231n.github.io/neural-networks-2/#datapre)
Xtr = scaler.fit_transform(Xtr)
Xvl = scaler.transform(Xvl)

#
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals.joblib import Memory

from utils import format_data

# Using joblib allows caching some of the results, in order to save time
# on computation
mem = Memory(cachedir='.')

################################################################################
# Load the data
X, y = format_data()

################################################################################
# Split data into training set and testing set
print "Splitting the data"
X_train, X_test = X[0:3000], X[3000:6000]
y_train, y_test = y[0:3000], y[3000:6000]

################################################################################
# Train the SVM classification model
print "Training the classification model"
cs = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1]
csx = range(len(cs))
precisions = []
recalls = []

for c in cs:
import plot
import utils

DATASETS = ["LGA", "SFO", "MDW", "ORD"]  # All final images are ORD
HMM_CLASSES = [2, 4, 6, 8, 10]
NUM_TRAIN = 20000

np.set_printoptions(formatter={'float': '{:05.2f}'.format})

warnings.filterwarnings("ignore")

for DATASET in DATASETS:
    with open("data/" + DATASET + ".csv", 'rb') as rawcsv:
        our_csv = csv.reader(rawcsv, delimiter=',')
        data, dates, wind = utils.format_data(our_csv)

    orig_data = np.array(data)

    print("\n##### " + DATASET + " #####")
    print(data.shape)
    print

    # overall standard deviations of data
    std = np.std(data, axis=0)

    # HMM class estimates
    train = data[0:NUM_TRAIN].astype(int)
    test = data[NUM_TRAIN:].astype(int)

    for ii in HMM_CLASSES:
        # Run Gaussian HMM
def classifier_rbf():
    X, y = format_data()
    clf = SVC(C=10, gamma=0.002)
    clf = mem.cache(clf.fit)(X, y)
    return clf
def network_model(subject_id, model_type, data_type, cropped, cuda, parameters, hyp_params):
    best_params = dict()  # dictionary to store hyper-parameter values

    ##### Parameters passed to the function #####
    max_epochs = parameters['max_epochs']
    max_increase_epochs = parameters['max_increase_epochs']
    batch_size = parameters['batch_size']

    ##### Constant parameters #####
    best_loss = 100.0  # instantiate starting point for loss
    iterator = BalancedBatchSizeIterator(batch_size=batch_size)
    stop_criterion = Or([MaxEpochs(max_epochs),
                         NoDecrease('valid_misclass', max_increase_epochs)])
    monitors = [LossMonitor(), MisclassMonitor(), RuntimeMonitor()]
    model_constraint = MaxNormDefaultConstraint()
    epoch = 4096

    ##### Collect and format data #####
    if data_type == 'words':
        data, labels = format_data(data_type, subject_id, epoch)
        data = data[:, :, 768:1280]  # within-trial window selected for classification
    elif data_type == 'vowels':
        data, labels = format_data(data_type, subject_id, epoch)
        data = data[:, :, 512:1024]
    elif data_type == 'all_classes':
        data, labels = format_data(data_type, subject_id, epoch)
        data = data[:, :, 768:1280]

    x = lambda a: a * 1e6  # improves numerical stability
    data = x(data)
    data = normalize(data)
    data, labels = balanced_subsample(data, labels)  # downsampling the data to ensure equal classes
    data, _, labels, _ = train_test_split(data, labels, test_size=0, random_state=42)  # redundant shuffle of data/labels

    ##### Model inputs #####
    unique, counts = np.unique(labels, return_counts=True)
    n_classes = len(unique)
    n_chans = int(data.shape[1])
    input_time_length = data.shape[2]

    ##### k-fold nested cross-validation #####
    num_folds = 4
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=10)
    out_fold_num = 0  # outer-fold number
    cv_scores = []

    ##### Outer fold #####
    for inner_ind, outer_index in skf.split(data, labels):
        inner_fold, outer_fold = data[inner_ind], data[outer_index]
        inner_labels, outer_labels = labels[inner_ind], labels[outer_index]
        out_fold_num += 1
        # list for storing cross-validated scores
        loss_with_params = dict()  # for storing param values and losses
        in_fold_num = 0  # inner-fold number

        ##### Inner fold #####
        for train_idx, valid_idx in skf.split(inner_fold, inner_labels):
            X_Train, X_val = inner_fold[train_idx], inner_fold[valid_idx]
            y_train, y_val = inner_labels[train_idx], inner_labels[valid_idx]
            in_fold_num += 1
            train_set = SignalAndTarget(X_Train, y_train)
            valid_set = SignalAndTarget(X_val, y_val)
            loss_with_params[f"Fold_{in_fold_num}"] = dict()

            ##### Nested cross-validation #####
            for drop_prob in hyp_params['drop_prob']:
                for loss_function in hyp_params['loss']:
                    for i in range(len(hyp_params['lr_adam'])):
                        model = None  # ensure no duplication of models
                        # model, learning-rate and optimizer setup according to model_type
                        if model_type == 'shallow':
                            model = ShallowFBCSPNet(
                                in_chans=n_chans, n_classes=n_classes,
                                input_time_length=input_time_length,
                                n_filters_time=80, filter_time_length=40,
                                n_filters_spat=80, pool_time_length=75,
                                pool_time_stride=25, final_conv_length='auto',
                                conv_nonlin=square, pool_mode='max',
                                pool_nonlin=safe_log, split_first_layer=True,
                                batch_norm=True, batch_norm_alpha=0.1,
                                drop_prob=drop_prob).create_network()
                            lr = hyp_params['lr_ada'][i]
                            optimizer = optim.Adadelta(model.parameters(), lr=lr, rho=0.9,
                                                       weight_decay=0.1, eps=1e-8)
                        elif model_type == 'deep':
                            model = Deep4Net(
                                in_chans=n_chans, n_classes=n_classes,
                                input_time_length=input_time_length,
                                final_conv_length='auto', n_filters_time=20,
                                n_filters_spat=20, filter_time_length=10,
                                pool_time_length=3, pool_time_stride=3,
                                n_filters_2=50, filter_length_2=15,
                                n_filters_3=100, filter_length_3=15,
                                n_filters_4=400, filter_length_4=10,
                                first_nonlin=leaky_relu, first_pool_mode='max',
                                first_pool_nonlin=safe_log, later_nonlin=leaky_relu,
                                later_pool_mode='max', later_pool_nonlin=safe_log,
                                drop_prob=drop_prob, double_time_convs=False,
                                split_first_layer=False, batch_norm=True,
                                batch_norm_alpha=0.1,
                                stride_before_pool=False).create_network()  # filter_length_4 changed from 15 to 10
                            lr = hyp_params['lr_ada'][i]
                            optimizer = optim.Adadelta(model.parameters(), lr=lr,
                                                       weight_decay=0.1, eps=1e-8)
                        elif model_type == 'eegnet':
                            model = EEGNetv4(
                                in_chans=n_chans, n_classes=n_classes,
                                final_conv_length='auto',
                                input_time_length=input_time_length,
                                pool_mode='mean', F1=16, D=2, F2=32,
                                kernel_length=64, third_kernel_size=(8, 4),
                                drop_prob=drop_prob).create_network()
                            lr = hyp_params['lr_adam'][i]
                            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0,
                                                   eps=1e-8, amsgrad=False)

                        set_random_seeds(seed=20190629, cuda=cuda)

                        if cuda:
                            model.cuda()
                            torch.backends.cudnn.deterministic = True
                            model = torch.nn.DataParallel(model)

                        log.info("{} model: ".format(str(model)))

                        model_loss_function = None

                        ##### Setup to run the selected model #####
                        model_test = Experiment(
                            model, train_set, valid_set, test_set=None,
                            iterator=iterator, loss_function=loss_function,
                            optimizer=optimizer, model_constraint=model_constraint,
                            monitors=monitors, stop_criterion=stop_criterion,
                            remember_best_column='valid_misclass',
                            run_after_early_stop=True,
                            model_loss_function=model_loss_function, cuda=cuda,
                            data_type=data_type, subject_id=subject_id,
                            model_type=model_type, cropped=cropped,
                            model_number=str(out_fold_num))
                        model_test.run()

                        model_loss = model_test.epochs_df['valid_loss'].astype('float')
                        current_val_loss = current_loss(model_loss)
                        loss_with_params[f"Fold_{in_fold_num}"][f"{drop_prob}/{loss_function}/{lr}"] = current_val_loss

        ##### Select and train optimized model #####
        df = pd.DataFrame(loss_with_params)
        df['mean'] = df.mean(axis=1)  # compute mean loss across k-folds
        writer_df = f"results_folder\\results\\S{subject_id}\\{model_type}_parameters.xlsx"
        df.to_excel(writer_df)

        best_dp, best_loss, best_lr = df.loc[df['mean'].idxmin()].__dict__['_name'].split("/")  # extract best param values
        if str(best_loss[10:13]) == 'nll':
            best_loss = F.nll_loss
        elif str(best_loss[10:13]) == 'cro':
            best_loss = F.cross_entropy

        print(f"Best parameters: dropout: {best_dp}, loss: {str(best_loss)[10:13]}, lr: {best_lr}")

        ##### Train model on the entire inner-fold set #####
        torch.backends.cudnn.deterministic = True
        model = None

        ##### Create outer-fold validation and test sets #####
        X_valid, X_test, y_valid, y_test = train_test_split(
            outer_fold, outer_labels, test_size=0.5, random_state=42, stratify=outer_labels)
        train_set = SignalAndTarget(inner_fold, inner_labels)
        valid_set = SignalAndTarget(X_valid, y_valid)
        test_set = SignalAndTarget(X_test, y_test)

        if model_type == 'shallow':
            model = ShallowFBCSPNet(
                in_chans=n_chans, n_classes=n_classes,
                input_time_length=input_time_length, n_filters_time=60,
                filter_time_length=5, n_filters_spat=40, pool_time_length=50,
                pool_time_stride=15, final_conv_length='auto',
                conv_nonlin=relu6, pool_mode='mean', pool_nonlin=safe_log,
                split_first_layer=True, batch_norm=True, batch_norm_alpha=0.1,
                drop_prob=0.1).create_network()  # 50 works better than 75
            optimizer = optim.Adadelta(model.parameters(), lr=2.0, rho=0.9,
                                       weight_decay=0.1, eps=1e-8)
        elif model_type == 'deep':
            model = Deep4Net(
                in_chans=n_chans, n_classes=n_classes,
                input_time_length=input_time_length, final_conv_length='auto',
                n_filters_time=20, n_filters_spat=20, filter_time_length=5,
                pool_time_length=3, pool_time_stride=3, n_filters_2=20,
                filter_length_2=5, n_filters_3=40, filter_length_3=5,
                n_filters_4=1500, filter_length_4=10, first_nonlin=leaky_relu,
                first_pool_mode='mean', first_pool_nonlin=safe_log,
                later_nonlin=leaky_relu, later_pool_mode='mean',
                later_pool_nonlin=safe_log, drop_prob=0.1,
                double_time_convs=False, split_first_layer=True,
                batch_norm=True, batch_norm_alpha=0.1,
                stride_before_pool=False).create_network()
            optimizer = AdamW(model.parameters(), lr=0.1, weight_decay=0)
        elif model_type == 'eegnet':
            model = EEGNetv4(
                in_chans=n_chans, n_classes=n_classes,
                final_conv_length='auto',
                input_time_length=input_time_length, pool_mode='mean',
                F1=16, D=2, F2=32, kernel_length=64,
                third_kernel_size=(8, 4), drop_prob=0.1).create_network()
            optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay=0,
                                   eps=1e-8, amsgrad=False)

        if cuda:
            model.cuda()
            torch.backends.cudnn.deterministic = True
            # model = torch.nn.DataParallel(model)

        log.info("Optimized model")
        model_loss_function = None

        ##### Setup to run the optimized model #####
        optimized_model = op_exp(
            model, train_set, valid_set, test_set=test_set,
            iterator=iterator, loss_function=best_loss, optimizer=optimizer,
            model_constraint=model_constraint, monitors=monitors,
            stop_criterion=stop_criterion,
            remember_best_column='valid_misclass',
            run_after_early_stop=True,
            model_loss_function=model_loss_function, cuda=cuda,
            data_type=data_type, subject_id=subject_id,
            model_type=model_type, cropped=cropped,
            model_number=str(out_fold_num))
        optimized_model.run()

        log.info("Last 5 epochs")
        log.info("\n" + str(optimized_model.epochs_df.iloc[-5:]))

        writer = f"results_folder\\results\\S{subject_id}\\{data_type}_{model_type}_{str(out_fold_num)}.xlsx"
        optimized_model.epochs_df.iloc[-30:].to_excel(writer)

        accuracy = 1 - np.min(np.array(optimized_model.class_acc))
        cv_scores.append(accuracy)  # k accuracy scores for this param set

    ##### Print and store fold accuracies and mean accuracy #####
    print(f"Class Accuracy: {np.mean(np.array(cv_scores))}")
    results_df = pd.DataFrame(dict(cv_scores=cv_scores,
                                   cv_mean=np.mean(np.array(cv_scores))))
    writer2 = f"results_folder\\results\\S{subject_id}\\{data_type}_{model_type}_cvscores.xlsx"
    results_df.to_excel(writer2)

    return optimized_model, np.mean(np.array(cv_scores))
import csv

import numpy as np
from sklearn import manifold, decomposition, cluster

import plot
import utils

# Only looks at MDW cuz this is slow,
# and MDW offers enough interesting things to see (or lack thereof)
with open('data/MDW.csv', 'rb') as MDWcsv:
    # np.set_printoptions(threshold=np.inf, suppress=True)
    csv = csv.reader(MDWcsv, delimiter=',')
    data = utils.format_data(csv)[0]

print("Using PCA, n=2")
pca = decomposition.PCA(n_components=2)
output = pca.fit_transform(data)
plot.plot2d(tuple(output[::7].T), "humidity-pca", c=data[::7, 7], a=0.8)
plot.plot2d(tuple(output[::7].T), "temperature-pca", c=data[::7, 1], a=0.8)

print("Using PCA, n=3")
pca = decomposition.PCA(n_components=3)
output = pca.fit_transform(data)
plot.plot3d_anim(tuple(output[::7].T), "humidity-pca3d", c=data[::7, 7], a=0.8)
plot.plot3d_anim(tuple(output[::7].T), "temperature-pca3d", c=data[::7, 1], a=0.8)
'''Training'''
import utils
import svmutil
from grid import find_parameters

TRAIN_DATA = 'data/training_data_libsvm'
TEST_DATA = 'data/testing_data_libsvm'
BARE_DATA = 'data/training.data'
MODEL_PATH = 'model/speech.model'

training_data, testing_data = utils.load_data(BARE_DATA, 20000)
utils.format_data(training_data, TRAIN_DATA)
utils.format_data(testing_data, TEST_DATA)
def objective(trial):
    # Open data file
    f = h5py.File(DT_FL, "r")
    dt = f[DT_DST]

    # Format data for LSTM training
    x_data, y_data = format_data(dt, wd=WD, get_y=True)
    x_data = np.squeeze(x_data)

    # Split data and get slices
    idxs = split(x_data.shape[0], N_TRAIN, N_VALID)
    slc_trn, slc_vld, slc_tst = slicer(x_data.shape, idxs)

    # Get data
    x_train = x_data[slc_trn[0]]
    y_train = y_data[slc_trn[0]] - x_train
    x_val = x_data[slc_vld[0]]
    y_val = y_data[slc_vld[0]] - x_val

    # Limits and options
    # Filters
    # n_lstm = [[4, 128], [4, 128], [4, 128]]
    n_lstm = [[4, 196], [4, 196], [4, 196]]
    # Regularizer
    l2_lm = [1e-7, 1e-3]
    # Activation functions
    act_opts = ["relu", "elu", "tanh", "linear"]
    # Latent space cfg
    lt_sz = [5, 150]
    lt_dv = [0.3, 0.7]
    # Learning rate
    lm_lr = [1e-5, 1]

    # Clear tensorflow session
    tf.keras.backend.clear_session()

    # Input
    inputs = layers.Input(shape=x_train.shape[1:])
    p = inputs

    # Dense layers
    # n_lyr_dense = trial.suggest_int("n_lyr_dense", 0, 2)
    n_lyr_dense = trial.suggest_int("n_lyr_dense", 1, 3)
    for i in range(n_lyr_dense):
        # For the current layer
        # Get number of filters
        l = trial.suggest_int("n{}_dense".format(i), n_lstm[i][0], n_lstm[i][1])
        # Get the activation function
        act = trial.suggest_categorical("d{}_activation".format(i), act_opts)
        # Regularization value
        l2 = trial.suggest_loguniform("d{}_l2".format(i), l2_lm[0], l2_lm[1])
        l2_reg = regularizers.l2(l=l2)
        # Set layer
        p = layers.Dense(
            l,
            activation=act,
            # kernel_regularizer=l2_reg,
            name="{}_dense".format(i + 1),
        )(p)
        # Dropout
        dp = trial.suggest_uniform("d{}_dropout".format(i), 0, 1)
        p = layers.Dropout(dp, name="{}_dropout_dense".format(i + 1))(p)
        bn = trial.suggest_categorical("d{}_batchnorm".format(i), [0, 1])
        if bn == 1:
            p = layers.BatchNormalization(name="{}_bnorm_dense".format(i + 1))(p)

    out = layers.Dense(y_data.shape[1], activation="linear")(p)
    pred = Model(inputs, out, name="auto_encoder_add")

    # opt_opts = ["adam", "nadam", "adamax", "RMSprop"]
    # opt = trial.suggest_categorical("optimizer", opt_opts)
    opt = "adam"
    if opt == "adam":
        k_optf = optimizers.Adam
    elif opt == "nadam":
        k_optf = optimizers.Nadam
    elif opt == "adamax":
        k_optf = optimizers.Adamax
    elif opt == "RMSprop":
        k_optf = optimizers.RMSprop

    lr = trial.suggest_loguniform("lr", lm_lr[0], lm_lr[1])
    if lr > 0:
        k_opt = k_optf(learning_rate=lr)
    else:
        k_opt = k_optf()

    pred.compile(optimizer=k_opt, loss="mse", metrics=["mse", loss_norm_error])

    batch_size = int(trial.suggest_uniform("batch_sz", 2, 32))
    pred.summary()
    hist = pred.fit(
        x_train,
        y_train,
        epochs=100,
        batch_size=batch_size,
        shuffle=True,
        validation_data=(x_val, y_val),
        callbacks=[KerasPruningCallback(trial, "val_mse")],
        verbose=1,
    )

    txt = PREFIX + SUFFIX
    pred.save(txt.format(RUN_VERSION, trial.number))
    return hist.history["val_mse"][-1]
if __name__ == '__main__':
    logger = logging.getLogger('trainlogger')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='%(levelname)s\t%(asctime)s\t%(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S')
    handler = logging.FileHandler('./logs/train.log', 'a')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    if sys.argv[1] == 'init':
        logger.info('init data')
        utils.format_data(config.YULIAO, config.CUT_WORDS)
        utils.make_train_test(config.CUT_WORDS)
        utils.gen_vocabulary_file(config.TRAIN_ENC_FILE,
                                  config.TRAIN_ENC_VOCABULARY, None)
        utils.gen_vocabulary_file(config.TRAIN_DEC_FILE,
                                  config.TRAIN_DEC_VOCABULARY, None)
        utils.convert_to_vec(config.TRAIN_ENC_FILE,
                             config.TRAIN_ENC_VOCABULARY,
                             config.TRAIN_ENC_VEC)
        utils.convert_to_vec(config.TRAIN_DEC_FILE,
                             config.TRAIN_DEC_VOCABULARY,
                             config.TRAIN_DEC_VEC)
        utils.convert_to_vec(config.TEST_ENC_FILE,
                             config.TRAIN_ENC_VOCABULARY,
                             config.TEST_ENC_VEC)
        utils.convert_to_vec(config.TEST_DEC_FILE,
                             config.TRAIN_DEC_VOCABULARY,
dt_dst = "scaled_data"

# The percentage for the test is implicit
n_train = 0.8
n_valid = 0.1

# Select the variable to train
# 0: Temperature - 1: Pressure - 2: Velocity - None: all
var = 2

# %%
# Open data file
f = h5py.File(dt_fl, "r")
dt = f[dt_dst]

x_data, y_data = format_data(dt, wd=3, var=2, get_y=True, cont=True)

# Split data file
idxs = split(x_data.shape[0], n_train, n_valid)
slc_trn, slc_vld, slc_tst = slicer(x_data.shape, idxs)

# Slice data
x_train = x_data[slc_trn]
x_val = x_data[slc_vld]

slc_trn, slc_vld, slc_tst = slicer(y_data.shape, idxs)
y_train = y_data[slc_trn]
y_val = y_data[slc_vld]

# %%
# LSTM neural network settings
def __init__(self, optfun, X, Y, noise, kernel, bounds=None, burnin=500,
             resample=50, n_init=1, tol=1e-6, sobol_seed=1991,
             sampler=MDSliceSampler, sampler_args={}):
    assert isinstance(kernel, BaseKernel)
    self._dim = X.shape[1] if len(X.shape) > 1 else 1  # get dimension of input space
    self.optfun = optfun
    self.X = np.copy(format_data(X, dim=self._dim))
    self.Y = np.copy(format_data(Y, dim=1))
    self.noise = noise
    self.kernel = kernel
    self.burnin = burnin
    self.resample = resample
    self.tol = tol
    self._sobol_seed = sobol_seed

    if bounds is None:
        bounds = []
        for i in range(self._dim):
            bounds.append((0., 1.))
    assert len(bounds) == self._dim
    self.bounds = bounds

    if self.X.shape[0] == 0:
        X_new, Y_new = self._random_search(n_init)
        self.X = np.concatenate((self.X, X_new), axis=0)
        self.Y = np.concatenate((self.Y, Y_new), axis=0)
    self._nu = npr.randn(self.X.shape[0], 1)

    self._has_noise_prior = False
    if isinstance(noise, Parameter):
        self._has_noise_prior = True

    # get initial values of kernel hyperparameters
    kernel_parameters = kernel.get_valid_parameters()
    x0 = np.zeros(len(kernel_parameters) + (1 if self._has_noise_prior else 0))
    for i, par in enumerate(kernel_parameters):
        x0[i] = par.value
    if self._has_noise_prior:
        x0[-1] = self.noise.value

    self._recompute = True

    # get bounds on kernel parameters, see if you should marginalize
    bounds = []
    for par in kernel_parameters:
        bounds.append(par.prior.support)
    if self._has_noise_prior:
        bounds.append(self.noise.prior.support)

    if len(bounds) > 0:
        self.sampler = sampler(self._parameter_posterior, x0, bounds=bounds,
                               log=True, burnin=burnin, **sampler_args)
        self.marginalize = True
    else:
        self.sampler = None
        self.marginalize = False
def classifier():
    X, y = format_data()
    clf = LinearSVC(C=0.005)
    clf = mem.cache(clf.fit)(X, y)
    return clf