def preprocessing(problem):
    path = os.path.join('datasets', '{}.csv'.format(problem))
    data = np.genfromtxt(path, delimiter=',')
    inputs = data[:, :-1]
    labels = data[:, -1]
    n_classes = len(np.unique(labels))
    n_dims = inputs.shape[1]

    # one-hot code targets
    if np.min(labels) != 0:
        labels -= 1  # need dummy code to start at zero for this to work
    labels = labels.astype(int)
    labels = np.eye(n_classes)[np.array(labels)]

    # norm data to be between -1 and 1
    if problem[:-1] != 'shj':
        inputs -= np.min(inputs, axis=0)
        inputs /= np.ptp(inputs, axis=0)
        inputs *= 2
        inputs -= 1
        full_set = np.append(inputs, labels, 1)
    else:
        full_set = np.append(inputs, labels, 1)
        full_set = np.concatenate((full_set, full_set), axis=0)  # to match Nosofsky+ '94

    return [full_set, n_classes, n_dims]
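# Usage sketch (the dataset name is hypothetical): assumes datasets/iris.csv
# exists with feature columns followed by a numeric label column.
full_set, n_classes, n_dims = preprocessing('iris')
print(full_set.shape, n_classes, n_dims)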
def organize_data_from_txt(data_filepath, delimiter=','):
    data = np.genfromtxt(data_filepath, delimiter=delimiter)
    data = {
        'inputs': data[:, :-1],
        'labels': data[:, -1],
        'categories': np.unique(data[:, -1]),
    }

    # map categories to label indices
    data['idx_map'] = {
        category: idx
        for idx, category in enumerate(data['categories'])
    }

    # map original labels to label indices
    data['labels_indexed'] = [
        data['idx_map'][label] for label in data['labels']
    ]

    # generate one hot targets
    data['one_hot_targets'] = np.eye(len(
        data['categories']))[data['labels_indexed']]

    return data
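# Usage sketch with a hypothetical file; the returned dict bundles the raw
# labels with their index-coded and one-hot-coded versions.
data = organize_data_from_txt('leaf.csv')
print(data['inputs'].shape, data['one_hot_targets'].shape)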
def generate_test_and_train(path, min_table_size, max_table_size,
                            train_test_split, excluded_table_sizes=[]):
    runtime_data = np.genfromtxt(path, delimiter=',')

    # remove header row and operator column
    # columns: 0 tablerows, 1 tablesizekb, 2 selectivity, 3 tuplewidth,
    #          4 attrInPred, 5 runtime (operator is dropped)
    runtime_data = runtime_data[1:, 0:6]

    runtime_data = runtime_data[(runtime_data[:, 1] >= min_table_size)
                                & (runtime_data[:, 1] <= max_table_size)]
    for excluded_table_size in excluded_table_sizes:
        runtime_data = runtime_data[runtime_data[:, 1] != excluded_table_size]

    # usual train/test split with a fixed seed for reproducibility
    np.random.seed(42)
    np.random.shuffle(runtime_data)
    size_split_point = int(len(runtime_data) * train_test_split)
    test_data = runtime_data[size_split_point:]
    training_data = runtime_data[:size_split_point]
    return training_data, test_data
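# Usage sketch with hypothetical arguments: keep tables sized between 100 KB
# and 10000 KB, drop the 500 KB tables, and use 80% of the rows for training.
training_data, test_data = generate_test_and_train(
    'runtimes.csv', 100, 10000, 0.8, excluded_table_sizes=[500])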
def plotTrainingCurve():
    X = np.genfromtxt(r'dkfTrainTest.csv', delimiter=',')
    t = np.arange(X.shape[0])
    plt.clf()
    plt.plot(t, X)
    #plt.plot(t, X[:, 1])
    #plt.legend(['Train', 'Test'])
    plt.savefig('trainingCurvedkf.jpg')
def load_data(self):
    # load data (np.matrix keeps the sliced columns two-dimensional)
    data = np.matrix(
        np.genfromtxt(
            '../../mlrefined_datasets/superlearn_datasets/bacteria_data.csv',
            delimiter=','))
    self.x = np.asarray(data[:, 0])
    self.y = np.asarray(data[:, 1])
def get_data():
    # get all data inputs and outputs from the csv file
    all_data = np.genfromtxt("Data_for_UCI_named.csv", delimiter=",")
    data = all_data[1:, :-1]  # drop the header row and the string label column
    n = np.size(data, 0)
    # binarize the last remaining column ('stab') by its sign
    for i in range(n):
        if data[i, -1] < 0:
            data[i, -1] = 0
        else:
            data[i, -1] = 1
    return data
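# A vectorized sketch equivalent to the row loop above for the same CSV
# layout (the last kept column is the numeric 'stab' value):
def get_data_vectorized():
    all_data = np.genfromtxt("Data_for_UCI_named.csv", delimiter=",")
    data = all_data[1:, :-1]
    data[:, -1] = (data[:, -1] >= 0).astype(float)  # 0 if stab < 0, else 1
    return data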
def get_data():
    all_data = np.genfromtxt("Data_for_UCI_named.csv", delimiter=",")
    data = all_data[1:, :-1]  # drop the header row and the string label column
    n = np.size(data, 0)
    # df = pd.read_csv("Data_for_UCI_named.csv")
    # saved_column = df['stabf']
    #y = np.zeros([n, 1])
    bias_ones = np.ones([n, 1])
    data = np.concatenate((bias_ones, data), axis=1)  # prepend a bias column
    # binarize the last column ('stab') by its sign
    for i in range(n):
        if data[i, -1] < 0:
            data[i, -1] = 0
        else:
            data[i, -1] = 1
    return data
def load_P(P_loc):
    P_all_inv = np.genfromtxt(P_loc, delimiter=',')
    # input motion parameters are assumed to be based on coordinate
    # transformation rather than sampling, therefore need to invert
    H_all_inv = p_to_H(np.expand_dims(P_all_inv, axis=2))
    H_all = np.linalg.inv(H_all_inv)
    P_all = H_to_p(H_all)
    P_init = P_all[0, :]
    P = P_all[1:, :]
    P_init = np.expand_dims(P_init, axis=0)
    return P_init, P
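# Usage sketch with a hypothetical CSV of per-frame motion parameters;
# assumes the p_to_H / H_to_p helpers from the surrounding module are in scope.
# P_init is the first row, P the remaining trajectory.
P_init, P = load_P('motion_params.csv')
print(P_init.shape, P.shape)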
def getdata():
    my_data = np.genfromtxt('Data_for_UCI_named.csv', delimiter=',')
    my_data = my_data[1:10001, :]  # drop the header row, keep 10000 samples
    mydatasize = np.size(my_data, 0)
    tdata_size = (2 * mydatasize) // 3  # two thirds for training
    t_data = my_data[0:tdata_size, :]
    x_star = t_data[:, 0:12]
    y = np.ceil(t_data[:, 12][:, None])  # ceil binarizes the 'stab' column
    shape = x_star.shape
    N = shape[0]
    return x_star, y, N, my_data, tdata_size
            bottom = x[j, 2 * i] * w[i] + x[j, 2 * i + 1] * (1 - w[i])
            currout.append(top)
            currout.append(bottom)
        out.append(currout)
    out = np.asarray(out)
    return out


# In[11]:

w_init = scatter_layer_weights(1)
w_init


# In[12]:

X = np.genfromtxt('data/scatter02_T10_all_in.csv', delimiter=',').T
y = np.genfromtxt('data/scatter02_T10_all_out.csv', delimiter=',').T
print(X.shape)
print(y.shape)


# In[25]:

# our "predict" function
def propagate(x, w):
    out = scatter_layer(x, w)
    #print("\nScatter: 1")
    for i in range(9):
        out = prop_layer(out)
        out = scatter_layer(out, w)
        #print("Scatter: " + str(i+2))
def load_data(filename, sample=True):
    Y = np.genfromtxt(filename, delimiter=",")
    # standardize each row to zero mean and unit variance
    Y = ((Y.T - Y.T.mean(axis=0)) / (Y.T.std(axis=0))).T
    return Y[:, ::100] if sample else Y  # optionally keep every 100th column
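# Usage sketch with a hypothetical file: after the row-wise standardization,
# every row of Y has roughly zero mean and unit variance.
Y = load_data('series.csv', sample=False)
print(Y.mean(axis=1), Y.std(axis=1))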
#                 forward(params, inputs = inputs, channels = channels, hps = hps)[-1]
#             )
#         ),
#         axis = 2, keepdims = True
#     ),
#     axis = 0
# )[:,0]

# - - - - - - - - - - - - - - - - - -

if __name__ == '__main__':
    import utils

    # data = np.genfromtxt('iris.csv', delimiter = ',')
    # data = np.genfromtxt('mamm.csv', delimiter = ',')
    data = np.genfromtxt('leaf.csv', delimiter=',')

    inputs = data[:, :-1]
    labels = data[:, -1]
    categories = np.unique(labels)
    idx_map = {
        category: idx
        for category, idx in zip(categories, range(len(categories)))
    }
    labels_indexed = [idx_map[label] for label in labels]
    one_hot_targets = np.eye(len(categories))[labels_indexed]

    hps = {
        'lr': .05,        # <-- learning rate
        'wr': [-.1, .1],  # <-- weight range
if __name__ == "__main__": N = 10000 M = 12 learning_rate = 1e-3 iter_num = 20000 batch_size = 32 training_size = int(N * (3 / 3)) X = np.zeros((N, 12)) csv_file = 'Data_for_UCI_named.csv' X_0 = np.genfromtxt(csv_file, delimiter=',') X = X_0[1:, 0:12] Y = np.zeros(np.shape(X)[0]) Y_0 = np.genfromtxt(csv_file, delimiter=',', usecols=(-1), dtype=np.str)[1:] bad_chars = '"' '' for i in range(0, np.shape(Y_0)[0]): s = Y_0[i] for c in bad_chars: s = s.replace(c, "") if s == "unstable": Y[i] = 0 else:
if __name__ == '__main__':
    random = 1
    n_samples = 10
    n_samples_to_test = 100
    num_pseudo_params = 50
    dimensions = [1, 1, 1]
    n_layers = len(dimensions) - 1
    npr.seed(0)  # randomness comes from KMeans
    rs = npr.RandomState(0)

    motor = np.genfromtxt('motor.csv', delimiter=',', skip_header=True)
    X = motor[:, 1]
    X = (X - np.mean(X)) / np.std(X)
    X = X.reshape(len(X), 1)
    y = motor[:, 2]
    y = (y - np.mean(y)) / np.std(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)

    total_num_params, log_likelihood, sample_mean_cov_from_deep_gp, predict_layer_funcs, squared_error, create_deep_map = \
        build_deep_gp(dimensions, rbf_covariance, num_pseudo_params, random)

    init_params = .1 * npr.randn(total_num_params)
    deep_map = create_deep_map(init_params)
    init_params = initialize(deep_map, X, num_pseudo_params)
runEstimates = True
computeCounterfactuals = False

data_dir_base = projectFiles + "data/"
results_dir_base = projectFiles + "results/"
dataPath = data_dir_base + size
resultsPath = results_dir_base + size
estimatesPath = resultsPath + "estimates/"
counterfactualsPath = resultsPath + "counterfactuals/"

# Economic Parameters
beta = np.genfromtxt(dataPath + 'beta.csv', delimiter=',')
theta = np.genfromtxt(dataPath + 'theta.csv', delimiter=',')
mu = np.genfromtxt(dataPath + 'mu.csv', delimiter=',')
nu = np.genfromtxt(dataPath + 'nu.csv', delimiter=',')

params = {"beta": beta, "theta": theta, "mu": mu, "nu": nu}

# Data
tau = np.genfromtxt(dataPath + 'tau.csv', delimiter=',')
Xcif = np.genfromtxt(dataPath + 'Xcif.csv', delimiter=',')
Y = np.genfromtxt(dataPath + 'Y.csv', delimiter=',')
Eq = np.genfromtxt(dataPath + 'Eq.csv', delimiter=',')
Ex = np.genfromtxt(dataPath + 'Ex.csv', delimiter=',')
r = np.genfromtxt(dataPath + 'r.csv', delimiter=',')
D = np.genfromtxt(dataPath + 'D.csv', delimiter=',')
ccodes = np.genfromtxt(dataPath + 'ccodes.csv', delimiter=',', dtype="str")
def read_housing_csv(self, file_name, mapping_state, target_name=None):
    data = np.genfromtxt(file_name, delimiter=',', dtype='unicode',
                         skip_header=1)
    if (self.config.NN_DEBUG_SHAPES):
        print(data.shape)

    if (target_name is None):
        skip_cols = 1
    else:
        skip_cols = 2

    # clean up feature-wise
    map_id = 1.0  # don't make a feature irrelevant by making it 0

    # Map known mappings
    mapping_state["NA"] = 0.0
    mapping_state["No"] = 0.0
    mapping_state["N"] = 0.0
    mapping_state["Unf"] = 0.0
    mapping_state["None"] = 0.0
    mapping_state["Po"] = 0.0  # Poor
    mapping_state["Y"] = map_id
    map_id = map_id + 1
    mapping_state["Fa"] = map_id
    map_id = map_id + 1
    mapping_state["TA"] = map_id
    map_id = map_id + 1
    mapping_state["Gd"] = map_id
    map_id = map_id + 1
    mapping_state["Ex"] = map_id
    map_id = map_id + 1

    # Get (samplesize x features per sample)
    X = np.empty((data.shape[0],
                  data.shape[1] - skip_cols))  # don't need Id and Price columns
    # Perform column-wise, so feature-wise mappings are consistent,
    # else they will be random
    for col in range(data.shape[1] - skip_cols):
        for row in range(data.shape[0]):
            try:
                X[row][col] = data[row][col + 1].astype(float)
            except:
                if (data[row][col + 1] in mapping_state):
                    X[row][col] = mapping_state[data[row][col + 1]]
                else:
                    mapping_state[data[row][col + 1]] = map_id
                    X[row][col] = map_id
                    map_id = map_id + 1.0

    # Get groundtruths
    Y = np.empty((data.shape[0], 1))
    if (target_name is not None):
        for row in range(data.shape[0]):
            col = data.shape[1] - 1
            try:
                # Take log of saleprice to match the loss calculations
                Y[row][0] = data[row][col].astype(float)
            except:
                raise Exception("Ground truth should be float")

    # Normalize
    Y_normalize_state = X_normalize_state = None
    if (self.config.NN_NORMALIZE):
        if (target_name is not None):
            Y, Y_normalize_state = self.utils.normalize0(Y, axis=0)
        X, X_normalize_state = self.utils.normalize0(X, axis=0)

    if (self.config.NN_DEBUG_SHAPES):
        print(X.shape, Y.shape, X, X[0][0].dtype)
    return X, X_normalize_state, mapping_state, Y, Y_normalize_state
import autograd.numpy as np
import scipy.optimize
import random
import sys

from autograd import grad

if len(sys.argv) != 5:
    print("args: <trainFile> <trainLabelsFile> <testFile> <testLabelsFile>")
    exit(1)

trainFile = sys.argv[1]
trainLabelsFile = sys.argv[2]
testFile = sys.argv[3]
testLabelsFile = sys.argv[4]

trainData = np.genfromtxt(trainFile, delimiter=',', dtype=np.float64)
trainLabels = np.genfromtxt(trainLabelsFile, delimiter=',', dtype=np.float64)
testData = np.genfromtxt(testFile, delimiter=',', dtype=np.float64)
testLabels = np.genfromtxt(testLabelsFile, delimiter=',', dtype=np.float64)


def f(theta):
    # L2 regularization on the weights (bias term excluded)
    objReg = 0.5 / 2.0 * np.dot(theta[1:], theta[1:])
    # cap the exponent to guard against overflow in np.exp
    sigmoids = 1.0 / (1.0 + np.exp(
        np.minimum(300.0, -(theta[0] + np.matmul(trainData, theta[1:])))))
    innerSecondTerm = 1.0 - trainLabels + np.multiply(
        sigmoids, (2.0 * trainLabels - 1.0))
    result = np.sum(np.log(innerSecondTerm + 1e-10))
    return objReg - result
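# Usage sketch: minimize the regularized logistic loss with autograd supplying
# the exact gradient; the optimizer choice is an assumption, not part of the
# original script. theta holds one bias term plus one weight per feature.
theta0 = np.zeros(trainData.shape[1] + 1)
result = scipy.optimize.minimize(f, theta0, jac=grad(f), method='L-BFGS-B')
print("training objective:", result.fun)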
students = 5  # Each student receives differently initialized weights
lessons = 20000  # Each lesson consists of the entire training set

iris_data = load_iris()  # load the iris dataset
x = iris_data.data
y_ = iris_data.target.reshape(-1, 1)  # Convert data to a single column

# One-hot encode the class labels
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y_)

# # Split the data for training and testing
# train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.20)

initial_weights = np.genfromtxt(fname='inputs/SF5d_5.dat')
#initial_weights = np.genfromtxt(fname='inputs/keras_inputs.txt')
train_x = np.genfromtxt(fname='same_split/train_x.txt')
train_y = np.genfromtxt(fname='same_split/train_y.txt')
test_x = np.genfromtxt(fname='same_split/test_x.txt')
test_y = np.genfromtxt(fname='same_split/test_y.txt')

# drop the first two columns
train_x = np.delete(np.delete(train_x, 0, 1), 0, 1)
test_x = np.delete(np.delete(test_x, 0, 1), 0, 1)
x = np.delete(np.delete(x, 0, 1), 0, 1)

# Standardization
train_x[:, 0] = (train_x[:, 0] - np.mean(train_x[:, 0])) / np.std(train_x[:, 0])
train_x[:,
def read_data(filename):
    data = np.genfromtxt(filename, dtype=float, delimiter=',', skip_header=1)
    np.random.shuffle(data)
    # features are everything but the first column; the first column is the target
    return np.delete(data, [0], axis=1), data[:, [0]]
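# Usage sketch, assuming a hypothetical CSV whose first column is the target
# and whose remaining columns are features:
X, y = read_data('targets_first.csv')
print(X.shape, y.shape)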
def read_housing_csv_2(self, file_name, x_mapping_state, target_name=None):
    skip_header = 1
    if self.config.NN_MULTI_ENCODE_TEXT_VARS or self.config.NN_APPLY_DATA_SCIENCE:
        skip_header = 0
    data = np.genfromtxt(file_name, delimiter=',', dtype='unicode',
                         skip_header=skip_header)
    if (self.config.NN_DEBUG_SHAPES):
        print(data.shape)

    if (target_name is None):
        skip_cols = 1
    else:
        skip_cols = 2

    # Identify Area columns
    area_cols = [i for i, item in enumerate(data[0, :]) if "Area" in item]

    # multi-encode
    if self.config.NN_MULTI_ENCODE_TEXT_VARS:
        X_data, self.neighborhood_vals = multi_encode_text_variables(
            "Neighborhood", data, self.neighborhood_vals)
        X_data = np.delete(X_data, 0, axis=0)  # Remove header now
        data = X_data

    if self.config.NN_APPLY_DATA_SCIENCE:
        # Apply some data science
        data = self.filter_training_data(data, target_name=target_name)
        X_data = self.augment_training_data(data, target_name=target_name)
        X_data = np.delete(X_data, 0, axis=0)  # Remove header now
        data = X_data

    # Get (samplesize x features per sample)
    X = np.empty((data.shape[0],
                  data.shape[1] - skip_cols))  # don't need Id and Price columns
    for col in range(data.shape[1] - skip_cols):
        map_id = 1.0  # reset every feature
        if (target_name is not None):
            mapping_state = {}
        else:
            mapping_state = x_mapping_state[col]
        if col in area_cols:
            # Direct mapping
            for row in range(data.shape[0]):
                try:
                    X[row][col] = data[row][col + 1].astype(float)
                except:
                    if (data[row][col + 1] in mapping_state):
                        X[row][col] = mapping_state[data[row][col + 1]]
                    else:
                        mapping_state[data[row][col + 1]] = map_id
                        X[row][col] = map_id
                        map_id = map_id + 1.0
        else:
            for row in range(data.shape[0]):
                if (data[row][col + 1] in mapping_state):
                    X[row][col] = mapping_state[data[row][col + 1]]
                else:
                    mapping_state[data[row][col + 1]] = map_id
                    X[row][col] = map_id
                    map_id = map_id + 1.0
        x_mapping_state.append(mapping_state)

    # Get groundtruths
    Y = np.empty((data.shape[0], 1))
    if (target_name is not None):
        prev = 0.0
        for row in range(data.shape[0]):
            col = data.shape[1] - 1
            try:
                Y[row][0] = data[row][col].astype(float)
            except:
                raise Exception("Ground truth should be float")
            # Ensure GT was sorted before
            # assert (prev <= Y[row][0])
            prev = Y[row][0]
            if self.config.NN_LOG_TARGET is True:
                Y[row][0] = np.log(Y[row][0])

    # Normalize
    Y_normalize_state = X_normalize_state = None
    if (self.config.NN_NORMALIZE):
        if (target_name is not None):
            Y, Y_normalize_state = self.utils.normalize0(Y, axis=0)
        X, X_normalize_state = self.utils.normalize0(X, axis=0)

    if (self.config.NN_DEBUG_SHAPES):
        print(X.shape, Y.shape, X, X[0][0].dtype)
    return X, X_normalize_state, x_mapping_state, Y, Y_normalize_state