import csv

import numpy as np


def get_label(datapath, label_col=3):
    """Read integer labels from a tab-separated file and one-hot encode them."""
    labels = []
    with open(datapath, "rt") as f:
        fcsv = csv.reader(f, delimiter="\t")
        header = next(fcsv)  # skip the header row
        for row in fcsv:
            labels.append(int(row[label_col]))
    labels = np.array(labels)
    return onehot_encode(labels)
def gen_last_source():
    df = utils.load_enroll()
    log_df = utils.load_log()
    log_sz = len(log_df.groupby('enrollment_id'))
    # keep only the last 'source' value recorded for each enrollment
    log_df = log_df.groupby('enrollment_id').agg({'source': 'last'}).reset_index()
    # enrollments with no log entries get the placeholder category -1
    df = df.merge(log_df, how='left', on='enrollment_id').fillna(-1)
    return {'X': utils.onehot_encode(df['source'])}
def gen_last_category():
    df = utils.load_enroll()
    log_df = utils.load_log_with_obj_attrib()
    log_sz = len(log_df.groupby('enrollment_id'))
    # keep only the last 'category' value recorded for each enrollment
    log_df = log_df.groupby('enrollment_id').agg({'category': 'last'}).reset_index()
    # enrollments with no log entries get the placeholder category -1
    df = df.merge(log_df, how='left', on='enrollment_id').fillna(-1)
    return {'X': utils.onehot_encode(df['category'])}
def random_prediction(input_matrix):
    """Returns prediction matrix (one-hot encoded)."""
    num_asts, num_timesteps, num_blocks = np.shape(input_matrix)
    decoded_predictions = np.zeros((num_asts, num_timesteps))
    predictions = np.zeros(np.shape(input_matrix))
    for ast_id in range(num_asts):
        # decode the one-hot input back to block indices for this AST
        block_indices = utils.onehot_decode(input_matrix[ast_id, :, :]).T
        # keep end-of-sequence blocks fixed, fill all other timesteps randomly
        end_block_mask = (block_indices == (num_blocks - 1))
        end_block_fill = (num_blocks - 1) * np.ones((1, num_timesteps))
        random_fill = np.random.randint(num_blocks, size=(1, num_timesteps))
        decoded_predictions[ast_id, :] = \
            end_block_mask * end_block_fill + \
            ~end_block_mask * random_fill
        predictions[ast_id, :, :] = \
            utils.onehot_encode(decoded_predictions[ast_id, :], num_blocks)
    return predictions
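# utils.onehot_decode is not shown above; a minimal sketch of the contract
# random_prediction appears to rely on (an argmax over the block dimension,
# returned as a column vector so that .T yields shape (1, num_timesteps)).
# This is an assumption about the helper, not its actual implementation.
import numpy as np

def onehot_decode_sketch(onehot_matrix):
    # (num_timesteps, num_blocks) -> (num_timesteps, 1) column holding the
    # index of the 1 in each one-hot row
    return np.argmax(onehot_matrix, axis=1).reshape(-1, 1)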
print("loaded %d gene sets" % (len(gene_sets))) else: gene_sets = {"all_genes": df_genes} # select gene set try: name = args.set genes = gene_sets[name] except: print("gene set is not the subset file provided") sys.exit(1) # extract dataset X = df[genes] y = utils.onehot_encode(labels, classes) # create train/test sets x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2) # normalize dataset scaler = sklearn.preprocessing.MinMaxScaler() scaler.fit(x_train) x_train = scaler.transform(x_train) x_test = scaler.transform(x_test) # get mu and sigma of target class feature vectors target_data = x_train[np.argmax(y_train, axis=1) == args.target] target_mu = np.mean(target_data, axis=0) target_cov = np.cov(target_data, rowvar=False)
    missing_genes = [g for g in genes if g not in df_genes]
    gene_sets = [(name, [g for g in genes if g in df_genes])
                 for (name, genes) in gene_sets]
    print("%d / %d genes from gene sets were not found in the input dataset"
          % (len(missing_genes), len(genes)))
else:
    gene_sets = []

# train a model for each gene set
for name, genes in gene_sets:
    # extract dataset
    X = df[genes]
    y = utils.onehot_encode(labels, range(len(classes)))

    # create train/test sets
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)

    # normalize dataset (fit the scaler on the training set only, then apply
    # it to the test set so test statistics don't leak into the scaling)
    scaler = sklearn.preprocessing.MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # get mu and sigma of target class feature vectors
    target_data = x_train[np.argmax(y_train, axis=1) == args.target]
    target_mu = np.mean(target_data, axis=0)
    target_cov = np.cov(target_data, rowvar=False)
import matplotlib.pyplot as plt
import numpy as np

import mnist
import utils
from NeuralNetwork import *
from Layer import *

#mnist.init()
X_train, Y_train, X_test, Y_test = mnist.load()

# Pre-process data. For computers with less RAM, we must slice the training set
X_train = X_train[0:50000]
Y_train = Y_train[0:50000]
# Scale pixel values from [0, 255] to [-1, 1]
X_train, X_test = (X_train / 127.5) - 1, (X_test / 127.5) - 1
X_train = utils.bias_trick(X_train)
X_test = utils.bias_trick(X_test)
Y_train, Y_test = utils.onehot_encode(Y_train), utils.onehot_encode(Y_test)
X_train, Y_train, X_val, Y_val = utils.train_val_split(X_train, Y_train, 0.1)

# Add layers
hidden_layer = Layer(num_input=X_train.shape[1],
                     num_neurons=64,
                     activation_func=utils.relu,
                     activation_func_der=utils.relu_der)
# The derivative of the output layer's activation function is not used
output_layer = Layer(num_input=64,
                     num_neurons=Y_train.shape[1],
                     activation_func=utils.softmax)

# Create network
model = NeuralNetwork(max_epochs=20, learning_rate=0.01,
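# utils.bias_trick is project-specific and not shown; a common implementation,
# assumed here, appends a constant-1 column so each layer can learn its bias
# as an ordinary weight on that column.
import numpy as np

def bias_trick_sketch(X):
    # (num_samples, num_features) -> (num_samples, num_features + 1)
    ones = np.ones((X.shape[0], 1))
    return np.concatenate((X, ones), axis=1)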
def gen_onehot_user_by_enrollment():
    df = utils.load_enroll()
    # truncate usernames to their first 6 characters before encoding
    df['username'] = df['username'].apply(lambda x: x[:6])
    X = utils.onehot_encode(df['username'], sparse=True)
    return {'X': X}
def gen_onehot_course_by_enrollment():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['course_id'], sparse=True)
    return {'X': X}
else:
    gene_sets = {'all_genes': df_perturb.columns}

# select gene set
try:
    name = args.set
    genes = gene_sets[name]
except KeyError:
    print("gene set '%s' was not found in the provided subset file" % args.set)
    sys.exit(1)

# extract train/perturb data
x_train = df_train[genes]
x_perturb = df_perturb[genes]
y_train = utils.onehot_encode(y_train, classes)
y_perturb = utils.onehot_encode(y_perturb, classes)
genes = x_train.columns

# normalize perturb data (using the train data)
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_perturb = scaler.transform(x_perturb)

# perturb each class mean to the target class
mu_perturbed = perturb_mean_diff(x_train, y_train, args.target, classes)

# save mean perturbations to dataframe
def gen_user_cat():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['username'])
    return {'X': X}
def gen_course_cat():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['course_id'])
    return {'X': X}
else: gene_sets = {"all_genes": df_test.columns} # select gene set try: name = args.set genes = gene_sets[name] except: print("gene set is not the subset file provided") sys.exit(1) # extract train/test data x_train = df_train[genes] x_test = df_test[genes] y_train = utils.onehot_encode(y_train, classes) y_test = utils.onehot_encode(y_test, classes) # normalize test data (using the train data) scaler = sklearn.preprocessing.MinMaxScaler() scaler.fit(x_train) x_train = scaler.transform(x_train) x_test = scaler.transform(x_test) # perturb each class mean to the target class mu_pert = perturb_mean_diff(x_test, y_test, args.target, classes) # save mean peturbations to dataframe df_pert = pd.DataFrame(data=mu_pert, index=genes, columns=classes)
def test_encoding(self):
    array = np.array([1, 4])
    num_classes = 5
    one_hot_matrix = utils.onehot_encode(array, num_classes)
    answer = np.array([[0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 1]])
    self.assertTrue(np.array_equal(one_hot_matrix, answer))
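# The test above pins down the integer-index contract of utils.onehot_encode.
# A minimal sketch consistent with it (the real helper may differ; other call
# sites also accept pandas Series, explicit class lists, and sparse=True):
import numpy as np

def onehot_encode_sketch(indices, num_classes):
    # row i of the identity matrix is the one-hot vector for class i, so
    # fancy-indexing np.eye with the label array encodes all rows at once
    return np.eye(num_classes, dtype=int)[indices]

# onehot_encode_sketch(np.array([1, 4]), 5)
# -> [[0, 1, 0, 0, 0],
#     [0, 0, 0, 0, 1]]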