Example #1
import csv

import numpy as np


def get_label(datapath, label_col=3):
    """Read integer labels from a TSV file and return them one-hot encoded."""
    labels = []
    with open(datapath, "rt") as f:
        fcsv = csv.reader(f, delimiter="\t")
        next(fcsv)  # skip the header row
        for row in fcsv:
            labels.append(int(row[label_col]))
    return onehot_encode(np.array(labels))
Example #2
def gen_last_source():
    df = utils.load_enroll()
    log_df = utils.load_log()
    # keep only the last 'source' value logged for each enrollment
    log_df = log_df.groupby('enrollment_id').agg({
        'source': 'last'
    }).reset_index()
    df = df.merge(log_df, how='left', on='enrollment_id').fillna(-1)

    return {'X': utils.onehot_encode(df['source'])}
Example #3
def gen_last_category():
    df = utils.load_enroll()
    log_df = utils.load_log_with_obj_attrib()
    # keep only the last 'category' value logged for each enrollment
    log_df = log_df.groupby('enrollment_id').agg({
        'category': 'last'
    }).reset_index()
    df = df.merge(log_df, how='left', on='enrollment_id').fillna(-1)

    return {'X': utils.onehot_encode(df['category'])}
Example #4
import numpy as np

import utils


def random_prediction(input_matrix):
    """Return a prediction matrix (one-hot encoded) filled with random blocks."""
    num_asts, num_timesteps, num_blocks = np.shape(input_matrix)

    decoded_predictions = np.zeros((num_asts, num_timesteps))
    predictions = np.zeros(np.shape(input_matrix))

    for ast_id in range(num_asts):
        block_indices = utils.onehot_decode(input_matrix[ast_id, :, :]).T
        end_block_mask = (block_indices == (num_blocks - 1))

        # keep the end-block marker wherever it appears in the input;
        # fill every other timestep with a uniformly random block
        end_block_fill = (num_blocks - 1) * np.ones((1, num_timesteps))
        random_fill = np.random.randint(num_blocks, size=(1, num_timesteps))

        decoded_predictions[ast_id, :] = \
            end_block_mask * end_block_fill + \
            ~end_block_mask * random_fill

        predictions[ast_id, :, :] = \
            utils.onehot_encode(decoded_predictions[ast_id, :], num_blocks)

    return predictions
Example #5
		print("loaded %d gene sets" % (len(gene_sets)))
	else:
		gene_sets = {"all_genes": df_genes}

	# select gene set
	try:
		name = args.set
		genes = gene_sets[name]
	except KeyError:
		print("gene set '%s' was not found in the subset file" % args.set)
		sys.exit(1)

	# extract dataset
	X = df[genes]
	y = utils.onehot_encode(labels, classes)

	# create train/test sets
	x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)

	# normalize dataset
	scaler = sklearn.preprocessing.MinMaxScaler()
	scaler.fit(x_train)

	x_train = scaler.transform(x_train)
	x_test = scaler.transform(x_test)

	# get mu and sigma of target class feature vectors
	target_data = x_train[np.argmax(y_train, axis=1) == args.target]
	target_mu = np.mean(target_data, axis=0)
	target_cov = np.cov(target_data, rowvar=False)
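Example #5 (and Example #6 below) call utils.onehot_encode(labels, classes) with an explicit sequence of class values. The helper itself is not shown in these listings; the following is a minimal sketch of what such a signature might look like, assuming the second argument fixes the column order. The body is an inference from the call sites, not the repository's code.

import numpy as np

def onehot_encode(labels, classes):
    # Hypothetical sketch: 'classes' fixes the column order, so splits
    # encoded at different times agree on which column is which class.
    index = {c: i for i, c in enumerate(classes)}
    y = np.zeros((len(labels), len(classes)), dtype=int)
    for row, label in enumerate(labels):
        y[row, index[label]] = 1
    return y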
Example #6
        missing_genes = [g for g in genes if g not in df_genes]

        gene_sets = [(name, [g for g in genes if g in df_genes])
                     for (name, genes) in gene_sets]

        print(
            "%d / %d genes from gene sets were not found in the input dataset"
            % (len(missing_genes), len(genes)))
    else:
        gene_sets = []

    # train a model for each gene set
    for name, genes in gene_sets:
        # extract dataset
        X = df[genes]
        y = utils.onehot_encode(labels, range(len(classes)))

        # create train/test sets
        x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3)

        # normalize dataset (fit the scaler on the training set only,
        # then apply that same scaling to the test set)
        scaler = sklearn.preprocessing.MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # get mu and sigma of target class feature vectors
        target_data = x_train[np.argmax(y_train, axis=1) == args.target]
        target_mu = np.mean(target_data, axis=0)
        target_cov = np.cov(target_data, rowvar=False)
Example #7
import matplotlib.pyplot as plt
import numpy as np

import mnist
import utils
from Layer import *
from NeuralNetwork import *

#mnist.init()
X_train, Y_train, X_test, Y_test = mnist.load()

# Pre-process the data. On machines with limited RAM, slice the training set.
X_train = X_train[0:50000]
Y_train = Y_train[0:50000]

X_train, X_test = (X_train / 127.5) - 1, (X_test / 127.5) - 1  # scale pixels to [-1, 1]
X_train = utils.bias_trick(X_train)
X_test = utils.bias_trick(X_test)
Y_train, Y_test = utils.onehot_encode(Y_train), utils.onehot_encode(Y_test)

X_train, Y_train, X_val, Y_val = utils.train_val_split(X_train, Y_train, 0.1)

# Add layers
hidden_layer = Layer(num_input=X_train.shape[1],
                     num_neurons=64,
                     activation_func=utils.relu,
                     activation_func_der=utils.relu_der)
output_layer = Layer(
    num_input=64, num_neurons=Y_train.shape[1], activation_func=utils.softmax
)  # The derivative of the output layer's activation function is not used

# Create network
model = NeuralNetwork(max_epochs=20,
                      learning_rate=0.01,
Example #8
def gen_onehot_user_by_enrollment():
    df = utils.load_enroll()
    df['username'] = df['username'].apply(lambda x: x[:6])  # keep the first six characters only
    X = utils.onehot_encode(df['username'], sparse=True)
    return {'X': X}
Example #9
def gen_onehot_course_by_enrollment():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['course_id'], sparse=True)
    return {'X': X}
Example #10
    else:
        gene_sets = {'all_genes': df_perturb.columns}

    # select gene set
    try:
        name = args.set
        genes = gene_sets[name]
    except KeyError:
        print('gene set %s was not found in the subset file' % args.set)
        sys.exit(1)

    # extract train/perturb data
    x_train = df_train[genes]
    x_perturb = df_perturb[genes]

    y_train = utils.onehot_encode(y_train, classes)
    y_perturb = utils.onehot_encode(y_perturb, classes)

    genes = x_train.columns

    # normalize perturb data (using the train data)
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(x_train)

    x_train = scaler.transform(x_train)
    x_perturb = scaler.transform(x_perturb)

    # perturb each class mean to the target class
    mu_perturbed = perturb_mean_diff(x_train, y_train, args.target, classes)

    # save mean perturbations to dataframe
Example #11
def gen_user_cat():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['username'])
    return {'X': X}
Example #12
def gen_course_cat():
    df = utils.load_enroll()
    X = utils.onehot_encode(df['course_id'])
    return {'X': X}
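Examples #8, #9, #11, and #12 pass a pandas Series instead, sometimes with a sparse=True flag for high-cardinality columns such as usernames. A plausible sketch of that variant, assuming pandas and SciPy are available; the body is inferred from the call sites, not taken from the repository.

import pandas as pd
import scipy.sparse as sp

def onehot_encode(series, sparse=False):
    # Hypothetical sketch: one output column per distinct value in the
    # Series; pd.get_dummies does the encoding, and the result can be
    # returned as a SciPy CSR matrix to save memory on wide encodings.
    dummies = pd.get_dummies(series)
    return sp.csr_matrix(dummies.values) if sparse else dummies.values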
Example #13
    else:
        gene_sets = {"all_genes": df_test.columns}

    # select gene set
    try:
        name = args.set
        genes = gene_sets[name]
    except KeyError:
        print("gene set '%s' was not found in the subset file" % args.set)
        sys.exit(1)

    # extract train/test data
    x_train = df_train[genes]
    x_test = df_test[genes]

    y_train = utils.onehot_encode(y_train, classes)
    y_test = utils.onehot_encode(y_test, classes)

    # normalize test data (using the train data)
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(x_train)

    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # perturb each class mean to the target class
    mu_pert = perturb_mean_diff(x_test, y_test, args.target, classes)

    # save mean perturbations to dataframe
    df_pert = pd.DataFrame(data=mu_pert, index=genes, columns=classes)
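Examples #10 and #13 also depend on a perturb_mean_diff helper that the listings never show. From the call site and the DataFrame built from its result (one row per gene, one column per class), one plausible reading is a difference-of-means perturbation; the sketch below is that guess, not the repository's actual implementation.

import numpy as np

def perturb_mean_diff(x, y, target, classes):
    # Hypothetical sketch: for each class, the vector that would shift
    # that class's mean feature vector onto the target class mean.
    # Returns an array of shape (num_features, num_classes).
    labels = np.argmax(y, axis=1)
    mu_target = np.mean(x[labels == target], axis=0)
    return np.column_stack([
        mu_target - np.mean(x[labels == k], axis=0)
        for k in range(len(classes))
    ])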
Example #14
def test_encoding(self):
    array = np.array([1, 4])
    num_classes = 5
    one_hot_matrix = utils.onehot_encode(array, num_classes)
    answer = np.array([[0, 1, 0, 0, 0], [0, 0, 0, 0, 1]])
    self.assertTrue(np.array_equal(one_hot_matrix, answer))
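The test in Example #14 pins down the behavior of this last variant: here the second argument is an integer class count rather than a sequence of class values, and each label selects a row of the identity matrix. A minimal NumPy sketch consistent with that test (the actual utils implementation is not shown in these listings):

import numpy as np

def onehot_encode(array, num_classes):
    # Each integer label indexes a row of the identity matrix, so
    # onehot_encode(np.array([1, 4]), 5) yields
    # [[0, 1, 0, 0, 0], [0, 0, 0, 0, 1]].
    return np.eye(num_classes, dtype=int)[array]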