Exemplo n.º 1
0
    def my_data(self):
        """Load the Fashion-MNIST arrays and split them into three sets.

        Reads ``fmnist_train_data.npy`` and ``fmnist_train_labels.npy`` from
        ``self.args.data_dir``, one-hot encodes the labels with depth 10,
        divides the images by 255 and then calls ``util.split_data`` with
        ``self.test_set_size`` and ``self.validation_set_size`` to carve out
        the test and validation sets.

        Returns:
            dict: the arrays under the keys ``train_images``,
            ``train_labels``, ``test_images``, ``test_labels``,
            ``validation_images`` and ``validation_labels``.
        """
        data_dir = self.args.data_dir

        # load training data
        train_images = util.np.load(
            os.path.join(data_dir, 'fmnist_train_data.npy'))
        train_labels = util.np.load(
            os.path.join(data_dir, 'fmnist_train_labels.npy'))

        # The context manager closes the session once the one-hot tensor has
        # been evaluated; a bare tf.Session() would leak its resources.
        with tf.Session() as sess:
            train_labels = sess.run(tf.one_hot(train_labels, 10))

        # normalize data; the float divisor keeps integer pixel arrays from
        # being floor-divided when run under Python 2
        train_images = train_images / 255.0

        # set up test set
        # NOTE(review): the test split is taken before any shuffling, so it
        # presumably relies on the order of the stored arrays - confirm.
        test_images, train_images = util.split_data(train_images,
                                                    self.test_set_size)
        test_labels, train_labels = util.split_data(train_labels,
                                                    self.test_set_size)

        # set up validation set (images and labels are shuffled together)
        train_images, train_labels = util.shuffler(train_images, train_labels)
        validation_images, train_images = util.split_data(
            train_images, self.validation_set_size)
        validation_labels, train_labels = util.split_data(
            train_labels, self.validation_set_size)

        return {
            "train_images": train_images,
            "train_labels": train_labels,
            "test_images": test_images,
            "test_labels": test_labels,
            "validation_images": validation_images,
            "validation_labels": validation_labels,
        }
Exemplo n.º 2
0
def trial_init(recdr, logr):
    """Generate synthetic data for a new trial and build its solvers.

    Configures a ``DataGenerator`` from module-level settings, generates
    random users and messages, splits the resulting rows 50/25/25 into
    train/calibration/test and builds the control and k-NN solvers.

    Args:
        recdr: Not referenced in the body (see the NOTE further down).
        logr: Object whose ``log(message, level)`` method receives progress
            messages.

    Returns:
        tuple: ``(train, test_users, b, solvers)`` where ``test_users`` is the
        list of first elements of the test rows and ``b`` is the generator.
    """
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels,
                            max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels,
                             max_msg_att_levels)
    # Returns a pair (user templates, interaction templates); the value is
    # not needed here, only the call's effect on the generator.
    b.set_random_propensities(num_propensity_groups,
                              min_group_user_atts, max_group_user_atts,
                              min_group_msg_atts, max_group_msg_atts,
                              min_group_pos_prob, max_group_pos_prob)

    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    users = b.gen_random_users(num_users)
    rows = ut.unzip(b.gen_random_rows_from(users, messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')

    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # Each row is a (user, message, response) triple.  A comprehension is
    # used because tuple-unpacking lambdas are a syntax error on Python 3.
    test_users = [u for (u, _m, _r) in test]

    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    # NOTE(review): `recorder` is resolved from the enclosing scope, not from
    # the `recdr` parameter - confirm which one is intended.
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def test_add_user_data(ident_service):
    """Check that uploaded user data becomes the first chain transaction.

    The payload is signed, added through the service, then read back,
    decrypted and compared with what was uploaded.
    """
    genesis_pointer = b'Base'

    # An empty chain still points at the base marker.
    assert ident_service.last_transaction_hash == genesis_pointer

    # Sign a small JSON payload and wrap it for upload.
    payload = json.dumps({'first_name': 'Bob', 'last_name': 'Smith'}).encode()
    upload = UserData(payload, sign(user_1_private_key, payload))

    # Uploading must move the head of the chain to the returned hash.
    new_head = ident_service.add_user_data(upload, user_1_cert_string)
    assert new_head == ident_service.last_transaction_hash

    # Fetch the stored transaction together with its encryption key.
    stored_tx = ident_service.get_transaction(new_head)
    tx_key = ident_service.get_key(new_head)

    # The new transaction must link back to the previous head.
    assert stored_tx.hash_pointer == genesis_pointer

    # Recover the message and signature from the encrypted action data.
    plaintext = decrypt(tx_key, stored_tx.action.get_data())
    recovered_message, recovered_signature = split_data(plaintext)

    # The signature check guards against tampering.
    verify(user_1_cert, recovered_message, recovered_signature)

    # The recovered message must be exactly what was uploaded.
    assert recovered_message == payload
    def share_data(self, user_data, user_cert, service_provider_cert):
        """Record a share of ``user_data`` with a service provider.

        The data to share is compared with the message stored in the user's
        latest transaction.  When ``check_data_consistency`` rejects it,
        nothing is added and ``None`` is returned; otherwise a share
        transaction is appended and the result of
        ``add_transaction_to_chain`` is returned.
        """
        # Look up the user's most recent transaction and its key.
        head_hash = self.latest_tx_list[user_cert]
        head_tx = self.transaction_pool[head_hash]
        head_key = self.keys[head_hash]

        # Decrypt the stored action data; only the message part is needed.
        plaintext = decrypt(head_key, head_tx.action.get_data())
        recorded_message, _ = split_data(plaintext)

        # Refuse to share anything that disagrees with the data on record.
        if not check_data_consistency(json.loads(user_data.data),
                                      json.loads(recorded_message)):
            return None

        # Wrap the share in an action, chain it and hand back the result.
        share_action = UserDataShareAction(user_data, service_provider_cert)
        share_tx = Transaction(self.last_transaction_hash, share_action)
        return self.add_transaction_to_chain(share_tx)
Exemplo n.º 5
0
def trial_init(recdr, logr):
    """Generate cross-product data for a new trial and build its solvers.

    Configures a ``DataGenerator`` from module-level settings, crosses the
    generator's unique users with random messages, splits the resulting rows
    50/25/25 into train/calibration/test and builds the control and k-NN
    solvers.

    Args:
        recdr: Not referenced in the body (see the NOTE further down).
        logr: Object whose ``log(message, level)`` method receives progress
            messages.

    Returns:
        tuple: ``(train, test_users, b, solvers)`` where ``test_users`` is the
        list of first elements of the test rows and ``b`` is the generator.
    """
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels,
                            max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels,
                             max_msg_att_levels)
    # Returns a pair (user templates, interaction templates); the value is
    # not needed here, only the call's effect on the generator.
    b.set_random_propensities(
        num_propensity_groups, min_group_user_atts, max_group_user_atts,
        min_group_msg_atts, max_group_msg_atts, min_group_pos_prob,
        max_group_pos_prob)

    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    rows = ut.unzip(b.gen_crossprod_rows(b.unique_users(), messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')

    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # Each row is a (user, message, response) triple.  A comprehension is
    # used because tuple-unpacking lambdas are a syntax error on Python 3.
    test_users = [u for (u, _m, _r) in test]

    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    # NOTE(review): `recorder` is resolved from the enclosing scope, not from
    # the `recdr` parameter - confirm which one is intended.
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
Exemplo n.º 6
0
def test_tree_classifier():
    """
    :return: None

    Train the project's ``ClassificationTree`` on the breast-cancer dataset,
    prune it and print its accuracy on the held-out part, followed by the
    accuracy of scikit-learn's ``DecisionTreeClassifier`` fitted on the same
    split.
    """
    # Alternative data source:
    # X, Y = get_adult_data()
    # attr_types = [int for _ in range(X.shape[1])]

    data = load_breast_cancer()
    X = data.data
    Y = data.target.reshape(data.target.size)

    # Every attribute of this dataset is declared as continuous.
    attr_types = [float for _ in range(X.shape[1])]

    Xtrain, Ytrain, Xtest, Ytest = split_data(X, Y, 0.8)

    model = ClassificationTree()
    print("Training..")
    model.train(Xtrain, Ytrain, attr_types)
    # NOTE(review): pruning reuses the training data rather than a separate
    # validation set - confirm this is intended.
    model.prune_tree(Xtrain, Ytrain)
    cY = model.predict(Xtest)
    print("Accuracy: {}".format(accuracy(Ytest, cY)))

    # Reference implementation for comparison.
    clf = tree.DecisionTreeClassifier()
    clf.fit(Xtrain, Ytrain.reshape(Ytrain.size))
    cY = clf.predict(Xtest)
    # The placeholder was missing, so the accuracy value was never printed.
    print("Scikit accuracy: {}".format(accuracy(Ytest, cY)))
def train(data, args):
    """Run ``args.epoch_num`` epochs of batched updates synchronised via a kvstore.

    Args:
        data: 4-tuple ``(dataset, T, x, b)``.
            NOTE(review): ``A.getnnz(0)`` and ``u.getA()`` below suggest a
            scipy sparse ``dataset`` and numpy-matrix operands - confirm.
        args: Object providing ``learning_rate``, ``epoch_num`` and
            ``batch_size``.

    Returns:
        None.  The loss of every epoch is reported through ``logging.info``.
    """

    dataset, T, x, b = data
    dim = dataset.shape[0]
    # `shard` holds the rows visited by the batch loop below; `interval` is
    # later expanded into range(*interval) for the loss computation.
    # presumably this worker's portion of the data - verify in util.split_data
    shard, interval = util.split_data(dataset)
    size = shard.shape[0]
    # kv_x receives the parameter from the kvstore; kv_d accumulates the
    # (negated) local updates that are handed to util.update_param.
    kv_x = mx.nd.zeros((dim, 1))
    kv_d = mx.nd.zeros((dim, 1))

    # create kvstore
    kvstore = util.create_kvstore(kv_x)

    A = dataset

    # Scalar taken from element [0][0] of the product (x^T A^T)(A x).
    lambda_ = np.dot((x.T * A.T), (A * x))[0][0]
    gamma = args.learning_rate

    logging.info('Start training.')
    for epoch in range(args.epoch_num):
        # gradient in this epoch
        t1 = lambda_ / dim * x - b / dim
        nnz = A.getnnz(0).reshape((dim, 1))
        g = (np.multiply(nnz, t1) + t1 * dim + T * x) / dim

        # [start, end) is the current batch of shard rows.
        start, end = 0, min(args.batch_size, size)
        while True:
            # Snapshot of x at the start of the batch, used in the update.
            x_prime = x.copy()
            for i in range(start, end):
                # Abandon the rest of the batch when a cancel is signalled.
                if util.check_cancel(i, start, end):
                    logging.info('restart computation')
                    break
                # compute update
                u = lambda_ / dim * gamma * x - gamma * (
                    g - lambda_ / dim * x) - gamma * np.sum(
                        shard[i] * (x - x_prime)) * shard[i].T
                # update local vector
                x = x - u
                kv_d = kv_d - mx.nd.array(u.getA())

            # exchange with kvstore
            util.update_param(kvstore,
                              kv_d,
                              kv_x,
                              pull_only=util.need_restart())
            # Reset the accumulator and continue from the value in kv_x.
            kv_d = mx.nd.zeros((dim, 1))
            x = kv_x.asnumpy()

            # Advance to the next batch only when no restart was requested;
            # otherwise the same [start, end) range is processed again.
            if not util.need_restart():
                start, end = end, min(end + args.batch_size, size)
                if start == end:
                    break
            util.reset_cancel()

        # compute objective
        loss = size / dim * np.dot((lambda_ / 2 * x - b).T, x)
        for i in range(*interval):
            loss -= (A[i] * x)**2 / 2
        # NOTE(review): the reason for the constant "+ 2" is not visible here.
        logging.info('Epoch[{}] loss={}'.format(epoch, np.sum(loss) + 2))
Exemplo n.º 8
0
def main(data_file, vocab_path):
    """Train and score Naive Bayes models on the Federalist Papers.

    Parses the essays, builds a feature matrix from the function-word list,
    and prints the accuracy of a multinomial model, a Bernoulli model and a
    zero-rule (most frequent class) baseline on a held-out split.
    """
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)

    # Feature matrix for the attributed essays.  The task is a two-class
    # problem: every essay ends up labelled zero or one.
    X = load_features(essays, function_words)

    # The label mapping is printed so the author <-> class assignment can be
    # tracked by hand.
    labels_map = labels_to_key(authors)
    print(labels_map)

    # y holds one 0/1 entry per essay, in essay order.  It is the gold
    # standard used for both training and evaluation.
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")

    # split_data is relied on to do any shuffling itself.
    train, test = split_data(X, y, 0.25)

    # Multinomial model.  `train` and `test` are indexed as (features,
    # targets) pairs.  The full y has more entries than the training
    # features have rows, so the trimmed labels stored inside `train` must
    # be used instead of y itself.
    multinomial_model = MultinomialNB()
    multinomial_model.fit(train[0], train[1])
    multinomial_predictions = multinomial_model.predict(test[0])
    test_y = test[1]
    accuracy = calculate_accuracy(multinomial_predictions, test_y)
    print(f" the accuracy for multinomial NB model is {accuracy}")

    # Bernoulli model, evaluated on the same split.
    bernoulli_model = BernoulliNB()
    bernoulli_model.fit(train[0], train[1])
    bernoulli_predictions = bernoulli_model.predict(test[0])
    accuracy = calculate_accuracy(bernoulli_predictions, test_y)
    print(f" the accuracy for Bernoulli NB model is {accuracy}")

    # Zero-rule baseline: always predict the most frequent training class.
    train_y = train[1]
    most_frequent_class = find_zero_rule_class(train_y)
    print(f"the most frequent class is {most_frequent_class}")
    baseline_predictions = apply_zero_rule(test[0], most_frequent_class)
    test_accuracy = calculate_accuracy(baseline_predictions, test_y)
    print(f" the accuracy for the baseline is {test_accuracy}")
Exemplo n.º 9
0
def trial_init(recdr, logr):
    """Split the shared rows and build the solvers for a trial.

    ``rows`` and ``b`` are not defined in this function; they are read from
    the enclosing scope.

    Args:
        recdr: Not referenced in the body (see the NOTE further down).
        logr: Not referenced in the body.

    Returns:
        tuple: ``(train, test_users, b, solvers)`` where ``test_users`` is the
        list of first elements of the test rows.
    """
    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # Each row is a (user, message, response) triple.  A comprehension is
    # used because tuple-unpacking lambdas are a syntax error on Python 3.
    test_users = [u for (u, _m, _r) in test]
    controls = su.build_std_control_solvers(calibrate, b, 100, 15)
    # NOTE(review): `recorder` is resolved from the enclosing scope, not from
    # the `recdr` parameter - confirm which one is intended.
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
Exemplo n.º 10
0
def trial_init(recdr, logr):
    """Split the shared rows and build the solvers for a trial.

    ``rows`` and ``b`` are not defined in this function; they are read from
    the enclosing scope.

    Args:
        recdr: Not referenced in the body (see the NOTE further down).
        logr: Not referenced in the body.

    Returns:
        tuple: ``(train, test_users, b, solvers)`` where ``test_users`` is the
        list of first elements of the test rows.
    """
    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # Each row is a (user, message, response) triple.  A comprehension is
    # used because tuple-unpacking lambdas are a syntax error on Python 3.
    test_users = [u for (u, _m, _r) in test]
    controls = su.build_std_control_solvers(calibrate, b, 100, 15)
    # NOTE(review): `recorder` is resolved from the enclosing scope, not from
    # the `recdr` parameter - confirm which one is intended.
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
Exemplo n.º 11
0
    def my_data(self):
        """Load the CIFAR arrays and split them into three sets.

        Reads ``cifar_images.npy`` and ``cifar_labels.npy`` from
        ``self.args.data_dir``, divides the images by 255, reshapes them to
        ``[-1, 32, 32, 3]``, shuffles images and labels together, one-hot
        encodes the labels with depth 100 and then calls ``util.split_data``
        with ``self.test_set_size`` and ``self.validation_set_size`` to carve
        out the test and validation sets.

        Returns:
            dict: the arrays under the keys ``train_images``,
            ``train_labels``, ``test_images``, ``test_labels``,
            ``validation_images`` and ``validation_labels``.
        """
        data_dir = self.args.data_dir

        # load training data
        train_images = util.np.load(os.path.join(data_dir, 'cifar_images.npy'))

        # normalize data; the float divisor keeps integer pixel arrays from
        # being floor-divided when run under Python 2
        train_images = train_images / 255.0

        # reshape to fit input tensor;
        # `-1` means "everything not otherwise accounted for"
        train_images = np.reshape(train_images, [-1, 32, 32, 3])

        # load training labels and shuffle them together with the images
        train_labels = util.np.load(os.path.join(data_dir, 'cifar_labels.npy'))
        train_images, train_labels = util.shuffler(train_images, train_labels)

        # convert labels to one-hots.  The context manager closes the session
        # afterwards; a bare tf.Session() would leak its resources.
        with tf.Session() as sess:
            train_labels = sess.run(tf.one_hot(train_labels, 100))

        # set up test set
        test_images, train_images = util.split_data(train_images,
                                                    self.test_set_size)
        test_labels, train_labels = util.split_data(train_labels,
                                                    self.test_set_size)

        # set up validation set
        validation_images, train_images = util.split_data(
            train_images, self.validation_set_size)
        validation_labels, train_labels = util.split_data(
            train_labels, self.validation_set_size)

        return {
            "train_images": train_images,
            "train_labels": train_labels,
            "test_images": test_images,
            "test_labels": test_labels,
            "validation_images": validation_images,
            "validation_labels": validation_labels,
        }
Exemplo n.º 12
0
def gen_dataset(sentences, categories, max_words=78, train_test_split=True):
    ''' Build (input, output) arrays for tagging: each input is a sequence
        of 300-dimensional word vectors, each output the matching sequence
        of one-hot encoded categories.

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        categories : list
                     per-sentence categories, indexed like `sentences`
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets

        Returns
        -------
        With a split: (X_train, X_test), (y_train, y_test),
        (K_train, K_test), param_dict.  Without: (X, y, K), param_dict.
        K holds the number of words of every sentence.
    '''

    num_sentences = len(sentences)
    word_vectors = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)

    def vectorizer(token):
        # Tokens missing from the embedding model map to the zero vector.
        if token in word_vectors:
            return word_vectors[token]
        return np.zeros(300)

    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder.keys())))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {'max_words': max_words, 'encoder': encoder}

    for sent_i in I:
        tokens = sentences[sent_i]
        token_categories = categories[sent_i]

        # Progress report every 1000 sentences.
        if sent_i % 1000 == 0:
            remaining = num_sentences - sent_i - 1
            print("{} sentences parsed. {} remaining.".format(
                sent_i, remaining))

        inputs, targets = prepare_sentence(tokens,
                                           categories=token_categories,
                                           vectorizer=vectorizer,
                                           encoder=encoder,
                                           max_words=max_words)
        X[sent_i, :, :] = inputs
        y[sent_i, :, :] = targets

        K[sent_i] = len(tokens)  # keep track of num words in sentence

    if not train_test_split:
        return (X, y, K), param_dict

    # The sentence indices are split alongside X so that y and K can be
    # partitioned the same way.
    (X_train, X_test), (I_train, I_test) = util.split_data(X,
                                                           out_data=I,
                                                           frac=0.80)
    y_train, y_test = y[I_train], y[I_test]
    K_train, K_test = K[I_train], K[I_test]

    return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
Exemplo n.º 13
0
def train_lstm(inputs,
               outputs,
               state_size,
               batch_size=256,
               param_scale=0.001,
               num_epochs=5,
               step_size=0.001):
    """Fit LSTM parameters with Adam on mini-batches and return them.

    The data is first split by ``util.split_data(..., frac=0.80)`` into a
    training and a holdout part.  Mini-batches are slices along axis 1 of the
    training arrays; axis 2 gives the input/output sizes.  Once per epoch a
    row with training and holdout accuracy and log-likelihood is printed.

    Args:
        inputs: Input array, indexed as ``[:, batch, feature]``.
        outputs: Output array with the same layout as ``inputs``.
        state_size: Passed to ``init_lstm_params``.
        batch_size: Number of axis-1 entries per mini-batch.
        param_scale: Passed to ``init_lstm_params``.
        num_epochs: Number of passes over the training batches.
        step_size: Adam step size.

    Returns:
        The parameters returned by ``adam``.
    """
    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)

    input_size = tr_inputs.shape[2]
    output_size = tr_outputs.shape[2]

    init_params = init_lstm_params(input_size,
                                   state_size,
                                   output_size,
                                   param_scale=param_scale,
                                   rs=npr.RandomState(0))

    # float() keeps the ceiling meaningful when `/` is integer division
    # (Python 2); on Python 3 the result is unchanged.
    num_batches = int(np.ceil(tr_inputs.shape[1] / float(batch_size)))

    def batch_indices(iteration):
        # Cycle through the batches; the last slice may be shorter.
        idx = iteration % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    def objective(params, iteration):
        idx = batch_indices(iteration)
        return -lstm_log_likelihood(
            params, tr_inputs[:, idx, :], tr_outputs[:, idx, :])

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |    Train accuracy  |    Train log-like  |  Holdout accuracy  |  Holdout log-like  ")

    def print_perf(params, iteration, gradient):
        # Report once per epoch; evaluating the full data on every
        # mini-batch step would dominate the run time.
        if iteration % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        train_ll = -lstm_log_likelihood(params, tr_inputs, tr_outputs)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = -lstm_log_likelihood(params, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            iteration // num_batches, train_acc, train_ll, valid_acc,
            valid_ll))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # One optimizer iteration consumes one mini-batch, so a run of
    # `num_epochs` epochs needs num_epochs * num_batches iterations.
    optimized_params = adam(objective_grad,
                            init_params,
                            step_size=step_size,
                            num_iters=num_epochs * num_batches,
                            callback=print_perf)

    return optimized_params
Exemplo n.º 14
0
def gen_dataset(sentences, max_words=78, train_test_split=True):
    ''' Build (input, output) arrays for lemmatization: both input and
        output are sequences of 300-dimensional word vectors, the output
        being produced with the help of a WordNet lemmatizer.

        Args
        ----
        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets

        Returns
        -------
        With a split: (X_train, X_test), (y_train, y_test),
        (K_train, K_test), param_dict.  Without: (X, y, K), param_dict.
        K holds the number of words of every sentence.
    '''

    num_sentences = len(sentences)
    word_vectors = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)

    def vectorizer(token):
        # Tokens missing from the embedding model get a constant vector
        # filled with ZERO_EPSILON.
        if token in word_vectors:
            return word_vectors[token]
        return np.ones(300) * ZERO_EPSILON

    lemmatizer = WordNetLemmatizer().lemmatize

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {'max_words': max_words}

    for sent_i, tokens in enumerate(sentences):
        # Progress report every 1000 sentences.
        if sent_i % 1000 == 0:
            remaining = num_sentences - sent_i - 1
            print("{} sentences parsed. {} remaining.".format(
                sent_i, remaining))

        inputs, targets = prepare_sentence(tokens,
                                           vectorizer=vectorizer,
                                           lemmatizer=lemmatizer,
                                           max_words=max_words)
        X[sent_i, :, :] = inputs
        y[sent_i, :, :] = targets

        K[sent_i] = len(tokens)  # keep track of num words in sentence

    if not train_test_split:
        return (X, y, K), param_dict

    # The sentence indices are split alongside X so that y and K can be
    # partitioned the same way.
    (X_train, X_test), (I_train, I_test) = split_data(X,
                                                      out_data=I,
                                                      frac=0.80)
    y_train, y_test = y[I_train], y[I_test]
    K_train, K_test = K[I_train], K[I_test]

    return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
Exemplo n.º 15
0
def opcion_1(data):
    """Train a neural network on `data`, plot its loss and evaluate it.

    The data is pre-processed into a training and a test part, both are
    separated into inputs and outputs, a network is trained for a fixed
    number of epochs and then handed to ``opcion_4`` with the test part.

    Returns the trained network together with the test inputs and outputs.
    """
    training, test = pre_proc(data, 70)
    entrada, salida = split_data(training, salida_columns)
    entrada_test, salida_test = split_data(test, salida_columns)

    # The topology is the list with the number of neurons of every layer:
    # input size, two hidden layers (8 and 4), output size.  The activation
    # function given to the network is used by every layer.
    topology = [entrada[0].size, 8, 4, salida[0].size]
    learning_rate = 0.1
    epochs = 500

    print('Creando red neuronal de topologia: ', topology)
    nn = neuronal_network.Network(topology, Sigmoid, MSE)

    print('Iniciando entrenamiento...')
    training_error = nn.train(entrada, salida, learning_rate, epochs)
    plot_loss(training_error, error_path)
    print('Entrenamiento finalizado.')

    # Evaluate the freshly trained network on the held-out part.
    opcion_4(nn, entrada_test, salida_test)

    return nn, entrada_test, salida_test
Exemplo n.º 16
0
def trainAndSaveModel(X_train,
                      y_train,
                      y_label_index,
                      max_iterations=7000,
                      folds=False):
    """Train one logistic regression per fold and dump the averaged model.

    The folds come from ``util.split_data(y_index=y_label_index)``.  The
    coefficients and intercepts of all fold models are summed, divided by
    ``util.K`` and stored, together with the per-fold accuracies and
    predictions, through ``util.dumpVar``.

    Args:
        X_train: Only its second dimension is read, as the feature count.
        y_train: Not read; the fold data comes from ``util.split_data``.
        y_label_index: Forwarded to ``util.split_data`` as ``y_index``.
        max_iterations: ``max_iter`` of every ``LogisticRegression``.
        folds: When true, a confusion matrix is written for the training
            and evaluation part of every fold.

    Returns:
        None.
    """
    n_features = X_train.shape[1]
    n_classes = len(util.classes)

    avg_coefficients = np.zeros((n_classes, n_features))
    avg_intercepts = np.zeros(n_classes)

    data_kfold = util.split_data(y_index=y_label_index)
    train_accuracies = []
    eval_accuracies = []
    train_predictions = []
    eval_predictions = []

    # The fold variables get their own names so they no longer shadow the
    # X_train / y_train parameters.
    for i, (X_fold, y_fold, X_val, y_val) in enumerate(data_kfold):
        fold_number = i + 1
        print("Fold", fold_number)
        clf = LogisticRegression(max_iter=max_iterations,
                                 multi_class='multinomial',
                                 solver='newton-cg')
        clf.fit(X_fold, y_fold)

        train_acc = clf.score(X_fold, y_fold)
        train_accuracies.append(train_acc)
        eval_acc = clf.score(X_val, y_val)
        eval_accuracies.append(eval_acc)

        avg_coefficients += clf.coef_
        avg_intercepts += clf.intercept_

        # Predict once per split and reuse the result for the confusion
        # matrices instead of running the model a second time.
        fold_train_pred = clf.predict(X_fold)
        fold_eval_pred = clf.predict(X_val)
        train_predictions.append(fold_train_pred)
        eval_predictions.append(fold_eval_pred)

        if folds:
            util.outputConfusionMatrix(
                fold_train_pred, y_fold,
                "../figures/fold_" + str(fold_number) + "_train")
            util.outputConfusionMatrix(
                fold_eval_pred, y_val,
                "../figures/fold_" + str(fold_number) + "_eval")

        print("train accuracy:", train_acc)
        print("eval accuracy:", eval_acc)

    # NOTE(review): assumes util.split_data yields exactly util.K folds -
    # confirm, otherwise this is not the mean.
    avg_coefficients /= util.K
    avg_intercepts /= util.K
    model = {
        "coeff_": avg_coefficients,
        "intercept_": avg_intercepts,
        "train_accuracies": train_accuracies,
        "eval_accuracies": eval_accuracies,
        "train_predictions": train_predictions,
        "eval_predictions": eval_predictions
    }

    util.dumpVar("../models/avg_logistic_model", model)
Exemplo n.º 17
0
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Execute the "Locally-Weighted" Linear Regression (using Closed-Form Linear Regression)
    :param data: Raw Data frame parsed from CSV
    :param training_data_ratio: The percent (0.0 to 1.0) of input data to use in training.
    :param k: Smoothing parameter for local weight computation
    :return: The root mean squared error (RMSE) over the test samples
    :raises ZeroDivisionError: if the split leaves no test samples
    """
    # 2. Randomize the data
    randomized_data = util.randomize_data(data)

    # 3. Select the first 2 / 3(round up) of the data for training and the remaining for testing
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)
    training_outputs = util.get_output(training_data)

    # 4. Standardize the data(except for the last column of course) using the training data
    standardized_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))

    # Add offset column at the front
    standardized_training_data.insert(0, "Bias", 1)

    # The test features are standardized with the training mean and std.
    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    squared_errors = []
    # 5. Then for each testing sample
    for i in range(len(std_test_data)):

        testing_sample = std_test_data.iloc[i]
        # The expected output is the last column of the matching raw row;
        # .iloc makes the positional access explicit instead of relying on
        # the label-or-position fallback of Series[-1].
        expected_output = test_data.loc[testing_sample.name].iloc[-1]

        # (a) Fit a model that is local to this testing sample.
        theta_query = compute_theta_query(testing_sample,
                                          standardized_training_data,
                                          training_outputs, k)

        # (b) Evaluate the testing sample using the local model.
        actual_output = np.dot(testing_sample, theta_query)

        # (c) Compute the squared error of the testing sample.
        squared_errors.append(util.compute_se(expected_output, actual_output))

    # 6. Compute the root mean squared error (RMSE)
    mean_squared_error = sum(squared_errors) / len(squared_errors)

    return math.sqrt(mean_squared_error)
def test_share_data(ident_service):
    """Check that sharing a subset of user data creates a readable share.

    The full record is uploaded first; a subset is then shared with a service
    provider, who must be able to decrypt it and verify the user's signature.
    """
    # Upload the user's full record.
    full_record = json.dumps({'first_name': 'Bob',
                              'last_name': 'Smith'}).encode()
    upload = UserData(full_record, sign(user_1_private_key, full_record))
    upload_hash = ident_service.add_user_data(upload, user_1_cert_string)

    # Sign the subset that is going to be shared.
    subset = json.dumps({'first_name': 'Bob'}).encode()
    share_request = UserData(subset, sign(user_1_private_key, subset))

    # Share it with the service provider.
    share_hash = ident_service.share_data(share_request, user_1_cert_string,
                                          sp_1_cert)

    # The share must now be the head of the chain ...
    assert ident_service.last_transaction_hash == share_hash

    # ... and must link back to the upload transaction.
    share_tx = ident_service.get_transaction(share_hash)
    assert upload_hash == share_tx.hash_pointer

    # Acting as the service provider: unwrap the encryption key with the
    # provider's private key, then decrypt the shared data with it.
    wrapped_key, ciphertext = split_data(share_tx.action.get_data())
    data_key = decrypt_private(sp_1_private_key, wrapped_key)
    plaintext = decrypt(data_key, ciphertext)
    received_message, received_signature = split_data(plaintext)

    # The data must carry the user's valid signature.
    verify(user_1_cert, received_message, received_signature)

    # The received message must be exactly the subset that was shared.
    assert received_message == subset
Exemplo n.º 19
0
def execute(dataframe, training_data_ratio=2.0 / 3):
    """
    Run the multi-class SVM experiment in both of its variants
    :param dataframe: The input dataset containing the classifier as the last column
    :param training_data_ratio: The percentage of data to use for training (default: 2/3)
    :return: The one-vs-many metrics and the accuracy of the one-vs-one SVM
    """

    # A fixed seed makes the results repeatable.
    random.seed(0)

    # 2. Shuffle the rows.
    shuffled = util.randomize_data(dataframe)

    # 3. The first part of the shuffled data trains, the rest tests.
    train_part, test_part = util.split_data(shuffled, training_data_ratio)

    # 4. Standardize the features with statistics of the training part only.
    train_features, train_targets = util.split_features_target(train_part)
    std_train, mean, std = util.standardize_data(train_features)
    # A zero standard deviation produces NaN entries; reset them to zero.
    std_train.fillna(0, inplace=True)

    test_features, test_targets = util.split_features_target(test_part)
    std_test, _, _ = util.standardize_data(test_features, mean, std)
    # Same NaN clean-up for the test features.
    std_test.fillna(0, inplace=True)

    classes = train_targets.unique()

    # Both variants take the same positional arguments.
    svm_args = (std_test, std_train, classes, test_targets, train_targets)

    # 5. One vs All.
    one_vs_many_metrics = execute_one_vs_many(*svm_args)

    # 6. One vs One reports the number of misclassified test samples.
    misclassified = execute_one_vs_one(*svm_args)

    test_count = len(test_features)
    one_vs_one_accuracy = (test_count - misclassified) / float(test_count)

    return one_vs_many_metrics, one_vs_one_accuracy
Exemplo n.º 20
0
def load_data(data_file, test_ratio_offset):
    """Read a label-first CSV file and return a class-balanced train/test split.

    The first field of every row is the class: ``'p'`` becomes ``[0.]``,
    anything else ``[1.]``.  Every other field is encoded as
    ``float(ord(field))``, with ``'?'`` encoded as if it were ``'a'``.
    The rows are shuffled and split 90/10 through ``util``; the split is
    repeated until both classes make up at least
    ``(0.5 - test_ratio_offset)`` of the test labels.

    Args:
        data_file: Path of the CSV file.
        test_ratio_offset: Allowed deviation from a 50/50 class balance in
            the test set.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.

    Raises:
        IOError / OSError: if ``data_file`` cannot be opened.
    """
    x = []
    y = []
    with open(data_file, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # '?' marks a missing value and is treated like 'a'.
            attributes = ['a' if value == '?' else value
                          for value in row[1:]]
            x.append([float(ord(value)) for value in attributes])
            y.append([0. if row[0] == 'p' else 1.])

    def _class_count(labels, wanted):
        # Number of single-element label rows whose value equals `wanted`.
        return sum(1 for result in labels if result[0] == wanted)

    split_ratio = 0.9

    x, y = util.shuffle_data(x, y)
    x_train, x_test, y_train, y_test = util.split_data(x, y, split_ratio)

    # Check that we can create a confusion matrix
    # (enough positive and negative samples in the test set).
    # NOTE(review): this only terminates if util.split_data can return a
    # different split on a later call - confirm that it randomizes.
    while (_class_count(y_test, 0.) < (0.5 - test_ratio_offset) * len(y_test)
           or _class_count(y_test, 1.) <
           (0.5 - test_ratio_offset) * len(y_test)):
        x_train, x_test, y_train, y_test = util.split_data(
            x, y, split_ratio)

    return x_train, x_test, y_train, y_test
Exemplo n.º 21
0
def create_features_labels(save_to_disk=False):
    ''' Use pipeline to generate the (input, output)
        pairs for machine learning

        Args
        ----
        save_to_disk : boolean
                       write the train/test arrays to ``.npy`` files

        Returns
        -------
        ((tr_inputs, te_inputs), (tr_outputs, te_outputs))
            the 80/20 split of the gram features and their labels,
            exactly as returned by ``split_data``

    '''

    lexicon = ctd.load_data(ctd.brown_generator(), return_sent_labels=True)

    # not efficient --> change me
    data = np.array([[t, l] for t, l in lexicon])
    tokens = data[:, 0]
    labels = data[:, 1]

    # get features
    darrays = get_descriptor_arrays(tokens)
    labels = get_dummies(labels)

    # put into grams (give context)
    didx, darrays, labels = make_grams(darrays,
                                       3,
                                       labels=labels,
                                       target_tag=EOS_PUNC)

    (tr_inputs, te_inputs), (tr_outputs,
                             te_outputs) = split_data(darrays,
                                                      out_data=labels,
                                                      frac=0.80)

    if save_to_disk:
        # np.save takes (file, array): the array is an argument of np.save,
        # not of local_ref, which only resolves the target path.
        targets = (
            ('../storage/sentence_disambiguation/X_train.npy', tr_inputs),
            ('../storage/sentence_disambiguation/X_test.npy', te_inputs),
            ('../storage/sentence_disambiguation/y_train.npy', tr_outputs),
            ('../storage/sentence_disambiguation/y_test.npy', te_outputs),
        )
        for path, array in targets:
            np.save(local_ref(path), array)

    return (tr_inputs, te_inputs), (tr_outputs, te_outputs)
Exemplo n.º 22
0
    def execute(self, dataframe):
        """
        Execute the Binary-Artificial Neural Network problem

        The data is randomized, split into training and test partitions,
        standardized with the training statistics, used to train the
        network, and finally scored on the test partition.

        :param dataframe: Input raw data
        :return: (final test error, list of training errors for each training iteration)
        """
        # Progress messages use single-argument print() calls so the method
        # produces the same output under Python 2 and Python 3.

        # 2. Randomizes the data.
        print("Randomizing Data")
        random_data = util.randomize_data(dataframe)

        # 3. Selects the first 2/3 (round up) of the data for training and the remaining for testing
        print("Splitting Test and Training Data")
        training_data, test_data = util.split_data(random_data,
                                                   self._training_data_ratio)

        # 4. Standardizes the data (except for the last column of course as well as the bias feature)
        #    using the training data
        print("Standardizing Training Data")
        standardized_training_data, mean, std = util.standardize_data(
            self.__select_features(training_data))

        # 5. Trains an artificial neural network using the training data
        #    Our last column is the label column
        # 6. During the training process, compute the training error after each iteration.
        #    You will use this to plot the training error vs. iteration number.
        expected_training_outputs = self.__select_target_labels(
            training_data).values.reshape(-1, 1)
        print("Training Neural Network")
        training_errors = self._network.train_binary(
            standardized_training_data, expected_training_outputs,
            self._iterations)

        # 7. Classifies the testing data using the trained neural network.
        #    The test features are scaled with the training mean/std.
        print("Classifying Testing Data")
        expected_test_output = self.__select_target_labels(test_data)
        std_test_data, _, _ = util.standardize_data(
            self.__select_features(test_data), mean, std)

        actual_test_output = self._network.evaluate(std_test_data.values)

        # 8. Compute the testing error.
        print("Computing Metrics")
        self.__update_metrics(expected_test_output, actual_test_output)
        test_error = self._metrics.calculate_error()
        # Two spaces after the colon reproduce the output of the former
        # `print "Test Error: ", test_error` statement.
        print("Test Error:  %s" % (test_error,))

        return test_error, training_errors
Exemplo n.º 23
0
def train_nn(
        inputs, outputs, num_hiddens,  # don't include inputs and outputs
        batch_size=256, param_scale=0.1,
        num_epochs=5, step_size=0.001, L2_reg=1.0):
    """Train a network with Adam on mini-batches and return its parameters.

    The data is split 80/20 into a training and a holdout set.  Once per
    epoch a table row with the accuracy and log-posterior on both sets is
    printed.

    Args:
        inputs: Input samples, passed to ``util.split_data``.
        outputs: Targets matching ``inputs``.
        num_hiddens: List of hidden layer sizes (input and output layers are
            added automatically from the data shapes).
        batch_size: Number of training rows per mini-batch.
        param_scale: Scale handed to ``init_random_params``.
        num_epochs: Number of passes over the training set.
        step_size: Adam step size.
        L2_reg: Regularisation strength handed to ``log_posterior``.

    Returns:
        The optimized parameters as returned by ``adam``.
    """
    # Hold out 20% of the data for validation.
    input_parts, output_parts = util.split_data(
        inputs, out_data=outputs, frac=0.80)
    tr_inputs, va_inputs = input_parts
    tr_outputs, va_outputs = output_parts

    layer_sizes = ([tr_inputs.shape[1]]
                   + num_hiddens
                   + [tr_outputs.shape[1]])
    init_params = init_random_params(param_scale, layer_sizes)

    num_train = tr_inputs.shape[0]
    num_batches = int(np.ceil(num_train / batch_size))

    def batch_slice(step):
        # Cycle through the batches; the last one may be shorter.
        start = (step % num_batches) * batch_size
        return slice(start, start + batch_size)

    def objective(params, step):
        # Negative log-posterior of the current mini-batch.
        batch = batch_slice(step)
        return -log_posterior(
            params, tr_inputs[batch], tr_outputs[batch], L2_reg)

    # Gradient of the objective via autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |    Train accuracy  |    Train log-like  |  Holdout accuracy  |  Holdout log-like  ")

    def report(params, step, gradient):
        # Only report at the start of each epoch.
        if step % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        train_ll = log_posterior(params, tr_inputs, tr_outputs, L2_reg)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = log_posterior(params, va_inputs, va_outputs, L2_reg)
        row = "{:15}|{:20}|{:20}|{:20}|{:20}".format(
            step // num_batches, train_acc, train_ll, valid_acc, valid_ll)
        print(row)

    total_iters = num_epochs * num_batches
    return adam(
        objective_grad, init_params, step_size=step_size,
        num_iters=total_iters, callback=report)
Exemplo n.º 24
0
def main(data_file):
    """Run the zero-rule baseline on the Federalist Papers data set.

    Parses ``data_file``, encodes the author labels as integers, splits the
    essays into train and test sets, learns the most frequent training class
    and reports the accuracy of always predicting it on the test set.
    Progress and results are printed; nothing is returned.
    """
    print(data_file)

    # Parse the corpus.
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    print(f"Working with {len(essays)} reviews")

    # Map each author id string to an integer class.
    author_key = labels_to_key(authors)
    print(len(author_key))
    print(author_key)

    # Encode every label with that mapping.
    y = labels_to_y(authors, author_key)
    num_labels = len(authors)
    assert y.size == num_labels, f"Size of label array (y.size) must equal number of labels {len(authors)}"

    # Shuffle and split; both halves together must still cover every label.
    train, test = split_data(essays, y, 0.3)
    train_y = train[1]
    test_y = test[1]
    data_size_after = len(train_y) + len(test_y)

    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # Learn the zero rule from the training labels.
    most_frequent_class = find_zero_rule_class(train_y)

    print(most_frequent_class)

    # Translate the class number back into the author string.
    reverse_author_key = {number: name for name, number in author_key.items()}
    most_frequent_author = reverse_author_key[most_frequent_class]
    print(
        f"The most frequent class is {most_frequent_author}"
    )

    # Predict the majority class for every held-out essay.
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    print(f"Zero rule predictions on held-out data: {test_predictions}")

    # Score the predictions.
    test_accuracy = calculate_accuracy(test_predictions, test_y)
    print(f"Accuracy of zero rule: {test_accuracy:0.03f}")
Exemplo n.º 25
0
    def __split_dataset(self, df):
        """Split ``df`` into training, CV and test sets and write each to CSV.

        The destination directory and file names come from ``const``; the
        synthetic variants are used when ``self.sintetic`` is truthy.  The
        files are written without the index column, in the order
        training, test, CV.
        """
        if self.sintetic:
            dir_src = const.DIR_SINTETIC_DATASET
            file_names = (const.SINTETIC_FILE_TRAINING,
                          const.SINTETIC_FILE_TEST,
                          const.SINTETIC_FILE_CV)
        else:
            dir_src = const.DIR_DATASET
            file_names = (const.FILE_TRAINING,
                          const.FILE_TEST,
                          const.FILE_CV)

        training, cv, test = util.split_data(df,
                                             train_perc=const.TRAINING_PERC,
                                             cv_perc=const.CV_PERC,
                                             test_perc=const.TEST_PERC)

        # Pair every partition with its destination, keeping the write order.
        for frame, file_name in zip((training, test, cv), file_names):
            frame.to_csv(dir_src + '/' + file_name, index=False)
Exemplo n.º 26
0
def load_data(data_file):
    """Load a numeric CSV file and return a shuffled 90/10 train/test split.

    Every non-blank row is converted to floats; all columns but the last
    form the feature vector and the last column is the target, stored as a
    one-element list.

    Args:
        data_file: Path of the CSV file to read.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as produced by
        ``util.split_data`` with a 0.9 split ratio.

    Raises:
        IOError: If ``data_file`` cannot be opened (propagated from ``open``).
        ValueError: If a field cannot be converted to ``float``.
    """
    x = []
    y = []

    with open(data_file, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; they carry no sample.
                continue
            x.append([float(value) for value in row[:-1]])
            y.append([float(row[-1])])

    x, y = util.shuffle_data(x, y)
    x_train, x_test, y_train, y_test = util.split_data(x, y, 0.9)

    return x_train, x_test, y_train, y_test
Exemplo n.º 27
0
def execute(data):
    """
    Fit a closed-form linear regression on a random 2/3 of the data and
    evaluate it on the remaining samples.

    :param data: Raw Data frame parsed from CSV
    :return: (weights, rmse) - the fitted weights and the root mean squared
             error measured on the test partition
    """

    # 2. Randomizes the data
    shuffled = util.randomize_data(data)

    # 3. Selects the first 2/3 (round up) of the data for training and the remaining for testing
    train_ratio = 2.0 / 3.0
    train_part, test_part = util.split_data(shuffled, train_ratio)

    # The last column holds the value to predict
    target_column = train_part.columns[-1]
    train_targets = train_part[target_column]

    # 4. Standardizes the data (except for the last column of course) using the training data
    train_features = util.get_features(train_part)
    std_features, train_mean, train_std = util.standardize_data(
        train_features)

    # Prepend the constant offset (bias) column
    std_features.insert(0, "Bias", 1)

    # 5. Computes the closed-form solution of linear regression
    weights = find_weights(std_features, train_targets)

    # 6. Applies the solution to the testing samples
    test_features = util.get_features(test_part)
    expected = util.get_output(test_part)
    predicted = apply_solution(test_features, train_mean, train_std, weights)

    # 7. Computes the root mean squared error (RMSE)
    return weights, util.compute_rmse(expected, predicted)
Exemplo n.º 28
0
# Response propensity for each (user template, message template) pair;
# the fourth pair is given a much higher propensity than the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows per template pair plus 2000 fully random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(2000))


train, test = ut.split_data(rows, 0.7, 0.3)
# Each row is a (user, message, response) triple; keep only the user.
# A list (not a lazy map object) is required because test_users is
# consumed more than once below, and tuple-parameter lambdas do not exist
# in Python 3.
test_users = [u for u, _m, _r in test]

op = KNNOptimizer()
op.set_data_rows(train)
op.set_similarity_f(match_count)

# Control solvers: always the first of the 15 best messages, a random
# message out of 100, and a random one out of the 15 best.
best_msgs = su.n_best_messages(test_users, b, 100, 15)
msgs = su.n_best_messages(test_users, b, 100, 100)
ctrl_1 = lambda u: best_msgs[0]
ctrl_2 = lambda u: rd.sample(msgs, 1)[0]
ctrl_3 = lambda u: rd.sample(best_msgs, 1)[0]

# Selectors with constant and exponential weighting functions.
asf_1 = build_weighted_mode_selector(lambda x: 1)
asf_2 = build_weighted_mode_selector(lambda x: 10**x)
f_3_1 = lambda u: op.optimize(u, 3, asf_1)
Exemplo n.º 29
0
import sys
from util import (
    subsample_raw_data,
    split_data,
    get_feat_dict,
)

if __name__ == '__main__':
    # Data preparation pipeline: verify the raw input, create the output
    # directories, clear stale outputs, then subsample, split and build the
    # feature dictionary.
    raw_data_ready = (os.path.isdir('raw_data')
                      and os.path.exists('raw_data/train.txt')
                      and os.path.exists('raw_data/test.txt'))
    if not raw_data_ready:
        print("Please put raw data in '/raw_data'/train.txt and '/raw_data'/test.txt")
        # Nothing below can work without the raw files, so stop here
        # instead of falling through into the pipeline.
        sys.exit(1)

    for out_dir in ('train_data', 'test_data', 'aid_data'):
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

    data_size = 100000

    # Remove the outputs of a previous run before regenerating them.
    stale_files = (
        'subsampled_raw_data/subsampled_train_' + str(data_size) + ".txt",
        'train_data/train.txt',
        'test_data/test.txt',
    )
    for stale_file in stale_files:
        if os.path.exists(stale_file):
            os.remove(stale_file)

    subsample_raw_data(data_size)
    split_data(data_size)
    get_feat_dict()
    print('Done!')
Exemplo n.º 30
0
def train_network(sess, x, y, cfg):
    """Train a three-layer fully connected network and save it to disk.

    The data is split 90/10 into a training and a validation set.  Each
    epoch performs one full-batch gradient descent step on the training set
    and then evaluates (without updating the weights) the same cost on the
    validation set.  Both error curves are plotted live.  Training stops
    early when the training error reaches the configured threshold, or when
    the smoothed gap between validation and training error exceeds
    ``valid_threshold`` while still growing.

    Args:
        sess: TensorFlow session used to initialise the variables and run
            the graph.
        x: Input samples, fed to a ``[None, 22]`` float32 placeholder.
        y: Target values, fed to a ``[None, 1]`` float32 placeholder.
        cfg: Configuration mapping.  ``cfg['nn']`` must provide
            ``parameters`` (with ``neurons``, ``epochs`` and
            ``learning_rate``), ``error_threshold``, ``model_dir``,
            ``avg_factor``, ``save_epoch`` and ``valid_threshold``.

    Returns:
        ``(model, model_name, stats)`` where ``model`` is the value returned
        by the final ``util.save_model`` call, ``model_name`` is the random
        four-character model id and ``stats`` is a dict with the keys
        ``num_layers``, ``layer_width``, ``learning_rate``,
        ``time_to_train``, ``train_errors`` and ``valid_errors``.

    Note:
        ``epochs`` must be at least 1: the reported accuracy is derived from
        the training error of the last executed epoch.
    """
    # Alias our training config to reduce code
    t_cfg = cfg['nn']

    # Alias config vars to reduce code
    neurons = t_cfg['parameters']['neurons']
    epochs = t_cfg['parameters']['epochs']
    learning_rate = t_cfg['parameters']['learning_rate']
    err_thresh = t_cfg['error_threshold']
    model_dir = t_cfg['model_dir']
    avg_factor = t_cfg['avg_factor']
    save_epoch = t_cfg['save_epoch']
    valid_thresh = t_cfg['valid_threshold']

    print(
        '[ANN] \tTraining parameters: epochs={0}, learning_rate={1:.2f}, neurons={2}'
        .format(epochs, learning_rate, neurons))

    # Create validation set
    x_train, x_valid, y_train, y_valid = util.split_data(x, y, 0.9)
    x_valid, y_valid = util.shuffle_data(x_valid, y_valid)

    # Create placeholders for tensors
    x_ = tf.placeholder(tf.float32, [None, 22], name='x_placeholder')
    y_ = tf.placeholder(tf.float32, [None, 1], name='y_placeholder')

    # Generate new random weights for new network
    weights = {
        'fc1': tf.Variable(tf.random_normal([22, neurons]), name='w_fc1'),
        'fc2': tf.Variable(tf.random_normal([neurons, neurons]), name='w_fc2'),
        'fc3': tf.Variable(tf.random_normal([neurons, 1]), name='w_fc3'),
    }

    # Generate new random biases for new network
    biases = {
        'fc1': tf.Variable(tf.random_normal([neurons]), name='b_fc1'),
        'fc2': tf.Variable(tf.random_normal([neurons]), name='b_fc2'),
        'fc3': tf.Variable(tf.random_normal([1]), name='b_fc3'),
    }

    # Construct our network and return the last layer to output the result
    final_layer = construct_network(x_, weights, biases, neurons)

    # Define error function (same expression, evaluated on different feeds)
    cost_train = tf.reduce_mean(
        tf.losses.mean_squared_error(labels=y_, predictions=final_layer))
    cost_valid = tf.reduce_mean(
        tf.losses.mean_squared_error(labels=y_, predictions=final_layer))

    # Define optimiser and minimise error function task.  Only the training
    # cost gets an optimiser: the validation set must never drive updates.
    optimiser_train = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(cost_train)

    # Initialise global variables of the session
    sess.run(tf.global_variables_initializer())

    # Create error logging storage
    train_errors = []
    valid_errors = []

    # Setup our continous plot
    plt.figure()
    plt.title('Error vs Epoch')
    plt.plot(train_errors[:epochs], color='r', label='training')
    plt.plot(valid_errors[:epochs], color='b', label='validation')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid()
    plt.ion()
    plt.show()

    # Measure training time
    t_start = time.time()

    # Gap between validation and training error, and its first and second
    # differences between consecutive epochs.
    diff_err = 1.
    vel_err = 0.
    acc_err = 0.

    # Generate a new random model name for new network model
    model_name = ''.join(
        random.choice(string.ascii_lowercase + string.digits)
        for _ in range(4))
    model_path = os.path.join(model_dir, model_name + "_model")

    for i in range(epochs):
        # One gradient descent step on the training set
        _, train_error = sess.run([optimiser_train, cost_train],
                                  feed_dict={
                                      x_: x_train,
                                      y_: y_train
                                  })
        # Evaluate only on the validation set: running the optimiser here
        # would train the network on the held-out data and make the
        # validation error meaningless for early stopping.
        valid_error = sess.run(cost_valid,
                               feed_dict={
                                   x_: x_valid,
                                   y_: y_valid
                               })

        # If we're at a save epoch, save!
        if i % save_epoch == 0:
            model = util.save_model(
                sess, weights, biases, neurons, train_errors, model_path)

        # Add new errors to list
        train_errors.append(train_error)
        valid_errors.append(valid_error)

        # If we have at least an averageable amount of samples
        if i > avg_factor:
            # Average over the last avg_factor epochs (most recent first)
            avg_train_error = sum(
                train_errors[i - j] for j in range(avg_factor))
            avg_valid_error = sum(
                valid_errors[i - j] for j in range(avg_factor))
            avg_train_error /= avg_factor
            avg_valid_error /= avg_factor

            # Calculate change in velocity of error difference
            acc_err = vel_err - (diff_err -
                                 abs(avg_valid_error - avg_train_error))

            # Calculate change in error difference (positive -> convergence, negative -> divergence)
            vel_err = diff_err - abs(avg_valid_error - avg_train_error)

            # Calculate error difference between validation and training
            diff_err = abs(avg_valid_error - avg_train_error)

        # If we already have our target error, or validation and training
        # error are too far apart and still diverging, terminate early
        if train_error <= err_thresh or (diff_err > valid_thresh
                                         and vel_err < 0.):
            break

        # Set plot settings
        if i > 0:
            plt.plot(train_errors[:epochs], color='r', label='training')
            plt.plot(valid_errors[:epochs], color='b', label='validation')
            plt.axis([0, i, 0., 1.])
            plt.draw()
            plt.pause(0.001)

    plt.ioff()

    t_elapsed = time.time() - t_start

    # Calculate new simple accuracy from final error
    accuracy = 1 - train_error

    # Save model to file
    model = util.save_model(sess, weights, biases, neurons, train_errors,
                            model_path)

    print('\n[ANN] Training Completed:')

    # Calculate number of minutes, seconds and milliseconds elapsed
    t_m = t_elapsed / 60
    t_s = t_elapsed % 60
    t_ms = (t_s % 1) * 1000

    print('[ANN]\tModel name: {0}'.format(model_name))
    print('[ANN]\tSimple model accuracy: {0:.3f}%'.format(accuracy * 100))
    print('[ANN]\tTime elapsed: {0:2d}m {1:2d}s {2:3d}ms'.format(
        int(t_m), int(t_s), int(t_ms)))

    return model, model_name, {
        'num_layers': len(weights),
        'layer_width': neurons,
        'learning_rate': learning_rate,
        'time_to_train': t_elapsed,
        'train_errors': [float(i) for i in train_errors],
        'valid_errors': [float(i) for i in valid_errors]
    }
Exemplo n.º 31
0
def get_bank_data():
    """Preprocess the bank data set and return its split.

    The features and responses produced by ``preprocess_bank_data`` are
    handed to ``split_data``; its result is returned unchanged.
    """
    bank_features, bank_response = preprocess_bank_data()
    split = split_data(bank_features, bank_response)
    return split
import tensorflow as tf
import pandas as pd
from random import randint
from model import train_function
from util import split_data

# Load the images and their labels from the shared course directory.
images=np.load('/work/cse496dl/shared/homework/01/fmnist_train_data.npy')
labels=np.load('/work/cse496dl/shared/homework/01/fmnist_train_labels.npy')

# One-hot encode the labels: one row per label, one column per class
# (0 .. max label), with a 1 in the column of the label's class.
labels_oh = np.zeros((labels.astype(int).size, labels.astype(int).max()+1))
labels_oh[np.arange(labels.size),labels.astype(int)] = 1

# Split into train, validation and test sets.  Images and labels are split
# by two separate calls with identical arguments.
# NOTE(review): presumably 0.7/0.1/0.2 are the partition fractions and 123
# is a seed that keeps both splits aligned -- confirm against util.split_data.
train_images, val_images, test_images = split_data(images, 0.7, 0.1, .2, 123)
train_labels, val_labels, test_labels = split_data(labels_oh, 0.7, 0.1, .2, 123)



# Hyper-parameter candidates; each list currently holds a single value.
filepath='/work/cse496dl/dmle/'
c=0
hiddenlayers=[5]
batchsize=[128]
learningrate=[0.001]
regularization=[tf.contrib.layers.l2_regularizer(scale=0.01)]
# Empty frame, presumably filled with one row per configuration by the
# loops that follow -- TODO confirm.
results=pd.DataFrame()
for h in hiddenlayers:
    for b in batchsize:
        for l in learningrate:
Exemplo n.º 33
0
import numpy as np
from sklearn.tree import DecisionTreeRegressor

import util

# Loading and Cleaning

# Read the inputs and outputs from the CSV file.
x, y = util.load_inputs_and_outputs("data/ENB2012_data.csv")

# Partition into train and test sets using util.split_data's defaults.
x_train, x_test, y_train, y_test = util.split_data(x, y)

# Feature selection and Visualisation

# Both helpers only receive the training data.
util.visualise(x_train, y_train)
util.spearman(x_train, y_train)

# Remove X8 from features as discussed in report
# (column index 7 along axis 1, dropped from both partitions).
x_train = np.delete(x_train, 7, 1)
x_test = np.delete(x_test, 7, 1)

# Training and optimisation
util.plot_depth_accuracy(x_train, y_train)

# Evaluation

# One depth-6 regression tree per output column; the fixed random_state
# keeps the fitted trees reproducible.
y1_model = DecisionTreeRegressor(max_depth=6, random_state=42)
y1_model.fit(x_train, y_train[:, 0])

y2_model = DecisionTreeRegressor(max_depth=6, random_state=42)
y2_model.fit(x_train, y_train[:, 1])

# Ground truth for the first output column on the test partition.
y1_test = y_test[:, 0]
Exemplo n.º 34
0
def execute(data, training_data_ratio=2.0 / 3):
    """
    Execute the Naive Bayes classification

    The data is randomized and split, the features are standardized with
    the training statistics, a normal model (mean, standard deviation) is
    built per feature and class, and every test sample is assigned the class
    with the highest posterior.

    :param data: Dataframe containing training and test data
    :param training_data_ratio: Ratio handed to util.split_data to size the
                                training partition
    :return: BinaryClassifierMetric built from the true/false
             positive/negative counts on the test partition
    """
    # Single-argument print() calls, range() and dict.items() keep this
    # function runnable under both Python 2 and Python 3.

    spam_class_name = 1
    not_spam_class_name = 0

    # 2. Randomize the data.
    print("Randomizing Data")
    randomized_data = util.randomize_data(data)

    # 3. Split the data in for training and testing
    print("Splitting Data for Test and Training")
    training_data, test_data = util.split_data(randomized_data, training_data_ratio)

    # 4. Standardize Training Data (except for class labels)
    print("Standardizing Training Data")
    training_features, training_data_target = util.split_features_target(training_data)
    std_training_features, mean, std = util.standardize_data(training_features)

    # 5. Divides the training data into two groups: Spam samples, Non-Spam samples.
    target_groups = training_data_target.groupby(training_data_target)

    total_training_size = float(len(training_data))

    print("Computing probability of priors")
    data_class_probability = {class_name: len(target_group) / total_training_size
                              for (class_name, target_group) in target_groups}

    # 6. Creates Normal models for each feature for each class.
    print("Creating normal models for each feature, for each class")
    models = {}
    for class_name, target_group in target_groups:
        models[class_name] = {}
        # Rows of the standardized features that belong to this class
        class_features = std_training_features.loc[target_group.index]
        for feature_name in training_features.columns:
            dataset = class_features[feature_name]
            models[class_name][feature_name] = {
                "mean": dataset.mean(),
                "standard_deviation": dataset.std(),
            }

    # 7. Classify each testing sample using these models and choosing the class label based
    #    on which class probability is higher.
    print("Evaluating models for each test data point")
    test_features, test_targets = util.split_features_target(test_data)
    # The test features are scaled with the training mean/std.
    std_test_features, _, _ = util.standardize_data(test_features, mean, std)

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    for i in range(len(std_test_features)):
        probability_per_class = compute_posterior(models, data_class_probability, std_test_features.iloc[i])

        # Select the class label of the class with highest probability
        assigned_class = max(probability_per_class.items(), key=operator.itemgetter(1))[0]
        expected_class = test_targets.iloc[i]

        # Tally up each of our counters for performance measurements
        if expected_class == spam_class_name:
            if assigned_class == spam_class_name:
                true_positives += 1
            else:  # assigned_class == not_spam_class_name
                false_negatives += 1
        else:  # expected_class == not_spam_class_name
            if assigned_class == not_spam_class_name:
                true_negatives += 1
            else:  # assigned_class == spam_class_name
                false_positives += 1

    # 8. Computes the statistics using the testing data results
    metrics = BinaryClassifierMetric(true_positives, false_positives, true_negatives, false_negatives)

    return metrics
Exemplo n.º 35
0
mt5 = {'IA_2':'L_4', 'IA_4':'L_3'}

# Response propensity for each (user template, message template) pair;
# the fourth pair is given a much higher propensity than the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows per template pair plus 2000 fully random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(2000))

log = su.BasicLogger()
recorder = su.ScenarioRecorder()

# Split data into train, calibration, and test.
train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
# Each row is a (user, message, response) triple; keep only the user.
# List comprehensions replace the tuple-parameter lambdas, which do not
# exist in Python 3, and yield real lists on both Python versions.
calibration_users = [u for u, _m, _r in calibrate]
test_users = [u for u, _m, _r in test]

controls = su.build_std_control_solvers(calibrate, b, 100, 15)
treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)

solvers = controls + treatments

su.execute_trial(train, test_users, b, solvers, recorder, logger = log)
Exemplo n.º 36
0
# Response propensity for each (user template, message template) pair;
# the fourth pair is given a much higher propensity than the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows per template pair plus 1500 fully random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(1500))


train, test = ut.split_data(rows, 0.95, 0.05)
# Each row is a (user, message, response) triple; keep only the user.
# A list (not a lazy map object) is required because test_users is
# consumed more than once below, and tuple-parameter lambdas do not exist
# in Python 3.
test_users = [u for u, _m, _r in test]

op = KNNOptimizer()
op.set_data_rows(train)
op.set_distance_f(hamming)

# Control solvers: always the first of the 15 best messages, a random
# message out of 100, and a random one out of the 15 best.
best_msgs = su.n_best_messages(test_users, b, 100, 15)
msgs = su.n_best_messages(test_users, b, 100, 100)
ctrl_1 = lambda u: best_msgs[0]
ctrl_2 = lambda u: rd.sample(msgs, 1)[0]
ctrl_3 = lambda u: rd.sample(best_msgs, 1)[0]

# KNN solvers, named after the neighbour count k and the selector used.
knn_k3_f1 = lambda u: op.optimize(u, 3, op.f1)
knn_k6_f1 = lambda u: op.optimize(u, 6, op.f1)
# NOTE(review): named "k9" but runs with k=500 -- confirm which is intended.
knn_k9_f1 = lambda u: op.optimize(u, 500, op.f1)
knn_k3_f2 = lambda u: op.optimize(u, 3, op.f2)