Example #1
    def set_data(self, train, test, batch_size=None, maxlen=200,
                 weight_dir=None, data_name=None, num_classes=6):
        self.train_files = train
        self.test_files = test
        if batch_size:
            self.batch_size = batch_size

        if data_name == "vac_data":
            (self.train_data, self.train_labels, self.test_data,
             self.test_labels, self.test_sentences, self.labels,
             self.embedding_matrix, self.tokenizer) = prepare_vaccine_data(
                train=self.train_files, test=self.test_files)
        else:
            (self.train_data, self.train_labels, self.test_data,
             self.test_sentences, self.labels,
             self.embedding_matrix, self.tokenizer) = prepare_data(
                train=self.train_files, test=self.test_files)

            self.data_iden = data_name

        if weight_dir:
            self.weight_dir = weight_dir

        # column names used when saving results: the text column plus one column per label
        self.colnames = ['text'] + self.labels

        self.model = self.define(weight_dir, num_classes=num_classes)
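For context, the prepare_data / prepare_vaccine_data helpers are not shown in this example. A rough, hedged sketch of what such a text-preprocessing helper typically does with Keras (all names, defaults, and return values below are assumptions, not the project's actual code):

# Hedged sketch: standard Keras tokenization and padding, not the real prepare_data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def toy_prepare_data(train_texts, test_texts, maxlen=200, num_words=20000):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_texts)
    train_data = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
    test_data = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)
    return train_data, test_data, tokenizer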
Example #2
def main():
    # Run condenser
    run_condenser()

    # Perform preprocessing
    X_train, Y_train, X_test, Y_test, unlab_reviews = prepare_data()

    # Initialize Vectorize_Reviews object and get doc2vec vector representations of reviews
    vectorizer = Vectorize_Reviews(X_train, Y_train, X_test, Y_test,
                                   unlab_reviews)
    train_vecs, Y_train, test_vecs, Y_test = vectorizer.train_doc2vec()

    X_train, X_test, y_train, y_test = train_test_split(train_vecs,
                                                        Y_train,
                                                        test_size=0.5,
                                                        random_state=5)

    k_scores = []

    for k in [5, 7]:
        print(f"Running k={k}")
        knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        scores = cross_val_score(knn,
                                 X_train,
                                 y_train,
                                 cv=10,
                                 scoring='accuracy')
        k_scores.append([scores.mean(), k])

    scores = sorted(k_scores, key=itemgetter(0), reverse=True)
    with open("results.txt", "w") as results_file:
        for mean_acc, k in scores:
            results_file.write(f"accuracy: {mean_acc} - k: {k}\n")
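A self-contained, hedged variant of the same k-selection loop (KNN plus 10-fold cross_val_score) on synthetic vectors, so the pattern can be run without the condenser/doc2vec pipeline; the data here is made up:

# Standalone illustration of the cross-validation loop above (synthetic data)
from operator import itemgetter
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, n_features=50, random_state=5)
k_scores = []
for k in [5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append([scores.mean(), k])
best_acc, best_k = sorted(k_scores, key=itemgetter(0), reverse=True)[0]
print(f"best accuracy: {best_acc:.3f} at k={best_k}")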
Example #3
    def main(self):
        # run 100 passes over the training data
        for i in range(100):
            data = dp.prepare_data(self.infile)
            n = data.shape[0]
            for row in data:
                pred = self.predict(row[:3])
                true = row[3]
                self.update_w(pred, true, row, n)
        self.write_output()
Example #4
    def get_r(self):
        data = dp.prepare_data(self.infile)
        error = 0
        n = 0
        for row in data:
            pred = self.predict(row[:3])
            true = row[3]
            error += (pred - true) ** 2
            n += 1
        # half of the mean squared error over all rows
        return 0.5 * error / n
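Examples #3 and #4 assume rows of the form [x1, x2, x3, target] with a linear predict method; a small hedged NumPy sketch of the same half-MSE computation in vectorized form (the array layout is taken from the snippets, everything else is an assumption):

import numpy as np

def half_mse(data, weights):
    # data: rows of [x1, x2, x3, target]; linear prediction on the first three columns
    preds = data[:, :3] @ weights
    return 0.5 * np.mean((preds - data[:, 3]) ** 2)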
Example #5
def train_epoch(data_loaders, models, periodic_interval_batches, vocab):
    num_models = len(models)

    # compute number of batches for an epoch
    sup_batches = len(data_loaders["sup_train"])
    unsup_batches = len(data_loaders["unsup_train"])
    batches_per_epoch = sup_batches + unsup_batches

    # initialize variables to store loss values
    epoch_losses_sup = [0.] * num_models
    epoch_losses_unsup = [0.] * num_models

    # setup the iterators for training data loaders
    sup_iter = iter(data_loaders["sup_train"])
    unsup_iter = iter(data_loaders["unsup_train"])

    # count the number of supervised batches seen in this epoch
    ctr_sup = 0
    for i in tqdm(range(batches_per_epoch)):

        # whether this batch is supervised or not
        is_supervised = (i % periodic_interval_batches == 1) and ctr_sup < sup_batches

        # extract the corresponding batch
        if is_supervised:
            (subs, objs, targets, relations, predicates) = next(sup_iter)
            ctr_sup += 1
        else:
            (subs, objs, targets, relations, predicates) = next(unsup_iter)

        # convert the raw batch into torch tensors / index tensors via prepare_data
        subs, objs, targets, relations, predicates = prepare_data(subs, objs, targets, relations, predicates, vocab)

        # run the inference for each loss with supervised or un-supervised
        # data as arguments
        for model_id in range(num_models):
            if is_supervised:
                new_loss = models[model_id].step(subs, objs, targets, relations, predicates)
                epoch_losses_sup[model_id] += new_loss
            else:
                new_loss = models[model_id].step(subs, objs, targets)
                epoch_losses_unsup[model_id] += new_loss

    # return the values of all losses
    return epoch_losses_sup, epoch_losses_unsup
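A toy, hedged sketch of just the batch-interleaving schedule used above (roughly one supervised batch every periodic_interval_batches iterations, capped at the number of supervised batches); no models or data involved:

def schedule(sup_batches, unsup_batches, periodic_interval_batches):
    ctr_sup = 0
    plan = []
    for i in range(sup_batches + unsup_batches):
        is_supervised = (i % periodic_interval_batches == 1) and ctr_sup < sup_batches
        plan.append('sup' if is_supervised else 'unsup')
        ctr_sup += is_supervised
    return plan

# e.g. schedule(3, 9, 4) ->
# ['unsup', 'sup', 'unsup', 'unsup', 'unsup', 'sup', 'unsup', 'unsup', 'unsup', 'sup', 'unsup', 'unsup']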
Example #6
def main():
    # Run condenser
    run_condenser()

    # Perform preprocessing
    X_train, Y_train, X_test, Y_test, unlab_reviews = prepare_data()

    # Initialize Vectorize_Reviews object and get doc2vec vector representations of reviews
    vectorizer = Vectorize_Reviews(X_train, Y_train, X_test, Y_test,
                                   unlab_reviews)
    train_vecs, Y_train, test_vecs, Y_test = vectorizer.train_doc2vec()

    # Initialize Classify_Reviews object and train logistic regression classifier on doc2vec features
    classifier = Classify_Reviews(train_vecs, Y_train, test_vecs, Y_test)
    classifier.train_model()

    # Validate classifier
    classifier.validate_model()
Example #7
def evaluate(generator, eval_data_loader, vocab, sample_size, batch_size):
    predict_fn = Predictive(generator.model, guide=generator.guide, num_samples=sample_size, return_sites=('v', 'r', 'z'))

    # integer division so the value can drive range() below
    num_batches = len(eval_data_loader) // batch_size
    eval_iter = iter(eval_data_loader)
    predict_df = {
        'subject': [],
        'object': [],
        'target': [],
        'true predicate': [],
        'true relation': [],
        'predicted predicates': [],
        'predicted relations': []
    }
    for i in range(num_batches):
        subs, objs, targets, relations, predicates = next(eval_iter)
        for j in range(batch_size):
            predict_df['subject'].append(subs[j])
            predict_df['object'].append(objs[j])
            predict_df['target'].append(targets[j])
            predict_df['true predicate'].append(predicates[j])
            predict_df['true relation'].append(relations[j])
        subs, objs, targets, relations, predicates = prepare_data(subs, objs, targets, relations, predicates, vocab)
        samples = predict_fn(subs, objs, targets, relations, predicates)
        batch_pred_samples = samples['v'].view(batch_size, -1)
        batch_rel_samples = samples['r'].view(batch_size, -1)
        assert batch_pred_samples.shape[-1] == sample_size and batch_rel_samples.shape[-1] == sample_size
        for j in range(batch_size):
            # there are 'sample_size' sampled predicates and relations from the guide posterior
            sampled_predicates_idx = batch_pred_samples[j]
            sampled_rel_idx = batch_rel_samples[j]
            sampled_predicates = [vocab.i2w[pred_idx_tensor.item()] for pred_idx_tensor in sampled_predicates_idx]
            sampled_relations = [vocab.i2w[rel_idx_tensor.item()] for rel_idx_tensor in sampled_rel_idx]

            predict_df['predicted predicates'].append(sampled_predicates)
            predict_df['predicted relations'].append(sampled_relations)

    return predict_df
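Because every key in the returned dictionary holds a list of equal length, it converts directly into a pandas DataFrame; a hedged usage sketch with toy values (pandas is not used in the example itself):

import pandas as pd

toy = {'subject': ['s1', 's2'], 'object': ['o1', 'o2'], 'target': ['t1', 't2'],
       'true predicate': ['p1', 'p2'], 'true relation': ['r1', 'r2'],
       'predicted predicates': [['p1', 'p3'], ['p2', 'p2']],
       'predicted relations': [['r1', 'r1'], ['r2', 'r3']]}
pd.DataFrame(toy).to_csv("predictions.csv", index=False)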
Example #8
# import pandas as pd
# from functools import reduce
import numpy as np

import util
import data_prep

df = util.read_data("binary.csv", "../data/")

features, targets, features_test, targets_test = data_prep.prepare_data(df)

n_records, n_features = features.shape
# Use the same seed to make debugging easier
np.random.seed(42)
last_loss = None
weights = np.random.normal(scale=1 / n_features ** .5, size=n_features)

# Neural Network hyper-parameters
epochs = 10000
learn_rate = 0.1

for e in range(epochs):
    del_w = np.zeros(weights.shape)
    for x, y in zip(features.values, targets):
        output = util.sigmoid(np.dot(x, weights))

        error = y - output

        error_term = error * output * (1 - output)

        # The gradient descent step: accumulate the error term times the inputs
        del_w += error_term * x

    # Average the accumulated step over all records and update the weights
    weights += learn_rate * del_w / n_records
Example #9
import tensorflow as tf
from data_prep import prepare_data

(train_images, train_labels, test_images, test_labels) = prepare_data()

model = tf.keras.models.load_model('./model/')

test_loss, test_acc = model.evaluate(test_images, test_labels)
print('\nTest accuracy: ', test_acc)
print('\nTest loss: ', test_loss)
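The './model/' directory loaded above would normally come from a matching training script that is not shown here; a hedged sketch of that counterpart save step (the architecture, class count, and loss below are assumptions):

import tensorflow as tf
from data_prep import prepare_data

(train_images, train_labels, test_images, test_labels) = prepare_data()

# Assumed toy architecture; only model.save('./model/') is the point of this sketch
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=train_images.shape[1:]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_images, train_labels, epochs=5)
model.save('./model/')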
Example #10
# Modelling imports from Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
from cm_plot import plot_cm

import numpy as np

from data_prep import prepare_data

# Visualization
import matplotlib.style as style
import matplotlib.pyplot as plt
style.use('seaborn')

# ------------------------------------------------------------------------
df_final = prepare_data()

# Train Test Split and set targets
train, test = train_test_split(df_final, test_size=.2, random_state=10)

feats = [c for c in train.columns if c not in ['Churn']]
target = ['Churn']

train_x = train[feats]
train_y = np.ravel(train[target])
test_x = test[feats]
test_y = np.ravel(test[target])
# ------------------------------------------------------------------------
# Train model and evaluate
clf = LogisticRegression(solver='liblinear')
param_grid = {'C': np.logspace(-4, 4, 100, base=10)}
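The snippet ends after defining clf and param_grid; a hedged sketch of how the grid search would usually continue from those names (the fold count and scoring metric are assumptions):

# Hedged continuation: fit the grid search and evaluate on the held-out split
grid = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid.fit(train_x, train_y)

preds = grid.best_estimator_.predict(test_x)
print(classification_report(test_y, preds))
print("ROC AUC:", roc_auc_score(test_y, grid.best_estimator_.predict_proba(test_x)[:, 1]))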
Example #11
        row = self.df.loc[idx]
        img_fname, img_label = row['image'], row['label']
        img = Image.open(img_fname)
        if self.transform:
            img = self.transform(img)
        if img.shape[0] == 1:
            img = img.repeat(3, 1, 1)

# Defining some transforms


train_transform = T.Compose([T.Resize((256, 256)),
                             T.RandomAffine(30),
                             T.ColorJitter(),
                             T.ToTensor()])

val_transform = T.Compose([T.Resize((256, 256)),  # pass a (h, w) tuple, not two positional args
                           T.ToTensor()])

train_df, val_df = prepare_data()

train_dataset = PneumoniaDataset(train_df, transform=train_transform)
val_dataset = PneumoniaDataset(val_df, transform=val_transform)


# PyTorch Data Loaders
train_dl = DataLoader(train_dataset, config.BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
val_dl = DataLoader(val_dataset, config.BATCH_SIZE*2, shuffle=True, num_workers=4, pin_memory=True)
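The __getitem__ fragment at the top of this example stops before its return statement; a minimal hedged sketch of a complete map-style Dataset of that shape (column names follow the fragment, the class below is illustrative rather than the project's PneumoniaDataset):

from PIL import Image
from torch.utils.data import Dataset

class PneumoniaDatasetSketch(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        img_fname, img_label = row['image'], row['label']
        img = Image.open(img_fname)
        if self.transform:
            img = self.transform(img)
        if img.shape[0] == 1:  # grayscale tensor -> repeat to 3 channels
            img = img.repeat(3, 1, 1)
        return img, img_label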


Example #12
The architecture of the model is available in the 'compatibility_siamese_model_architecture.png' file.
"""
from data import polyvore_dataset, DataGeneratorSiamese
from utils import Config
import data_prep
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Lambda, Input
from tensorflow.keras import models, optimizers
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
import numpy as np

if __name__ == '__main__':

    train_pairwise_file, valid_pairwise_file = data_prep.prepare_data()
    dataset = polyvore_dataset()
    transforms = dataset.get_data_transforms()
    X_train, y_train = dataset.create_compatibility_dataset2(
        train_pairwise_file)
    X_test, y_test = dataset.create_compatibility_dataset2(valid_pairwise_file)

    if Config['debug']:
        train_set = (X_train[:100], y_train[:100], transforms['train'])
        test_set = (X_test[:100], y_test[:100], transforms['test'])
        dataset_size = {'train': 100, 'test': 100}
    else:
        train_set = (X_train, y_train, transforms['train'])
        test_set = (X_test, y_test, transforms['test'])
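The actual architecture lives in the referenced PNG and is not reproduced here; as a hedged illustration only, siamese compatibility models of this kind often combine two embeddings with a distance Lambda layer along these lines (the embedding size and the head are assumptions):

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

emb_a = Input(shape=(128,))
emb_b = Input(shape=(128,))
distance = Lambda(euclidean_distance)([emb_a, emb_b])
score = Dense(1, activation='sigmoid')(distance)
head = Model(inputs=[emb_a, emb_b], outputs=score)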
Example #13
if __name__ == "__main__":
    
  """ Read in parameters """
  with open('main_parameters.json') as jsonfile:
    params = json.load(jsonfile)

  print(f"Parameters used in training: {params}")
  
  
  """ Get datasets and weights for optimization """
  
  train_data, test_data, weights = prepare_data(params['paths'],
                                                 params['number_of_subjects'],
                                                 params['subsample'],
                                                 params['data_format'],
                                                 params['partitions'])
  from collections import Counter
  print(Counter(test_data.labels))
  

  """ Train """
  fc_dim = train_data[0][0].shape[0]
  #model = Net(fc_dim)
  model = LogisticRegression(fc_dim)
  
  print(f"Optimization parameters: {params['optimization']}")
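Net and LogisticRegression are defined elsewhere in that project; a minimal hedged sketch of what a one-layer LogisticRegression(fc_dim) module usually looks like in PyTorch (illustrative only, not the project's class):

import torch.nn as nn

class LogisticRegressionSketch(nn.Module):
    def __init__(self, fc_dim, num_classes=2):
        super().__init__()
        self.fc = nn.Linear(fc_dim, num_classes)

    def forward(self, x):
        return self.fc(x)  # raw logits; pair with nn.CrossEntropyLoss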
  
  
Example #14
def main():
    # Data preparation
    data = pickle.load(open(config.data_path + config.train_data_fname, 'rb'))
    # Unpack the data
    u_true, r_true, n_tot, U_full, A_full, R_full, AT_full = data_prep.prepare_data(
        data)
    clusters = list(n_tot.keys())
    c = config.c

    # Train indices, validation indices, test indices
    train_indices = pickle.load(
        open(config.data_path + config.train_indices_fname, 'rb'))
    valid_indices = pickle.load(
        open(config.data_path + config.valid_indices_fname, 'rb'))
    test_indices = pickle.load(
        open(config.data_path + config.test_indices_fname, 'rb'))

    indexing_list = [0]

    for i, cluster_id in enumerate(clusters[:-1]):
        num_samples = len(train_indices[cluster_id]) + len(
            valid_indices[cluster_id]) + len(test_indices[cluster_id])
        indexing_list.append(num_samples)

    # Get data for the given cluster
    new_data = data_prep.prep_data(u_true[c], r_true[c], R_full[c], U_full[c],
                                   A_full[c], AT_full[c])

    if config.sr_type == 'user':
        train_fname = config.train_data_resampled_fname + str(
            config.c) + '.pkl'
    else:
        train_fname = config.train_data_resampled_fname + 'agent' + '.pkl'
    train_data = pickle.load(open(config.data_path + train_fname, 'rb'))

    tr_ind = train_indices[c]

    if config.testing:
        if config.neural:
            eval_data_type = config.data_types[2]

            je, trainer, _, _ = train_only(config.test_hidden_dim,
                                           config.test_leaky_slope,
                                           config.test_thresh,
                                           config.test_epochs, train_data)

            if c == 'all':
                for i, cluster_id in enumerate(clusters[:-1]):
                    val_ind = [
                        ind + indexing_list[i]
                        for ind in valid_indices[cluster_id]
                    ]
                    te_ind = [
                        ind + indexing_list[i]
                        for ind in test_indices[cluster_id]
                    ]

                    eval_data = get_eval_data(tr_ind, val_ind, te_ind,
                                              new_data, eval_data_type)
                    loss = eval_only(eval_data, je, trainer, save_model=False)
                    print("Loss for cluster-%d is %.3f" % (cluster_id, loss))

            val_ind = valid_indices[c]
            te_ind = test_indices[c]
            eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                      eval_data_type)
            loss = eval_only(eval_data, je, trainer, save_model=True)
            print("Overall loss is %.3f" % loss)

        else:
            eval_data_type = config.data_types[2]

            X_train, Y_train = prepare_training_data_non_nn(train_data)
            print("Train data shape: ", X_train.shape)
            print("Train labels shape: ", Y_train.shape)
            clf = train_only_non_nn(X_train, Y_train)

            if c == 'all':
                for i, cluster_id in enumerate(clusters[:-1]):
                    val_ind = valid_indices[cluster_id]
                    te_ind = [
                        ind + indexing_list[i]
                        for ind in test_indices[cluster_id]
                    ]
                    eval_data = get_eval_data(tr_ind, val_ind, te_ind,
                                              new_data, eval_data_type)
                    X_eval, Y_eval = prepare_training_data_non_nn(eval_data)
                    print("Eval data shape: ", X_eval.shape)
                    print("Eval labels shape: ", Y_eval.shape)
                    eval_only_non_nn(clf, X_eval, Y_eval, print_info=True)

            val_ind = valid_indices[c]
            te_ind = test_indices[c]
            eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                      eval_data_type)
            X_eval, Y_eval = prepare_training_data_non_nn(eval_data)
            print("Eval data shape: ", X_eval.shape)
            print("Eval labels shape: ", Y_eval.shape)
            eval_only_non_nn(clf, X_eval, Y_eval, print_info=True)

    else:
        val_ind = valid_indices[c]
        te_ind = test_indices[c]
        eval_data_type = config.data_types[1]
        eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                  eval_data_type)

        if not config.neural:
            train_and_evaluate_non_nn(train_data, eval_data)

        else:
            if config.model_type == 're':
                loss = float("inf")

                thresh = 0.4
                best_leaky_slope = None
                best_hidden_dim = None
                best_epoch = None

                for leaky_iter in range(config.num_leaky_iter):
                    leaky_slope = config.leaky_min + leaky_iter * config.leaky_step
                    for hidden_dim in config.hidden_sizes:
                        print(
                            "###########################################################"
                        )
                        print("Leaky slope: %.2f, Hidden dim: %d" %
                              (leaky_slope, hidden_dim))
                        print(
                            "###########################################################"
                        )

                        seed = 0
                        torch.manual_seed(seed)
                        np.random.seed(seed)
                        random.seed(seed)

                        _, _, temp_loss, temp_epoch = train_only(
                            hidden_dim, leaky_slope, thresh, config.n_epochs,
                            train_data, eval_data)

                        loss = min(loss, temp_loss)
                        if loss == temp_loss:
                            best_leaky_slope = leaky_slope
                            best_hidden_dim = hidden_dim
                            best_epoch = temp_epoch

                print(
                    "###########################################################"
                )
                print(
                    "Best loss: %.3f, best leaky slope: %.2f, best hidden dim: %d, best epoch: %d"
                    % (loss, best_leaky_slope, best_hidden_dim, best_epoch))

            else:
                loss = float("inf")

                best_epoch = None
                best_thresh = None
                best_leaky_slope = None
                best_hidden_dim = None

                for thresh in config.thresh:
                    for leaky_iter in range(config.num_leaky_iter):
                        leaky_slope = config.leaky_min + leaky_iter * config.leaky_step
                        for hidden_dim in config.hidden_sizes:
                            print(
                                "###########################################################"
                            )
                            print(
                                "Threshold: %.2f, Leaky slope: %.2f, Hidden dim: %d"
                                % (thresh, leaky_slope, hidden_dim))
                            print(
                                "###########################################################"
                            )

                            seed = 0
                            torch.manual_seed(seed)
                            np.random.seed(seed)
                            random.seed(seed)

                            _, _, temp_loss, temp_epoch = train_only(
                                hidden_dim, leaky_slope, thresh,
                                config.n_epochs, train_data, eval_data)

                            loss = min(loss, temp_loss)
                            if loss == temp_loss:
                                best_leaky_slope = leaky_slope
                                best_hidden_dim = hidden_dim
                                best_epoch = temp_epoch
                                best_thresh = thresh

                print(
                    "###########################################################"
                )
                print(
                    "Best loss: %.3f, best n_epochs: %d, best thresh: %.2f, best leaky slope: %.2f, best hidden dim: %d"
                    % (loss, best_epoch, best_thresh, best_leaky_slope,
                       best_hidden_dim))
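The two sweeps above differ only in whether thresh is fixed; as a hedged refactoring sketch (not the project's code), the same search space can be walked once with itertools.product, reusing the names defined in the example:

import itertools

def sweep(thresholds, leaky_slopes, hidden_dims, n_epochs, train_data, eval_data):
    best = {'loss': float('inf')}
    for thresh, leaky_slope, hidden_dim in itertools.product(thresholds, leaky_slopes, hidden_dims):
        # fixed seeds, as in the original loops
        torch.manual_seed(0)
        np.random.seed(0)
        random.seed(0)
        _, _, loss, epoch = train_only(hidden_dim, leaky_slope, thresh,
                                       n_epochs, train_data, eval_data)
        if loss < best['loss']:
            best = {'loss': loss, 'thresh': thresh, 'leaky_slope': leaky_slope,
                    'hidden_dim': hidden_dim, 'epoch': epoch}
    return best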