def set_data(self, train, test, batch_size=None, maxlen=200,
             weight_dir=None, data_name=None, num_classes=6):
    self.train_files = train
    self.test_files = test
    if batch_size:
        self.batch_size = batch_size
    if data_name == "vac_data":
        (self.train_data, self.train_labels, self.test_data,
         self.test_labels, self.test_sentences, self.labels,
         self.embedding_matrix, self.tokenizer) = prepare_vaccine_data(
            train=self.train_files, test=self.test_files)
    else:
        (self.train_data, self.train_labels, self.test_data,
         self.test_sentences, self.labels,
         self.embedding_matrix, self.tokenizer) = prepare_data(
            train=self.train_files, test=self.test_files)
    self.data_iden = data_name
    if weight_dir:
        self.weight_dir = weight_dir  # directory to save weights in
    self.colnames = ['text'] + self.labels
    self.model = self.define(weight_dir, num_classes=num_classes)
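# A minimal usage sketch (assumption, not from the source). The enclosing
# class is not shown in this snippet, so `ToxicCommentClassifier` and the
# file paths below are hypothetical placeholders.
clf = ToxicCommentClassifier()
clf.set_data(train="data/train.csv", test="data/test.csv",
             batch_size=32, weight_dir="weights/",
             data_name="vac_data", num_classes=6)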
def main():
    # Run condenser
    run_condenser()

    # Perform preprocessing
    X_train, Y_train, X_test, Y_test, unlab_reviews = prepare_data()

    # Initialize Vectorize_Reviews object and get doc2vec vector
    # representations of reviews
    vectorizer = Vectorize_Reviews(X_train, Y_train, X_test, Y_test, unlab_reviews)
    train_vecs, Y_train, test_vecs, Y_test = vectorizer.train_doc2vec()

    # Hold out half of the training vectors for model selection
    X_train, X_test, y_train, y_test = train_test_split(
        train_vecs, Y_train, test_size=0.5, random_state=5)

    # Cross-validate a k-NN classifier for each candidate k
    k_scores = []
    for k in [5, 7]:
        print(f"Running k={k}")
        knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
        k_scores.append([scores.mean(), k])

    # Write the scores to disk, best first
    k_scores = sorted(k_scores, key=itemgetter(0), reverse=True)
    with open("results.txt", "w") as f:
        for mean_score, k in k_scores:
            f.write(f"accuracy: {mean_score} - k: {k}\n")
def main(self):
    # Run 100 passes of online weight updates over the data
    for i in range(100):
        data = dp.prepare_data(self.infile)
        n = data.shape[0]
        for row in data:
            pred = self.predict(row[:3])  # first three columns are features
            true = row[3]                 # fourth column is the target
            self.update_w(pred, true, row, n)
    self.write_output()
def get_r(self):
    # Mean squared error over all rows, scaled by 1/2
    data = dp.prepare_data(self.infile)
    error = 0.0
    n = 0
    for row in data:
        pred = self.predict(row[:3])
        true = row[3]
        error += (pred - true) ** 2
        n += 1
    return 0.5 * error / n
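# A minimal sketch (assumption, not from the source) of the `predict` and
# `update_w` methods that main() and get_r() above rely on, written as one
# consistent implementation: a plain linear model trained by gradient
# descent on the squared error.
import numpy as np

class LinearModelSketch:
    def __init__(self, n_features=3, lr=0.01):
        self.w = np.zeros(n_features)
        self.lr = lr

    def predict(self, x):
        # Dot product of the weight vector with the feature columns
        return float(np.dot(self.w, x))

    def update_w(self, pred, true, row, n):
        # Gradient step on 0.5 * (pred - true)**2, scaled by 1/n
        self.w -= self.lr * (pred - true) * row[:3] / n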
def train_epoch(data_loaders, models, periodic_interval_batches, vocab):
    num_models = len(models)

    # compute number of batches for an epoch
    sup_batches = len(data_loaders["sup_train"])
    unsup_batches = len(data_loaders["unsup_train"])
    batches_per_epoch = sup_batches + unsup_batches

    # initialize variables to store loss values
    epoch_losses_sup = [0.] * num_models
    epoch_losses_unsup = [0.] * num_models

    # set up the iterators over the training data loaders
    sup_iter = iter(data_loaders["sup_train"])
    unsup_iter = iter(data_loaders["unsup_train"])

    # count the number of supervised batches seen in this epoch
    ctr_sup = 0
    for i in tqdm(range(batches_per_epoch)):
        # whether this batch is supervised or not
        is_supervised = (i % periodic_interval_batches == 1) and ctr_sup < sup_batches

        # extract the corresponding batch
        if is_supervised:
            (subs, objs, targets, relations, predicates) = next(sup_iter)
            ctr_sup += 1
        else:
            (subs, objs, targets, relations, predicates) = next(unsup_iter)

        # convert the batch into torch tensors
        subs, objs, targets, relations, predicates = prepare_data(
            subs, objs, targets, relations, predicates, vocab)

        # run the inference for each loss, with supervised or unsupervised
        # data as arguments
        for model_id in range(num_models):
            if is_supervised:
                new_loss = models[model_id].step(subs, objs, targets, relations, predicates)
                epoch_losses_sup[model_id] += new_loss
            else:
                new_loss = models[model_id].step(subs, objs, targets)
                epoch_losses_unsup[model_id] += new_loss

    # return the values of all losses
    return epoch_losses_sup, epoch_losses_unsup
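# A minimal usage sketch (assumption, not from the source): driving
# train_epoch over several epochs and reporting mean per-batch losses.
# `data_loaders`, `models`, and `vocab` are assumed to be built elsewhere,
# and `num_epochs` and the interval of 4 are hypothetical settings.
num_epochs = 10
for epoch in range(num_epochs):
    losses_sup, losses_unsup = train_epoch(
        data_loaders, models, periodic_interval_batches=4, vocab=vocab)
    avg_sup = [l / len(data_loaders["sup_train"]) for l in losses_sup]
    avg_unsup = [l / len(data_loaders["unsup_train"]) for l in losses_unsup]
    print(f"epoch {epoch}: sup {avg_sup} unsup {avg_unsup}")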
def main():
    # Run condenser
    run_condenser()

    # Perform preprocessing
    X_train, Y_train, X_test, Y_test, unlab_reviews = prepare_data()

    # Initialize Vectorize_Reviews object and get doc2vec vector
    # representations of reviews
    vectorizer = Vectorize_Reviews(X_train, Y_train, X_test, Y_test, unlab_reviews)
    train_vecs, Y_train, test_vecs, Y_test = vectorizer.train_doc2vec()

    # Initialize Classify_Reviews object and train a logistic regression
    # classifier on the doc2vec features
    classifier = Classify_Reviews(train_vecs, Y_train, test_vecs, Y_test)
    classifier.train_model()

    # Validate classifier
    classifier.validate_model()
def evaluate(generator, eval_data_loader, vocab, sample_size, batch_size):
    predict_fn = Predictive(generator.model, generator.guide,
                            num_samples=sample_size,
                            return_sites=('v', 'r', 'z'))
    num_batches = len(eval_data_loader) // batch_size
    eval_iter = iter(eval_data_loader)
    predict_df = {
        'subject': [],
        'object': [],
        'target': [],
        'true predicate': [],
        'true relation': [],
        'predicted predicates': [],
        'predicted relations': []
    }
    for i in range(num_batches):
        subs, objs, targets, relations, predicates = next(eval_iter)
        for j in range(batch_size):
            predict_df['subject'].append(subs[j])
            predict_df['object'].append(objs[j])
            predict_df['target'].append(targets[j])
            predict_df['true predicate'].append(predicates[j])
            predict_df['true relation'].append(relations[j])
        subs, objs, targets, relations, predicates = prepare_data(
            subs, objs, targets, relations, predicates, vocab)
        # run the posterior predictive once and read both sites from it
        samples = predict_fn(subs, objs, targets, relations, predicates)
        batch_pred_samples = samples['v'].view(batch_size, -1)
        batch_rel_samples = samples['r'].view(batch_size, -1)
        assert batch_pred_samples.shape[-1] == sample_size and \
            batch_rel_samples.shape[-1] == sample_size
        for j in range(batch_size):
            # there are 'sample_size' predicates and relations sampled from
            # the guide posterior
            sampled_predicates_idx = batch_pred_samples[j]
            sampled_rel_idx = batch_rel_samples[j]
            sampled_predicates = [vocab.i2w[pred_idx_tensor.item()]
                                  for pred_idx_tensor in sampled_predicates_idx]
            sampled_relations = [vocab.i2w[rel_idx_tensor.item()]
                                 for rel_idx_tensor in sampled_rel_idx]
            predict_df['predicted predicates'].append(sampled_predicates)
            predict_df['predicted relations'].append(sampled_relations)
    return predict_df
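# A minimal follow-up sketch (assumption, not from the source): turning the
# returned dict into a pandas DataFrame and scoring predicate accuracy by a
# majority vote over the posterior samples. `score_predictions` is a
# hypothetical helper name.
import pandas as pd
from collections import Counter

def score_predictions(predict_df):
    df = pd.DataFrame(predict_df)
    # majority-vote predicate per example across the sampled predicates
    df['voted predicate'] = df['predicted predicates'].apply(
        lambda samples: Counter(samples).most_common(1)[0][0])
    accuracy = (df['voted predicate'] == df['true predicate']).mean()
    return df, accuracy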
import numpy as np

import util
import data_prep

df = util.read_data("binary.csv", "../data/")
features, targets, features_test, targets_test = data_prep.prepare_data(df)
n_records, n_features = features.shape

# Use the same seed to make debugging easier
np.random.seed(42)

last_loss = None
weights = np.random.normal(scale=1 / n_features ** .5, size=n_features)

# Neural network hyper-parameters
epochs = 10000
learn_rate = 0.1

for e in range(epochs):
    del_w = np.zeros(weights.shape)
    for x, y in zip(features.values, targets):
        output = util.sigmoid(np.dot(x, weights))
        error = y - output
        error_term = error * output * (1 - output)
        # The gradient descent step: the error term times the inputs
        del_w += error_term * x
    # Update the weights with the mean gradient over the records
    weights += learn_rate * del_w / n_records
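# For completeness, a sketch of the helper assumed above. util.sigmoid is
# presumably the logistic function; this definition is an assumption, not
# copied from util.
def sigmoid(x):
    # Logistic function, squashing activations into (0, 1)
    return 1 / (1 + np.exp(-x))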
import tensorflow as tf

from data_prep import prepare_data

(train_images, train_labels, test_images, test_labels) = prepare_data()

model = tf.keras.models.load_model('./model/')

test_loss, test_acc = model.evaluate(test_images, test_labels)
print('\nTest accuracy: ', test_acc)
print('\nTest loss: ', test_loss)
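# A small follow-up sketch (not in the source): inspect the model's
# predictions for a handful of test images. This assumes the model ends in
# a softmax layer, so argmax over the last axis recovers the class index.
import numpy as np

probs = model.predict(test_images[:5])
print('Predicted classes:', np.argmax(probs, axis=-1))
print('True labels:      ', test_labels[:5])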
# Modelling imports from scikit-learn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

from cm_plot import plot_cm
from data_prep import prepare_data

# Visualization
import matplotlib.style as style
import matplotlib.pyplot as plt
style.use('seaborn')

# ------------------------------------------------------------------------
df_final = prepare_data()

# Train/test split and set targets
train, test = train_test_split(df_final, test_size=.2, random_state=10)

feats = [c for c in train.columns if c not in ['Churn']]
target = ['Churn']

train_x = train[feats]
train_y = np.ravel(train[target])
test_x = test[feats]
test_y = np.ravel(test[target])

# ------------------------------------------------------------------------
# Train model and evaluate
clf = LogisticRegression(solver='liblinear')
param_grid = {'C': np.logspace(-4, 4, 100, base=10)}
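# The snippet stops before the search itself; a minimal sketch of how the
# grid search might proceed, consistent with the imports above (the cv and
# scoring settings are assumptions):
grid = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc')
grid.fit(train_x, train_y)
print('Best C:', grid.best_params_['C'])

preds = grid.predict(test_x)
print(classification_report(test_y, preds))
print('ROC AUC:', roc_auc_score(test_y, grid.predict_proba(test_x)[:, 1]))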
# Body of __getitem__ for the dataset class (presumably PneumoniaDataset,
# used below); the surrounding class definition is not shown in this snippet
row = self.df.loc[idx]
img_fname, img_label = row['image'], row['label']
img = Image.open(img_fname)
if self.transform:
    img = self.transform(img)
# grayscale images come back with a single channel; repeat it to get RGB
if img.shape[0] == 1:
    img = img.repeat(3, 1, 1)
return img, img_label

# Defining some transforms; note that Resize takes the target size as a
# single argument, so (256, 256) must be passed as a tuple
train_transform = T.Compose([T.Resize((256, 256)),
                             T.RandomAffine(30),
                             T.ColorJitter(),
                             T.ToTensor()])
val_transform = T.Compose([T.Resize((256, 256)),
                           T.ToTensor()])

train_df, val_df = prepare_data()
train_dataset = PneumoniaDataset(train_df, transform=train_transform)
val_dataset = PneumoniaDataset(val_df, transform=val_transform)

# PyTorch data loaders
train_dl = DataLoader(train_dataset, config.BATCH_SIZE, shuffle=True,
                      num_workers=4, pin_memory=True)
val_dl = DataLoader(val_dataset, config.BATCH_SIZE * 2, shuffle=True,
                    num_workers=4, pin_memory=True)
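# A quick sanity-check sketch (not in the source): pull one batch from the
# training loader. Assumes __getitem__ returns (image_tensor, label), as in
# the fragment above.
images, labels = next(iter(train_dl))
print(images.shape)  # expected: (BATCH_SIZE, 3, 256, 256)
print(len(labels))   # one label per image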
The architecture of the model is available in the
'compatibility_siamese_model_architecture.png' file.
"""
from data import polyvore_dataset, DataGeneratorSiamese
from utils import Config
import data_prep

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Lambda, Input
from tensorflow.keras import models, optimizers
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
import numpy as np

if __name__ == '__main__':
    train_pairwise_file, valid_pairwise_file = data_prep.prepare_data()

    dataset = polyvore_dataset()
    transforms = dataset.get_data_transforms()
    X_train, y_train = dataset.create_compatibility_dataset2(train_pairwise_file)
    X_test, y_test = dataset.create_compatibility_dataset2(valid_pairwise_file)

    if Config['debug']:
        train_set = (X_train[:100], y_train[:100], transforms['train'])
        test_set = (X_test[:100], y_test[:100], transforms['test'])
        dataset_size = {'train': 100, 'test': 100}
    else:
        train_set = (X_train, y_train, transforms['train'])
        test_set = (X_test, y_test, transforms['test'])
if __name__ == "__main__":
    """ Read in parameters """
    with open('main_parameters.json') as jsonfile:
        params = json.load(jsonfile)
    print(f"Parameters used in training: {params}")

    """ Get datasets and weights for optimization """
    train_data, test_data, weights = prepare_data(
        params['paths'],
        params['number_of_subjects'],
        params['subsample'],
        params['data_format'],
        params['partitions']
    )

    # Inspect the label distribution of the test set
    from collections import Counter
    print(Counter(test_data.labels))

    """ Train """
    fc_dim = train_data[0][0].shape[0]
    # model = Net(fc_dim)
    model = LogisticRegression(fc_dim)
    print(f"Optimization parameters: {params['optimization']}")
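# A minimal continuation sketch (assumption, not from the source). The keys
# of params['optimization'] are unknown, so 'lr' and 'epochs' below are
# hypothetical names used only to illustrate wiring the parameters into a
# standard PyTorch training loop; it also assumes the LogisticRegression
# model is a torch module, like the commented-out Net alternative.
import torch

opt_params = params['optimization']
optimizer = torch.optim.Adam(model.parameters(), lr=opt_params.get('lr', 1e-3))
criterion = torch.nn.CrossEntropyLoss(
    weight=torch.as_tensor(weights, dtype=torch.float))
loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
for epoch in range(opt_params.get('epochs', 10)):
    for x, y in loader:
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()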
def main():
    # Data preparation
    data = pickle.load(open(config.data_path + config.train_data_fname, 'rb'))

    # Unpack the data
    u_true, r_true, n_tot, U_full, A_full, R_full, AT_full = data_prep.prepare_data(data)
    clusters = list(n_tot.keys())
    c = config.c

    # Train indices, validation indices, test indices
    train_indices = pickle.load(
        open(config.data_path + config.train_indices_fname, 'rb'))
    valid_indices = pickle.load(
        open(config.data_path + config.valid_indices_fname, 'rb'))
    test_indices = pickle.load(
        open(config.data_path + config.test_indices_fname, 'rb'))

    # Offsets used to shift per-cluster indices into the combined index space
    indexing_list = [0]
    for i, cluster_id in enumerate(clusters[:-1]):
        num_samples = (len(train_indices[cluster_id]) +
                       len(valid_indices[cluster_id]) +
                       len(test_indices[cluster_id]))
        indexing_list.append(num_samples)

    # Get data for the given cluster
    new_data = data_prep.prep_data(u_true[c], r_true[c], R_full[c], U_full[c],
                                   A_full[c], AT_full[c])

    if config.sr_type == 'user':
        train_fname = config.train_data_resampled_fname + str(config.c) + '.pkl'
    else:
        train_fname = config.train_data_resampled_fname + 'agent' + '.pkl'
    train_data = pickle.load(open(config.data_path + train_fname, 'rb'))

    tr_ind = train_indices[c]

    if config.testing:
        if config.neural:
            eval_data_type = config.data_types[2]
            je, trainer, _, _ = train_only(config.test_hidden_dim,
                                           config.test_leaky_slope,
                                           config.test_thresh,
                                           config.test_epochs, train_data)
            if c == 'all':
                for i, cluster_id in enumerate(clusters[:-1]):
                    val_ind = [ind + indexing_list[i]
                               for ind in valid_indices[cluster_id]]
                    te_ind = [ind + indexing_list[i]
                              for ind in test_indices[cluster_id]]
                    eval_data = get_eval_data(tr_ind, val_ind, te_ind,
                                              new_data, eval_data_type)
                    loss = eval_only(eval_data, je, trainer, save_model=False)
                    print("Loss for cluster-%d is %.3f" % (cluster_id, loss))
            val_ind = valid_indices[c]
            te_ind = test_indices[c]
            eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                      eval_data_type)
            loss = eval_only(eval_data, je, trainer, save_model=True)
            print("Overall loss is %.3f" % loss)
        else:
            eval_data_type = config.data_types[2]
            X_train, Y_train = prepare_training_data_non_nn(train_data)
            print("Train data shape: ", X_train.shape)
            print("Train labels shape: ", Y_train.shape)
            clf = train_only_non_nn(X_train, Y_train)
            if c == 'all':
                for i, cluster_id in enumerate(clusters[:-1]):
                    # shift validation indices into the combined index
                    # space, mirroring the neural branch above
                    val_ind = [ind + indexing_list[i]
                               for ind in valid_indices[cluster_id]]
                    te_ind = [ind + indexing_list[i]
                              for ind in test_indices[cluster_id]]
                    eval_data = get_eval_data(tr_ind, val_ind, te_ind,
                                              new_data, eval_data_type)
                    X_eval, Y_eval = prepare_training_data_non_nn(eval_data)
                    print("Eval data shape: ", X_eval.shape)
                    print("Eval labels shape: ", Y_eval.shape)
                    eval_only_non_nn(clf, X_eval, Y_eval, print_info=True)
            val_ind = valid_indices[c]
            te_ind = test_indices[c]
            eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                      eval_data_type)
            X_eval, Y_eval = prepare_training_data_non_nn(eval_data)
            print("Eval data shape: ", X_eval.shape)
            print("Eval labels shape: ", Y_eval.shape)
            eval_only_non_nn(clf, X_eval, Y_eval, print_info=True)
    else:
        val_ind = valid_indices[c]
        te_ind = test_indices[c]
        eval_data_type = config.data_types[1]
        eval_data = get_eval_data(tr_ind, val_ind, te_ind, new_data,
                                  eval_data_type)
        if not config.neural:
            train_and_evaluate_non_nn(train_data, eval_data)
        else:
            if config.model_type == 're':
                # grid search over leaky slope and hidden dimension at a
                # fixed threshold
                loss = float("inf")
                thresh = 0.4
                best_leaky_slope = None
                best_hidden_dim = None
                best_epoch = None
                for leaky_iter in range(config.num_leaky_iter):
                    leaky_slope = config.leaky_min + leaky_iter * config.leaky_step
                    for hidden_dim in config.hidden_sizes:
                        print("#" * 59)
                        print("Leaky slope: %.2f, Hidden dim: %d"
                              % (leaky_slope, hidden_dim))
                        print("#" * 59)
                        # fix all seeds so runs are comparable
                        seed = 0
                        torch.manual_seed(seed)
                        np.random.seed(seed)
                        random.seed(seed)
                        _, _, temp_loss, temp_epoch = train_only(
                            hidden_dim, leaky_slope, thresh,
                            config.n_epochs, train_data, eval_data)
                        if temp_loss <= loss:
                            loss = temp_loss
                            best_leaky_slope = leaky_slope
                            best_hidden_dim = hidden_dim
                            best_epoch = temp_epoch
                print("#" * 59)
                print("Best loss: %.3f, best leaky slope: %.2f, "
                      "best hidden dim: %d, best epoch: %d"
                      % (loss, best_leaky_slope, best_hidden_dim, best_epoch))
            else:
                # grid search over the threshold as well
                loss = float("inf")
                best_epoch = None
                best_thresh = None
                best_leaky_slope = None
                best_hidden_dim = None
                for thresh in config.thresh:
                    for leaky_iter in range(config.num_leaky_iter):
                        leaky_slope = config.leaky_min + leaky_iter * config.leaky_step
                        for hidden_dim in config.hidden_sizes:
                            print("#" * 59)
                            print("Threshold: %.2f, Leaky slope: %.2f, Hidden dim: %d"
                                  % (thresh, leaky_slope, hidden_dim))
                            print("#" * 59)
                            # fix all seeds so runs are comparable
                            seed = 0
                            torch.manual_seed(seed)
                            np.random.seed(seed)
                            random.seed(seed)
                            _, _, temp_loss, temp_epoch = train_only(
                                hidden_dim, leaky_slope, thresh,
                                config.n_epochs, train_data, eval_data)
                            if temp_loss <= loss:
                                loss = temp_loss
                                best_leaky_slope = leaky_slope
                                best_hidden_dim = hidden_dim
                                best_epoch = temp_epoch
                                best_thresh = thresh
                print("#" * 59)
                print("Best loss: %.3f, best epoch: %d, best thresh: %.2f, "
                      "best leaky slope: %.2f, best hidden dim: %d"
                      % (loss, best_epoch, best_thresh,
                         best_leaky_slope, best_hidden_dim))