def main(args=sys.argv[1:]):
    """Read a regression dataset, optionally center/scale y, shuffle,
    split into train/test ``Dataset`` objects, and pickle them to
    ``args.out_file``.

    NOTE(review): the ``args`` parameter is dead -- parse_args() re-reads
    sys.argv directly, and the default captures sys.argv at import time.
    TODO: pass the list through (``parse_args(args)``) if the parser
    supports it.
    """
    args = parse_args()  # TODO(review): should honor the `args` parameter
    np.random.seed(args.seed)
    print(args)

    X, y = read_data(args.in_file, has_header=True)
    print(X.shape)
    print(y.shape)
    print(y.mean())

    # Work with y as a column vector.
    y = y.reshape(y.size, 1)
    if args.center_y:
        y -= np.mean(y)
    if args.scale_y:
        y /= np.std(y)  # identical to sqrt(var) with ddof=0

    # Shuffle rows (choice without replacement == a permutation; kept as-is
    # so the random stream matches earlier runs with the same seed), then
    # hold out the last test_proportion of rows as the test split.
    shuffled_idx = np.random.choice(y.size, size=y.size, replace=False)
    shuff_X = X[shuffled_idx, :]
    shuff_y = y[shuffled_idx]
    n_train = y.size - int(y.size * args.test_proportion)
    # Dataset(x, y, y_true): y doubles as its own "true" signal here.
    train_data = Dataset(shuff_X[:n_train, :], shuff_y[:n_train, :],
                         shuff_y[:n_train, :])
    test_data = Dataset(shuff_X[n_train:, :], shuff_y[n_train:, :],
                        shuff_y[n_train:, :])

    print("data_file %s" % args.out_file)
    with open(args.out_file, "wb") as f:
        pickle.dump({"train": train_data, "test": test_data}, f)
def read_input_data(args):
    """Load train/test Datasets either from a pre-pickled bundle or by
    assembling them from CSV feature/label files plus pickled index splits.

    @param args: parsed CLI namespace with data_index_file, data_file,
        data_X_file and data_y_file attributes
    @return (train_data, test_data) Dataset pair
    """
    import pandas as pd
    from data_generator import Dataset

    if args.data_index_file is None:
        # Fast path: everything was pickled together up front.
        with open(args.data_file, "rb") as f:
            bundle = pickle.load(f)
        return bundle["train"], bundle["test"]

    # Otherwise rebuild the splits from raw CSVs and pickled row indices.
    with open(args.data_index_file, "rb") as f:
        split_indices = pickle.load(f)
    train_indices = split_indices["train"]
    test_indices = split_indices["test"]
    print(train_indices)
    print(test_indices)

    # Drop the leading ID column, then subsample at most 5000 feature columns.
    feature_matrix = pd.read_csv(args.data_X_file).values[:, 1:]
    kept_cols = np.random.choice(feature_matrix.shape[1],
                                 size=min(feature_matrix.shape[1], 5000),
                                 replace=False)
    feature_matrix = feature_matrix[:, kept_cols]
    print(feature_matrix)
    response = pd.read_csv(args.data_y_file).values
    print(response)

    train_data = Dataset(feature_matrix[train_indices, :],
                         response[train_indices, :],
                         response[train_indices, :])
    test_data = Dataset(feature_matrix[test_indices, :],
                        response[test_indices, :],
                        response[test_indices, :])
    return train_data, test_data
def train():
    """Run masked-LM pre-training of the SMITH model.

    Reads CLI args (batch size, epoch count, file path), builds the
    sentence-/document-level BERT configs, and iterates the dataloader,
    optimizing the combined sentence-prediction + word-prediction loss.
    """
    tokenizer = BertWordPieceTokenizer(
        r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\vocab.txt',
        lowercase=True)
    args = parser.parse_args()
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    file_path = args.file_path
    # BUG FIX: a hard-coded debug path used to overwrite args.file_path here,
    # and the ValueError below was constructed but never raised.
    if file_path is None:
        raise ValueError("A file path to documents must be provided")

    sentence_config, document_config = set_model_config(args, tokenizer)
    dataset = Dataset(file_path, tokenizer,
                      sentence_config.max_position_embeddings,
                      document_config.max_position_embeddings,
                      mask=True)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            drop_last=True, collate_fn=collate_fun)
    model = SmithModel(sentence_config, document_config)

    # BUG FIX: `optimizer` and `scheduler` were referenced below but never
    # created (NameError on the first iteration). A constant LambdaLR keeps
    # the lr fixed; TODO(review): plug in the intended warmup/decay schedule.
    optimizer = torch.optim.AdamW(model.parameters())
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)

    for epoch in range(num_epochs):
        for iteration, (token_ids, attention_mask, token_type_ids, label_ids,
                        split_idx) in enumerate(dataloader):
            # Each element is a list of per-document sentence-block tensors;
            # concatenate into one batch for the sentence encoder.
            token_ids_stacked = torch.cat(token_ids)
            label_ids_stacked = torch.cat(label_ids)
            attention_mask = torch.cat(attention_mask)
            token_type_ids = torch.cat(token_type_ids)
            output = model(input_ids=token_ids_stacked,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           split_idx=split_idx,
                           labels=label_ids_stacked)
            loss_sp = output[0]  # sentence-prediction loss
            loss_wp = output[1]  # word (masked-LM) prediction loss
            loss = loss_sp + loss_wp
            if iteration % 10 == 0:
                print("Iteration {}: Loss: {}".format(iteration, loss))
            loss.backward()
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
def main(args):
    """Build the dataset and model described by ``args``, then run training."""
    print("+++ main")

    # Reproducibility: seed both RNGs and force deterministic cuDNN kernels.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Dataset and its train/valid/test iterators.
    dataset = Dataset(args.batch_size, args.nopad, device)
    train_iterator, valid_iterator, test_iterator = dataset.create_iterator()

    src_vocab_size = len(dataset.source_field.vocab)
    tgt_vocab_size = len(dataset.target_field.vocab)
    args.input_size = src_vocab_size
    args.output_size = tgt_vocab_size
    print(f"Source vocab size = {src_vocab_size}")
    print(f"Target vocab size = {tgt_vocab_size}")

    # Resolve the special-token ids from the target vocabulary.
    tgt_stoi = dataset.target_field.vocab.stoi
    args.sos_idx = tgt_stoi[args.sos]
    args.eos_idx = tgt_stoi[args.eos]
    args.pad_idx = tgt_stoi[args.pad]

    # Model construction and weight initialization.
    model = create_model(args.model_type, args, device).to(device)
    model.apply(create_init_function(args.init_type))
    total_params = model.count_parameters()
    print(f"Total number of parameters = {total_params}")

    # Optimizer + loss (padding positions excluded), then the training loop.
    optimizer = create_optimizer(args.optim_type, args.learning_rate,
                                 model.parameters(), args)
    criterion = nn.CrossEntropyLoss(ignore_index=args.pad_idx)
    training(model, train_iterator, valid_iterator, optimizer,
             args.optim_type, criterion, args.num_epochs, args.clip,
             args.nopad, device)

    print("--- main")
def return_data_set_generator():
    """Build a DataLoader over the 'test' partition of processed_poke_3."""
    base_path = '../data/processed_poke_3'
    data_partition_name = 'test'

    def _read_json(filename):
        # Load one JSON file from the partition directory.
        with open(os.path.join(base_path, data_partition_name, filename)) as fh:
            return json.load(fh)

    labels = _read_json('labels.json')
    ids = _read_json('ids.json')

    test_dataset = Dataset(ids, labels, partition='test', base_path=base_path)
    loader_kwargs = {'batch_size': 1, 'shuffle': True, 'num_workers': 1}
    return data.DataLoader(test_dataset, **loader_kwargs)
# NOTE(review): fragment -- sentence_config, tokenizer, sentence_block_length,
# max_sentence_blocks, file_path, batch_size, shuffle, drop_last and
# collate_fun are defined above this excerpt, outside view.

# Sentence-level encoder: small 6-layer BERT over individual sentence blocks.
sentence_config.vocab_size = tokenizer.get_vocab_size()
sentence_config.num_hidden_layers = 6
sentence_config.hidden_size = 256
sentence_config.num_attention_heads = 4
sentence_config.max_position_embeddings = sentence_block_length  # sentence_block_length

# Document-level encoder: 3-layer BERT consuming sentence-block embeddings.
document_config = BertConfig()
document_config.vocab_size = tokenizer.get_vocab_size()
document_config.num_hidden_layers = 3
document_config.hidden_size = 256
document_config.num_attention_heads = 4
document_config.max_position_embeddings = max_sentence_blocks  # sentence_block_length

dataset = Dataset(file_path, tokenizer, sentence_block_length, max_sentence_blocks, mask=True)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fun)
#sentence_model = AutoModel.from_config(sentence_config)
#document_model = AutoModel.from_config(document_config)
# Sentence encoder trained with a masked-LM head; document encoder is a plain BERT stack.
sentence_model = BertForMaskedLM(sentence_config)
document_model = BertModel(document_config)
# Projection applied to sentence embeddings (same in/out width).
dense1 = torch.nn.Linear(sentence_config.hidden_size, sentence_config.hidden_size)
def train_generator_cross_valid(datapath, batch_size, lr, num_epochs, output, prot):
    """Train an MLP classifier over 10 cross-validation stages for one target.

    For each stage, trains on that stage's split, tracks the model with the
    lowest validation loss, saves it under ``output``, and appends per-epoch
    loss/accuracy curves to ``logs/<prot>_logs_mlp.json``.

    @param datapath: directory holding the per-sample feature files
    @param batch_size: DataLoader batch size
    @param lr: SGD learning rate
    @param num_epochs: epochs per stage
    @param output: directory in which the best model per stage is saved
    @param prot: protein/target name used in file names
    """
    input_size = 400
    hidden_size = 128
    output_size = 2

    # Keep the top-400 feature indices from the pre-computed permutation
    # feature importances (file maps feature-index strings to scores).
    with open('permu_feature_importance.json') as json_file:
        feature_importance = json.load(json_file)
    feature_sample = [int(name) for name in list(feature_importance)[:400]]

    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4}
    logs = {}
    for stage in range(10):
        path_train = path_valid = datapath
        new_train_list, new_valid_list, labels = split_train_valid(stage, prot)
        print(len(new_train_list), len(new_valid_list))
        print(path_train, path_valid)
        partition = {"train": new_train_list, "validation": new_valid_list}

        # Generators
        training_set = Dataset(partition['train'], labels, path_train, feature_sample)
        training_generator = data.DataLoader(training_set, **params)
        validation_set = Dataset(partition['validation'], labels, path_valid, feature_sample)
        validation_generator = data.DataLoader(validation_set, **params)
        print('Training data is ready')

        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        mlp = mlp_model(input_size, hidden_size, output_size)
        mlp = mlp.to(device)
        print(mlp.parameters)
        optimizer = torch.optim.SGD(mlp.parameters(), lr=lr, momentum=0.9,
                                    weight_decay=0.00001)
        criterion = torch.nn.CrossEntropyLoss()

        key_words = [
            'train_loss_' + str(stage), 'train_accuracy_' + str(stage),
            'val_loss_' + str(stage), 'val_accuracy_' + str(stage)
        ]
        for kw in key_words:
            logs[kw] = []

        best_val_loss = float('inf')
        if not os.path.exists(output):
            os.makedirs(output)
        best_saved = str(stage) + '_mlp_supertarget_' + prot + '.pt'
        best_path = output + best_saved

        for epoch in range(num_epochs):
            print(epoch)
            train_acc_sum = 0
            train_loss_sum = 0.0
            val_acc_sum = 0
            val_loss_sum = 0.0

            mlp.train()
            for train_inputs, train_labels in training_generator:
                train_inputs, train_labels = train_inputs.to(device), train_labels.to(device)
                train_outputs = mlp(train_inputs.float())
                train_loss = criterion(train_outputs, train_labels)
                optimizer.zero_grad()  # zero the gradient buffer
                train_loss.backward()
                optimizer.step()
                # argmax over the class scores gives the predicted label
                _, train_predicted = torch.max(train_outputs, 1)
                train_loss_sum += train_loss.detach() * train_inputs.size(0)
                train_acc_sum += (train_predicted == train_labels.data).sum()
            train_loss_epoch = train_loss_sum.item() / len(training_set)
            train_acc_epoch = train_acc_sum.item() / len(training_set)
            if (epoch + 1) % 1 == 0:
                print('Epoch [{}/{}], Training Loss:{}, Training Accuracy:{}'.
                      format(epoch + 1, num_epochs, train_loss_epoch,
                             train_acc_epoch))
            logs[key_words[0]].append(train_loss_epoch)
            logs[key_words[1]].append(train_acc_epoch)

            mlp.eval()
            # BUG FIX: torch.no_grad() was previously called as a bare
            # statement, which has no effect; it must wrap the loop as a
            # context manager so validation doesn't build autograd graphs.
            with torch.no_grad():
                for val_inputs, val_labels in validation_generator:
                    val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
                    val_outputs = mlp(val_inputs.float())
                    val_loss = criterion(val_outputs, val_labels)
                    _, val_predicted = torch.max(val_outputs, 1)
                    val_loss_sum += val_loss.detach() * val_inputs.size(0)
                    val_acc_sum += (val_predicted == val_labels.data).sum()
            val_loss_epoch = val_loss_sum.item() / len(validation_set)
            val_acc_epoch = val_acc_sum.item() / len(validation_set)
            if (epoch + 1) % 1 == 0:
                print(
                    'Epoch [{}/{}], Validation Loss:{}, Validation Accuracy:{}'
                    .format(epoch + 1, num_epochs, val_loss_epoch,
                            val_acc_epoch))
            logs[key_words[2]].append(val_loss_epoch)
            logs[key_words[3]].append(val_acc_epoch)

            # Checkpoint the best model (lowest validation loss) for this stage.
            if val_loss_epoch < best_val_loss:
                # BUG FIX: the stale-checkpoint check used the bare file name
                # while the model is saved under `output`; check the real path.
                if os.path.exists(best_path):
                    os.remove(best_path)
                best_val_loss = val_loss_epoch
                best_val_loss_dict = {
                    'train_loss': train_loss_epoch,
                    'train_acc': train_acc_epoch,
                    'val_loss': val_loss_epoch,
                    'val_acc': val_acc_epoch
                }
                print('best val loss is', best_val_loss)
                best_mlp = copy.deepcopy(mlp)
                torch.save(best_mlp, best_path)

        print('results at minimum val loss:')
        print(best_val_loss_dict)

    log_path = 'logs/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    with open(log_path + prot + '_logs_mlp.json', 'w') as fp:
        json.dump(logs, fp)
def train_cross_validation(k, datapath, batch_size, lr, num_epochs):
    """5-stage cross-validated training of an MLP on graph2vec features.

    For each stage, trains for ``num_epochs``, records loss/accuracy curves,
    computes ROC curves from the LAST epoch's validation probabilities,
    saves the final model under ``model/``, and writes logs + ROC data to
    ``results/``.

    @param k: fold/run index used in file and directory names
    @param datapath: base directory; data lives in datapath + str(k) + '/'
    @param batch_size: DataLoader batch size
    @param lr: SGD learning rate
    @param num_epochs: epochs per stage
    """
    # hyper parameters
    input_size = 600
    hidden_size = 128
    output_size = 2
    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4}
    stage_list = [1, 2, 3, 4, 5]
    logs = {}
    best_val_loss_roc = {}
    for stage in stage_list:
        path_train = path_valid = datapath + str(k) + '/'
        new_train_list, new_valid_list, labels = split_train_valid(stage)
        print(len(new_train_list), len(new_valid_list))
        print(path_train, path_valid)
        partition = {"train": new_train_list, "validation": new_valid_list}

        # Generators
        training_set = Dataset(partition['train'], labels, path_train)
        training_generator = data.DataLoader(training_set, **params)
        validation_set = Dataset(partition['validation'], labels, path_valid)
        validation_generator = data.DataLoader(validation_set, **params)
        print('Training data is ready')

        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        mlp = mlp_model(input_size, hidden_size, output_size)
        mlp = mlp.to(device)
        print(mlp.parameters)
        optimizer = torch.optim.SGD(mlp.parameters(), lr=lr, momentum=0.9,
                                    weight_decay=0.00001)
        criterion = torch.nn.CrossEntropyLoss()

        key_words = [
            'train_loss_' + str(stage), 'train_accuracy_' + str(stage),
            'val_loss_' + str(stage), 'val_accuracy_' + str(stage)
        ]
        for kw in key_words:
            logs[kw] = []

        for epoch in range(num_epochs):
            print(epoch)
            train_acc_sum = 0
            train_loss_sum = 0.0
            val_acc_sum = 0
            val_loss_sum = 0.0

            mlp.train()
            for train_inputs, train_labels in training_generator:
                train_inputs, train_labels = train_inputs.to(device), train_labels.to(device)
                train_outputs = mlp(train_inputs.float())
                train_loss = criterion(train_outputs, train_labels)
                optimizer.zero_grad()  # zero the gradient buffer
                train_loss.backward()
                optimizer.step()
                # argmax over the class scores gives the predicted label
                _, train_predicted = torch.max(train_outputs, 1)
                train_loss_sum += train_loss.detach() * train_inputs.size(0)
                train_acc_sum += (train_predicted == train_labels.data).sum()
            train_loss_epoch = train_loss_sum.item() / len(training_set)
            train_acc_epoch = train_acc_sum.item() / len(training_set)
            if (epoch + 1) % 1 == 0:
                print('Epoch [{}/{}], Training Loss:{}, Training Accuracy:{}'.
                      format(epoch + 1, num_epochs, train_loss_epoch,
                             train_acc_epoch))
            logs[key_words[0]].append(train_loss_epoch)
            logs[key_words[1]].append(train_acc_epoch)

            mlp.eval()
            # Accumulators for per-sample probabilities / one-hot labels (ROC).
            epoch_outproba_val = np.empty((0, output_size))
            epoch_labels_val = np.empty((0, output_size))
            # BUG FIX: torch.no_grad() was previously called as a bare
            # statement (no effect); it must wrap the validation loop.
            with torch.no_grad():
                for val_inputs, val_labels in validation_generator:
                    val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
                    val_outputs = mlp(val_inputs.float())
                    val_loss = criterion(val_outputs, val_labels)
                    _, val_predicted = torch.max(val_outputs, 1)
                    val_loss_sum += val_loss.detach() * val_inputs.size(0)
                    val_acc_sum += (val_predicted == val_labels.data).sum()
                    our_labels = val_labels.cpu().numpy()
                    outproba = val_outputs.cpu().detach().numpy()
                    our_target = to_onehot(our_labels)
                    epoch_labels_val = np.append(epoch_labels_val, our_target, axis=0)
                    epoch_outproba_val = np.append(epoch_outproba_val, outproba, axis=0)
            val_loss_epoch = val_loss_sum.item() / len(validation_set)
            val_acc_epoch = val_acc_sum.item() / len(validation_set)
            if (epoch + 1) % 1 == 0:
                print(
                    'Epoch [{}/{}], Validation Loss:{}, Validation Accuracy:{}'
                    .format(epoch + 1, num_epochs, val_loss_epoch,
                            val_acc_epoch))
            logs[key_words[2]].append(val_loss_epoch)
            logs[key_words[3]].append(val_acc_epoch)

        model_saved = str(k) + '_mlp_graph2vec_opt_' + str(stage) + '.pt'

        # ROC from the last epoch's validation probabilities, per class,
        # stored as plain lists so they can be JSON-serialized.
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        thresholds = dict()
        for i in range(output_size):
            y_score = np.array(epoch_outproba_val[:, i])
            y_test = np.array(epoch_labels_val[:, i])
            fpr[i], tpr[i], thresholds[i] = metrics.roc_curve(y_test, y_score)
            roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        print(roc_auc[1])
        # Only the positive class (index 1) curve is persisted per stage.
        best_val_loss_roc['fpr_' + str(stage)] = fpr[1].tolist()
        best_val_loss_roc['tpr_' + str(stage)] = tpr[1].tolist()
        best_val_loss_roc['thresholds_' + str(stage)] = thresholds[1].tolist()
        best_val_loss_roc['auc_' + str(stage)] = roc_auc[1]

        # BUG FIX: the old code compared the literal 'model/' against None
        # (dead branch that would have crashed anyway) and checked existence
        # of the bare file name rather than the full save path.
        model_path = 'model/'
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        if os.path.exists(model_path + model_saved):
            os.remove(model_path + model_saved)
        torch.save(mlp, model_path + model_saved)
        # Round-trip load as a sanity check that the checkpoint is readable.
        mm = torch.load(model_path + model_saved)
        print(mm.parameters)

    log_path = 'results/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    with open(log_path + str(k) + '_logs.json', 'w') as fp:
        json.dump(logs, fp)
    with open(log_path + str(k) + '_roc.json', 'w') as f_roc:
        json.dump(best_val_loss_roc, f_roc)
# NOTE(review): fragment cut mid-expression at both ends -- the
# os.path.join(...) call below opens on a line above this excerpt, and the
# final `exp_details = [` continues past it. base_path, data_partition_name,
# labels, ids, partitioned_datasets, experiment_name, no_of_epochs, seed_no,
# lamda and learning_rate are all defined outside view.
                              'labels.json')
ids_path = os.path.join(base_path, data_partition_name, 'ids.json')
# Load the labels and sample ids for this partition.
with open(label_path) as json_file:
    labels[data_partition_name] = json.load(json_file)
with open(ids_path) as json_file:
    ids[data_partition_name] = json.load(json_file)
# NOTE(review): "H:%M:%S" looks like it is missing '%' before H -- the
# literal 'H' will appear in the timestamp. Confirm and fix upstream.
exp_details = [
    date.today().strftime("%d/%m/%Y"),
    datetime.now().strftime("H:%M:%S"), experiment_name, no_of_epochs,
    seed_no
]
# One Dataset per partition, all sharing the same base path.
partitioned_datasets['train'] = Dataset(ids['train'],
                                        labels['train'],
                                        partition='train',
                                        base_path=base_path)
partitioned_datasets['test'] = Dataset(ids['test'],
                                       labels['test'],
                                       partition='test',
                                       base_path=base_path)
partitioned_datasets['val'] = Dataset(ids['val'],
                                      labels['val'],
                                      partition='val',
                                      base_path=base_path)
# Record the hyperparameters used for this run.
experiment_details = {}
experiment_details['exp_name'] = experiment_name
experiment_details['lamda'] = lamda
experiment_details['lr'] = learning_rate
exp_details = [
# --- Training setup: FCN-8s landmark network on the COFW face dataset ---
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Input image resolution and output heatmap resolution for the network.
in_res = (224, 224)
out_res = (224, 224)
num_landmarks = 29  # number of facial landmarks predicted per image
num_epochs = 5
batch_size = 8

xnet = FCN8sNet(in_res=in_res, num_landmarks=num_landmarks)
xnet = xnet.to(device, dtype=torch.float)

# The same annotation file serves both splits; is_train selects the split.
train_dataset = Dataset("data/cofw_annotations.json", "data/cofw/images", inres=in_res, outres=out_res, is_train=True)
val_dataset = Dataset("data/cofw_annotations.json", "data/cofw/images", inres=in_res, outres=out_res, is_train=False)
num_train = train_dataset.get_dataset_size()
num_val = val_dataset.get_dataset_size()
print('[INFO] Training size: {}'.format(num_train))
print('[INFO] Validation size: {}'.format(num_val))
def mlp_test(data_path, model_path, output_path):
    """Evaluate a saved MLP on the held-out test set and dump ROC metrics.

    Loads the top-400 permutation-importance features (must match training),
    runs the saved model over the test DataLoader, computes per-class and
    micro-averaged ROC curves, and pickles them under ``output_path``.

    @param data_path: directory holding the per-sample feature files
    @param model_path: path to the torch-saved model
    @param output_path: directory for the pickled ROC outputs
    @return (test_accuracy, class-1 ROC AUC, micro-averaged precision)
    """
    batch_size = 1000
    n_classes = 2

    # Top-400 feature indices selected by permutation importance.
    with open('permu_feature_importance.json') as json_file:
        feature_importance = json.load(json_file)
    feature_sample = [int(name) for name in list(feature_importance)[:400]]

    with open('test_list.pkl', 'rb') as f_test:
        new_test_list = pickle.load(f_test)
    print('The size of the test dataset', len(new_test_list))
    with open('test_label.pickle', 'rb') as f_label:
        labels = pickle.load(f_label)

    test_dataset = Dataset(new_test_list, labels, data_path, feature_sample)
    print(data_path)
    test_generator = data.DataLoader(dataset=test_dataset,
                                     batch_size=batch_size,
                                     shuffle=True)
    mlp = torch.load(model_path)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # our_scores: (n_samples, n_classes) probabilities;
    # our_target: matching one-hot labels.
    our_scores, our_target, test_accuracy = class_probabilities_test(
        mlp, device, test_generator, n_classes)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    thresh = dict()
    for i in range(n_classes):
        y_score = np.array(our_scores[:, i])
        y_test = np.array(our_target[:, i])
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_score)
        roc_auc[i] = auc(fpr[i], tpr[i])
    # BUG FIX: the micro-average must pool ALL classes; previously it reused
    # the last class's 1-D columns (where .ravel() was a no-op) instead of
    # the full score/label matrices.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        np.array(our_target).ravel(), np.array(our_scores).ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    precision = average_precision_score(
        np.array(our_target).ravel(), np.array(our_scores).ravel())

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    print(roc_auc)
    # 'wb' truncates on open, so pre-deleting the old files is unnecessary.
    with open(output_path + 'roc_auc.pkl', "wb") as f1:
        pickle.dump(roc_auc, f1)
    with open(output_path + "fpr.pkl", "wb") as f2:
        pickle.dump(fpr, f2)
    with open(output_path + "tpr.pkl", "wb") as f3:
        pickle.dump(tpr, f3)
    with open(output_path + "thresh.pkl", "wb") as f4:
        pickle.dump(thresh, f4)
    return test_accuracy, roc_auc[1], precision
def calculate_var_imports_refits(dataset, param_grid, cond_layer_sizes, var_import_idxs=None, cv=3):
    """
    Estimate variable importance, assumes we need to refit for each set of variable groups
    @param dataset: Dataset
    @param param_grid: dictionary to CV over, contains all values for initializing
                NeuralNetworkAugMTL (see docs for GridSearchCV from scikit)
    @param cond_layer_sizes: a list of list of network structures, each list of network
                structures is what we search over for estimating the reduced conditional
                means, ordering according to param_grid[0]["var_import_idxs"]
    @param var_import_idxs: variable groups to assess; defaults to every single column
    @param cv: number of folds for Cross validation
    @return tuple with:
        1. list of dicts, in the order of the variable groups in
           param_grid[0]["var_import_idxs"]. Each dict contains:
           {
             "std-True": dict corresponding to naive and one-step estimates of var
                         importance (and conf intervals) for standardized variable importance,
             "std-False": dict corresponding to naive and one-step estimates of var
                          importance (and conf intervals) for not-standardized variable importance,
           }
        2. dict of fitted model params and CV results
    """
    import copy  # local: used below to avoid mutating the caller's param_grid

    # Pick best parameters via cross validation
    best_params, cv_results = _get_best_params(NeuralNetworkBasic, param_grid,
                                               dataset, cv=cv)
    logging.info("Best params %s", str(best_params))
    # Fit for the full conditional mean
    final_nn = NeuralNetworkBasic(**best_params)
    final_nn.fit(dataset.x_train, dataset.y_train)
    # R^2 of the full model on train and test data
    full_fit = final_nn.predict(dataset.x_train)
    r2_full = 1 - np.sum((dataset.y_train - full_fit)**2) / np.sum(
        (dataset.y_train - np.mean(dataset.y_train))**2)
    full_fit_test = final_nn.predict(dataset.x_test)
    r2_full_pred = 1 - np.sum((dataset.y_test - full_fit_test)**2) / np.sum(
        (dataset.y_test - np.mean(dataset.y_test))**2)

    var_imports = []
    fitted_models = {
        "full": final_nn.model_params,
        "cond": {},
        "cv_results": cv_results,
        "cond_cv_results": {}
    }
    # set up which var importance values to calculate if not passed in
    if var_import_idxs is None:
        var_import_idxs = range(dataset.x_train.shape[1])

    # Get the estimated variable importance for each variable group
    for i, del_idx_group in enumerate(var_import_idxs):
        # Prepare dataset without the particular variables
        cond_x_train = np.delete(dataset.x_train, del_idx_group, axis=1)
        cond_x_test = np.delete(dataset.x_test, del_idx_group, axis=1)
        # NOTE(review): the last argument reuses y_train_true for the test
        # slot -- looks like it should be a y_test_true; confirm Dataset's
        # constructor signature.
        cond_dataset = Dataset(cond_x_train, dataset.y_train,
                               dataset.y_train_true, cond_x_test,
                               dataset.y_test, dataset.y_train_true)
        # BUG FIX: previously `cond_param_grid = param_grid` merely aliased
        # the caller's grid, so assigning layer_sizes below mutated the input
        # argument (and leaked across loop iterations). Deep-copy instead.
        cond_param_grid = copy.deepcopy(param_grid)
        cond_param_grid[0]["layer_sizes"] = cond_layer_sizes[i]
        # Fit for reduced conditional means
        best_cond_params, cv_results_cond = _get_best_params(
            NeuralNetworkBasic, cond_param_grid, cond_dataset, cv=cv)
        logging.info("Best cond params %s", str(best_cond_params))
        cond_nn = NeuralNetworkBasic(**best_cond_params)
        # Refit!
        cond_nn.fit(cond_x_train, dataset.y_train)
        fitted_models["cond"][str(del_idx_group)] = cond_nn.model_params
        fitted_models["cond_cv_results"][str(del_idx_group)] = cv_results_cond

        # Get new fitted values
        small_fit = cond_nn.predict(cond_x_train)
        small_fit_test = cond_nn.predict(cond_x_test)
        ## calculate R^2
        r2_small = 1 - np.sum((dataset.y_train - small_fit)**2) / np.sum(
            (dataset.y_train - np.mean(dataset.y_train))**2)
        ## calculate predicted R^2
        r2_small_pred = 1 - np.sum(
            (dataset.y_test - small_fit_test)**2) / np.sum(
                (dataset.y_test - np.mean(dataset.y_test))**2)
        logging.info("==== %s =======", str(del_idx_group))
        logging.info("r2 small: %f", r2_small)
        logging.info("r2 small pred: %f", r2_small_pred)

        ## calculate estimators both standardized and unstandardized
        var_import_ret = {}
        for std in [True, False]:
            ests = vi.variableImportance(full_fit, small_fit, dataset.y_train, std)
            naive = np.array([ests[0]])
            onestep = np.array([ests[1]])
            ## calculate standard error for one-step
            onestep_se = se.variableImportanceSE(full_fit, small_fit,
                                                 dataset.y_train, std)
            ## calculate CI for one-step
            onestep_ci = ci.variableImportanceCI(onestep, onestep_se, level=0.95)
            ret = {
                'naive': np.array(naive),  # naive estimate
                'onestep': onestep,  # one-step estimate
                'onestep.se': onestep_se,  # std error of one-step est
                'onestep.ci': onestep_ci,  # conf int for var import
                'r2.full': r2_full,  # R^2 for the full conditional mean on train data
                'r2.small': r2_small,  # R^2 for the reduced conditional mean on train data
                'r2.test.full': r2_full_pred,  # R^2 for the full conditional mean on test data
                'r2.test.small': r2_small_pred  # R^2 for the reduced conditional mean on test data
            }
            var_import_ret["std-%s" % std] = ret
        var_imports.append(var_import_ret)
    return var_imports, fitted_models
def cv_predictiveness(data, S, measure, pred_func, V=5, stratified=True, na_rm=False, type="regression", ensemble=False, run_cv=False):
    """
    Compute a cross-validated measure of predictiveness based on the data and the chosen measure

    @param data: dataset
    @param S: the covariates to fit
    @param measure: measure of predictiveness
    @param pred_func: function that fits to the data
    @param V: the number of CV folds
    @param stratified: should the folds be stratified?
    @param na_rm: should we do a complete-case analysis (True) or not (False)
    @param type: is this regression (use predict) or classification (use predict_proba)?
    @param ensemble: is this an ensemble (True) or not (False)?
    @return cross-validated measure of predictiveness, along with preds and ics
    """
    import numpy as np
    from compute_ic import compute_ic
    import utils as uts
    from data_generator import Dataset
    ## if na_rm = True, do a complete-case analysis
    if na_rm:
        # Keep only rows with no NaNs in the selected covariates.
        xs = data.x_train[:, S]
        cc = np.sum(np.isnan(xs), axis=1) == 0
        newdata = Dataset(x_train=data.x_train[cc, :],
                          y_train=data.y_train[cc])
    else:
        # All rows are complete cases.
        cc = np.repeat(True, data.x_train.shape[0])
        newdata = data
    ## set up CV folds
    folds = uts.make_folds(newdata, V, stratified=stratified)
    ## do CV
    # preds/ics are sized on the ORIGINAL data and NaN-filled, so rows
    # dropped by the complete-case filter stay NaN in the output.
    preds = np.empty((data.y_train.shape[0], ))
    preds.fill(np.nan)
    ics = np.empty((data.y_train.shape[0], ))
    ics.fill(np.nan)
    # preds = np.empty((newdata.y_train.shape[0],))
    vs = np.empty((V, ))  # one predictiveness value per fold
    # ics = np.empty((newdata.y_train.shape[0],))
    # Map positions in the complete-case subset back to original row indices.
    cc_cond = np.flatnonzero(cc)
    for v in range(V):
        fold_cond = np.flatnonzero(folds == v)
        x_train, y_train = newdata.x_train[folds != v, :], newdata.y_train[
            folds != v]
        x_test, y_test = newdata.x_train[folds == v, :], newdata.y_train[folds ==
                                                                         v]
        pred_func.fit(x_train[:, S], np.ravel(y_train))
        if ensemble:
            # NOTE(review): np.mean with no axis collapses the ensemble
            # predictions to a single scalar, which is then broadcast over
            # the whole fold -- axis-wise averaging may be intended; confirm.
            preds_v = np.mean(pred_func.transform(x_test[:, S]))
        else:
            if type == "classification":
                preds_v = pred_func.predict_proba(x_test[:, S])[:, 1]
            else:
                preds_v = pred_func.predict(x_test[:, S])
        # Scatter fold predictions back into original-row positions.
        preds[cc_cond[fold_cond]] = preds_v
        vs[v] = measure(y_test, preds_v)
        # Influence-curve contributions for variance estimation.
        ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
    return np.mean(vs), preds, ics, folds