Example #1 (score: 0)
File: read_data.py — project: jjfeng/spinn
def main(args=sys.argv[1:]):
    """Read a CSV dataset, optionally center/scale y, shuffle, split into
    train/test, and pickle the resulting Dataset pair to ``args.out_file``.

    NOTE(review): the ``args`` parameter is immediately shadowed by
    ``parse_args()`` below, so the command-line slice passed in is never
    used — presumably ``parse_args(args)`` was intended; confirm.
    """
    args = parse_args()
    np.random.seed(args.seed)
    print(args)

    # X: feature matrix, y: response vector; input file must have a header.
    X, y = read_data(args.in_file, has_header=True)

    print(X.shape)
    print(y.shape)
    print(y.mean())
    # Reshape y to a column vector so row-slicing below keeps 2-D shape.
    y = y.reshape(y.size, 1)
    if args.center_y:
        y -= np.mean(y)
    if args.scale_y:
        y /= np.sqrt(np.var(y))
    # Random permutation of all row indices (sampling without replacement).
    shuffled_idx = np.random.choice(y.size, size=y.size, replace=False)
    shuff_X = X[shuffled_idx, :]
    shuff_y = y[shuffled_idx]

    # Hold out the trailing test_proportion fraction of the shuffled rows.
    n_train = y.size - int(y.size * args.test_proportion)

    # y is passed twice — presumably (X, y, y_true) with no separate
    # noise-free response available here; confirm against Dataset.
    train_data = Dataset(shuff_X[:n_train, :], shuff_y[:n_train, :],
                         shuff_y[:n_train, :])
    test_data = Dataset(shuff_X[n_train:, :], shuff_y[n_train:, :],
                        shuff_y[n_train:, :])

    print("data_file %s" % args.out_file)
    with open(args.out_file, "wb") as f:
        pickle.dump({"train": train_data, "test": test_data}, f)
Example #2 (score: 0)
File: common.py — project: jjfeng/spinn
def read_input_data(args):
    """Return (train_data, test_data) Datasets for the experiment.

    Two loading modes:
      * no index file: ``args.data_file`` is a pickle holding both splits;
      * index file given: features/labels are read from CSVs and sliced by
        the pickled train/test row indices, keeping at most 5000 randomly
        chosen feature columns.
    """
    import pandas as pd
    from data_generator import Dataset

    if args.data_index_file is None:
        # Whole dataset was pickled as {"train": ..., "test": ...}.
        with open(args.data_file, "rb") as f:
            pickled = pickle.load(f)
        return pickled["train"], pickled["test"]

    with open(args.data_index_file, "rb") as f:
        split = pickle.load(f)
    train_idx = split["train"]
    test_idx = split["test"]
    print(train_idx)
    print(test_idx)

    # Drop the first CSV column (row label), then subsample columns.
    features = pd.read_csv(args.data_X_file).values[:, 1:]
    keep_cols = np.random.choice(features.shape[1],
                                 size=min(features.shape[1], 5000),
                                 replace=False)
    features = features[:, keep_cols]
    print(features)
    responses = pd.read_csv(args.data_y_file).values
    print(responses)

    # Response is passed twice (observed and "true" values coincide here).
    train_data = Dataset(features[train_idx, :], responses[train_idx, :],
                         responses[train_idx, :])
    test_data = Dataset(features[test_idx, :], responses[test_idx, :],
                        responses[test_idx, :])
    return train_data, test_data
Example #3 (score: 0)
File: main.py — project: dmolony3/SMITH
def train():
    """Train the SMITH model: tokenize documents, build the masked-LM
    dataloader, and run the joint sentence/word prediction loop.

    NOTE(review): ``parser``, ``optimizer``, and ``scheduler`` are not
    defined in this function — presumably module-level globals; confirm.
    """
    tokenizer = BertWordPieceTokenizer(
        r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\vocab.txt',
        lowercase=True)

    args = parser.parse_args()
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    file_path = args.file_path

    # NOTE(review): this hard-coded path overrides args.file_path, so the
    # CLI argument and the None check below are effectively dead — looks
    # like a debugging leftover; confirm before removing.
    file_path = r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\abstracts_100.csv'

    if file_path is None:
        # Fix: the ValueError was previously constructed but never raised,
        # so a missing path went silently undetected.
        raise ValueError("A file path to documents must be provided")

    sentence_config, document_config = set_model_config(args, tokenizer)

    # mask=True enables token masking for the masked-LM objective.
    dataset = Dataset(file_path,
                      tokenizer,
                      sentence_config.max_position_embeddings,
                      document_config.max_position_embeddings,
                      mask=True)

    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=collate_fun)

    model = SmithModel(sentence_config, document_config)

    for epoch in range(num_epochs):
        for iteration, (token_ids, attention_mask, token_type_ids, label_ids,
                        split_idx) in enumerate(dataloader):
            # The collate function yields sequences of tensors per field;
            # concatenate them along dim 0 for the sentence-level encoder.
            token_ids_stacked = torch.cat(token_ids)
            label_ids_stacked = torch.cat(label_ids)
            attention_mask = torch.cat(attention_mask)
            token_type_ids = torch.cat(token_type_ids)

            output = model(input_ids=token_ids_stacked,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           split_idx=split_idx,
                           labels=label_ids_stacked)

            # output[0]: sentence-prediction loss; output[1]: word-prediction
            # loss — the total objective is their sum.
            loss_sp = output[0]
            loss_wp = output[1]
            loss = loss_sp + loss_wp

            if iteration % 10 == 0:
                print("Iteration {}: Loss: {}".format(iteration, loss))

            loss.backward()

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
Example #4 (score: 0)
def main(args):
    """Seed RNGs, build the data iterators, construct and initialize the
    model, then run training. Mutates ``args`` with vocab sizes and the
    sos/eos/pad indices before model construction."""
    print("+++ main")

    # Deterministic experiment setup
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data iterators
    dataset = Dataset(args.batch_size, args.nopad, device)
    train_iterator, valid_iterator, test_iterator = dataset.create_iterator()

    source_vocab = dataset.source_field.vocab
    target_vocab = dataset.target_field.vocab
    args.input_size = len(source_vocab)
    args.output_size = len(target_vocab)
    print(f"Source vocab size = {len(source_vocab)}")
    print(f"Target vocab size = {len(target_vocab)}")
    args.sos_idx = target_vocab.stoi[args.sos]
    args.eos_idx = target_vocab.stoi[args.eos]
    args.pad_idx = target_vocab.stoi[args.pad]

    # Model construction and weight initialization
    model = create_model(args.model_type, args, device).to(device)
    model.apply(create_init_function(args.init_type))
    print(f"Total number of parameters = {model.count_parameters()}")

    # Optimizer and loss (padding positions excluded from the loss)
    optimizer = create_optimizer(args.optim_type, args.learning_rate,
                                 model.parameters(), args)
    criterion = nn.CrossEntropyLoss(ignore_index=args.pad_idx)

    # Run the training loop
    training(model, train_iterator, valid_iterator, optimizer, args.optim_type,
             criterion, args.num_epochs, args.clip, args.nopad, device)

    print("--- main")
def return_data_set_generator(base_path='../data/processed_poke_3',
                              data_partition_name='test',
                              batch_size=1,
                              shuffle=True,
                              num_workers=1):
    """Build a DataLoader over one data partition.

    Generalized from the original hard-coded values; the defaults preserve
    the previous behavior exactly (test partition, batch size 1).

    @param base_path: root directory holding per-partition subdirectories
    @param data_partition_name: partition subdirectory ('train'/'test'/...)
    @param batch_size: loader batch size
    @param shuffle: whether the loader shuffles each epoch
    @param num_workers: number of loader worker processes
    @return: torch DataLoader yielding items of the chosen partition
    """
    label_path = os.path.join(base_path, data_partition_name, 'labels.json')
    ids_path = os.path.join(base_path, data_partition_name, 'ids.json')

    with open(label_path) as json_file:
        labels = json.load(json_file)

    with open(ids_path) as json_file:
        ids = json.load(json_file)

    # Dataset is the project's torch Dataset wrapper for this data layout.
    test_dataset = Dataset(ids, labels, partition=data_partition_name,
                           base_path=base_path)

    params = {'batch_size': batch_size, 'shuffle': shuffle,
              'num_workers': num_workers}

    test_data_generator = data.DataLoader(test_dataset, **params)
    return test_data_generator
Example #6 (score: 0)
# Sentence-level BERT configuration: small 6-layer encoder whose position
# embeddings span one sentence block. (sentence_config is created above,
# outside this fragment.)
sentence_config.vocab_size = tokenizer.get_vocab_size()
sentence_config.num_hidden_layers = 6
sentence_config.hidden_size = 256
sentence_config.num_attention_heads = 4
sentence_config.max_position_embeddings = sentence_block_length  # sentence_block_length

# Document-level BERT configuration: 3-layer encoder whose position
# embeddings span the maximum number of sentence blocks per document.
document_config = BertConfig()
document_config.vocab_size = tokenizer.get_vocab_size()
document_config.num_hidden_layers = 3
document_config.hidden_size = 256
document_config.num_attention_heads = 4
document_config.max_position_embeddings = max_sentence_blocks  # sentence_block_length

# mask=True enables token masking for the masked-LM objective.
dataset = Dataset(file_path,
                  tokenizer,
                  sentence_block_length,
                  max_sentence_blocks,
                  mask=True)
dataloader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=shuffle,
                        drop_last=drop_last,
                        collate_fn=collate_fun)

#sentence_model = AutoModel.from_config(sentence_config)
#document_model = AutoModel.from_config(document_config)
# Sentence level carries a masked-LM head; document level is a plain BERT
# encoder (presumably run over sentence-block embeddings — confirm).
sentence_model = BertForMaskedLM(sentence_config)
document_model = BertModel(document_config)

# Linear hidden -> hidden layer; presumably projects sentence-block
# embeddings before the document encoder — confirm against usage.
dense1 = torch.nn.Linear(sentence_config.hidden_size,
                         sentence_config.hidden_size)
Example #7 (score: 0)
def train_generator_cross_valid(datapath, batch_size, lr, num_epochs, output,
                                prot):
    """Cross-validated MLP training on the top-400 permutation-importance
    features; the best (lowest validation loss) model of each fold is
    checkpointed to `output`, and loss/accuracy logs go to `logs/`.

    @param datapath: directory containing the per-sample feature files
    @param batch_size: minibatch size for train and validation loaders
    @param lr: SGD learning rate
    @param num_epochs: number of epochs per fold
    @param output: directory where each fold's best model is saved
    @param prot: protein/target tag used by split_train_valid and file names
    """
    input_size = 400
    hidden_size = 128
    output_size = 2
    # load selected features list (ranked by permutation importance)
    with open('permu_feature_importance.json') as json_file:
        feature_importance = json.load(json_file)
    feature_list = []
    score_list = []
    for name, improtance in feature_importance.items():
        feature_list.append(name)
        score_list.append(improtance)

    # keep the 400 highest-ranked feature ids (names are numeric strings)
    feature_tmp = feature_list[0:400]
    feature_sample = list(map(int, feature_tmp))

    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4}

    stage_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    logs = {}

    for stage in stage_list:
        path_train = path_valid = datapath
        new_train_list, new_valid_list, labels = split_train_valid(stage, prot)
        print(len(new_train_list), len(new_valid_list))
        print(path_train, path_valid)
        partition = {"train": new_train_list, "validation": new_valid_list}
        # Generators
        training_set = Dataset(partition['train'], labels, path_train,
                               feature_sample)
        training_generator = data.DataLoader(training_set, **params)
        validation_set = Dataset(partition['validation'], labels, path_valid,
                                 feature_sample)
        validation_generator = data.DataLoader(validation_set, **params)
        print('Training data is ready')

        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        mlp = mlp_model(input_size, hidden_size, output_size)
        mlp = mlp.to(device)
        print(mlp.parameters)
        optimizer = torch.optim.SGD(mlp.parameters(),
                                    lr=lr,
                                    momentum=0.9,
                                    weight_decay=0.00001)
        criterion = torch.nn.CrossEntropyLoss()

        key_words = [
            'train_loss_' + str(stage), 'train_accuracy_' + str(stage),
            'val_loss_' + str(stage), 'val_accuracy_' + str(stage)
        ]
        logs[key_words[0]] = []
        logs[key_words[1]] = []
        logs[key_words[2]] = []
        logs[key_words[3]] = []
        best_val_loss = 9999999
        # Fix: ensure this exists even when num_epochs == 0, otherwise the
        # print after the epoch loop raised NameError.
        best_val_loss_dict = {}

        if not os.path.exists(output):
            os.makedirs(output)
        best_saved = str(stage) + '_mlp_supertarget_' + prot + '.pt'
        best_path = output + best_saved
        for epoch in range(num_epochs):
            print(epoch)
            train_acc_sum = 0
            train_loss_sum = 0.0
            val_acc_sum = 0
            val_loss_sum = 0.0
            mlp.train()

            for train_inputs, train_labels in training_generator:
                train_inputs, train_labels = train_inputs.to(
                    device), train_labels.to(device)
                train_outputs = mlp(train_inputs.float())
                train_loss = criterion(train_outputs, train_labels)
                optimizer.zero_grad()  # zero the gradient buffer
                train_loss.backward()
                optimizer.step()
                _, train_predicted = torch.max(
                    train_outputs,
                    1)  # find the max of softmax and map the predicted list
                # weight the batch loss by batch size for a correct epoch mean
                train_loss_sum += train_loss.detach() * train_inputs.size(0)
                train_acc_sum += (train_predicted == train_labels.data
                                  ).sum()  # different from CPU
            train_loss_epoch = train_loss_sum.item() / len(training_set)
            train_acc_epoch = train_acc_sum.item() / len(training_set)
            if (epoch + 1) % 1 == 0:
                print('Epoch [{}/{}],  Training Loss:{}, Training Accuracy:{}'.
                      format(epoch + 1, num_epochs, train_loss_epoch,
                             train_acc_epoch))
            logs[key_words[0]].append(train_loss_epoch)
            logs[key_words[1]].append(train_acc_epoch)

            mlp.eval()
            # Fix: the original executed `torch.no_grad()` as a bare
            # statement, which constructs and discards the context manager
            # and disables nothing; validation must run inside the `with`.
            with torch.no_grad():
                for val_inputs, val_labels in validation_generator:
                    val_inputs, val_labels = val_inputs.to(
                        device), val_labels.to(device)
                    val_outputs = mlp(val_inputs.float())
                    val_loss = criterion(val_outputs, val_labels)
                    _, val_predicted = torch.max(
                        val_outputs,
                        1)  # max of softmax gives the predicted class
                    val_loss_sum += val_loss.detach() * val_inputs.size(0)
                    val_acc_sum += (val_predicted == val_labels.data).sum()
            val_loss_epoch = val_loss_sum.item() / len(validation_set)
            val_acc_epoch = val_acc_sum.item() / len(validation_set)
            if (epoch + 1) % 1 == 0:
                print(
                    'Epoch [{}/{}],  Validation Loss:{}, Validation Accuracy:{}'
                    .format(epoch + 1, num_epochs, val_loss_epoch,
                            val_acc_epoch))
            logs[key_words[2]].append(val_loss_epoch)
            logs[key_words[3]].append(val_acc_epoch)

            # checkpoint whenever validation loss improves
            if val_loss_epoch < best_val_loss:
                # NOTE(review): this removes `best_saved` from the CWD while
                # the checkpoint is written to `best_path`; torch.save
                # overwrites anyway, so the remove looks like dead code.
                if os.path.exists(best_saved):
                    os.remove(best_saved)
                best_val_loss = val_loss_epoch
                best_val_loss_dict = {
                    'train_loss': train_loss_epoch,
                    'train_acc': train_acc_epoch,
                    'val_loss': val_loss_epoch,
                    'val_acc': val_acc_epoch
                }
                print('best val loss is', best_val_loss)
                best_mlp = copy.deepcopy(mlp)
                torch.save(best_mlp, best_path)

        print('results at minimum val loss:')
        print(best_val_loss_dict)

    log_path = 'logs/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    with open(log_path + prot + '_logs_mlp.json', 'w') as fp:
        json.dump(logs, fp)
def train_cross_validation(k, datapath, batch_size, lr, num_epochs):
    """5-fold cross-validated training of an MLP on graph2vec features.

    Per-fold ROC curves (computed from the FINAL epoch's validation
    outputs) and loss/accuracy logs are written under `results/`, and the
    last-epoch model of each fold is saved under `model/`.

    @param k: repetition index used in data paths and output file names
    @param datapath: root directory of the feature files (k/ is appended)
    @param batch_size: minibatch size for train and validation loaders
    @param lr: SGD learning rate
    @param num_epochs: number of epochs per fold
    """

    # hyper parameters
    input_size = 600
    hidden_size = 128
    output_size = 2

    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4}

    stage_list = [1, 2, 3, 4, 5]
    logs = {}
    best_val_loss_roc = {}
    for stage in stage_list:
        path_train = path_valid = datapath + str(k) + '/'
        new_train_list, new_valid_list, labels = split_train_valid(stage)
        print(len(new_train_list), len(new_valid_list))
        print(path_train, path_valid)
        partition = {"train": new_train_list, "validation": new_valid_list}
        # Generators
        training_set = Dataset(partition['train'], labels, path_train)
        training_generator = data.DataLoader(training_set, **params)
        validation_set = Dataset(partition['validation'], labels, path_valid)
        validation_generator = data.DataLoader(validation_set, **params)
        print('Training data is ready')

        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        mlp = mlp_model(input_size, hidden_size, output_size)
        mlp = mlp.to(device)
        print(mlp.parameters)
        optimizer = torch.optim.SGD(mlp.parameters(),
                                    lr=lr,
                                    momentum=0.9,
                                    weight_decay=0.00001)
        criterion = torch.nn.CrossEntropyLoss()

        key_words = [
            'train_loss_' + str(stage), 'train_accuracy_' + str(stage),
            'val_loss_' + str(stage), 'val_accuracy_' + str(stage)
        ]
        logs[key_words[0]] = []
        logs[key_words[1]] = []
        logs[key_words[2]] = []
        logs[key_words[3]] = []

        # Fix: initialize the ROC buffers before the epoch loop so the
        # roc_curve block below cannot hit a NameError when num_epochs == 0;
        # they are refilled each epoch and thus hold the LAST epoch's data.
        epoch_outproba_val = np.empty((0, output_size))
        epoch_labels_val = np.empty((0, output_size))
        for epoch in range(num_epochs):
            print(epoch)
            train_acc_sum = 0
            train_loss_sum = 0.0
            val_acc_sum = 0
            val_loss_sum = 0.0
            mlp.train()
            for train_inputs, train_labels in training_generator:
                train_inputs, train_labels = train_inputs.to(
                    device), train_labels.to(device)
                train_outputs = mlp(train_inputs.float())
                train_loss = criterion(train_outputs, train_labels)
                optimizer.zero_grad()  # zero the gradient buffer
                train_loss.backward()
                optimizer.step()
                _, train_predicted = torch.max(
                    train_outputs,
                    1)  # find the max of softmax and map the predicted list
                # weight the batch loss by batch size for a correct epoch mean
                train_loss_sum += train_loss.detach() * train_inputs.size(0)
                train_acc_sum += (train_predicted == train_labels.data
                                  ).sum()  # different from CPU

            train_loss_epoch = train_loss_sum.item() / len(training_set)
            train_acc_epoch = train_acc_sum.item() / len(training_set)
            if (epoch + 1) % 1 == 0:
                print('Epoch [{}/{}],  Training Loss:{}, Training Accuracy:{}'.
                      format(epoch + 1, num_epochs, train_loss_epoch,
                             train_acc_epoch))
            logs[key_words[0]].append(train_loss_epoch)
            logs[key_words[1]].append(train_acc_epoch)

            mlp.eval()
            # reset the per-epoch ROC buffers
            epoch_outproba_val = np.empty((0, output_size))
            epoch_labels_val = np.empty((0, output_size))
            # Fix: the original called `torch.no_grad()` as a bare statement,
            # which builds and discards the context manager and disables
            # nothing; validation must run inside the `with` block.
            with torch.no_grad():
                for val_inputs, val_labels in validation_generator:
                    val_inputs, val_labels = val_inputs.to(
                        device), val_labels.to(device)
                    val_outputs = mlp(val_inputs.float())
                    val_loss = criterion(val_outputs, val_labels)
                    _, val_predicted = torch.max(
                        val_outputs,
                        1)  # max of softmax gives the predicted class
                    val_loss_sum += val_loss.detach() * val_inputs.size(0)
                    val_acc_sum += (val_predicted == val_labels.data).sum()
                    # accumulate one-hot labels and class scores for ROC
                    our_labels = val_labels.cpu().numpy()
                    outproba = val_outputs.cpu()
                    outproba = outproba.detach().numpy()
                    our_target = to_onehot(our_labels)
                    epoch_labels_val = np.append(epoch_labels_val,
                                                 our_target,
                                                 axis=0)
                    epoch_outproba_val = np.append(epoch_outproba_val,
                                                   outproba,
                                                   axis=0)
            val_loss_epoch = val_loss_sum.item() / len(validation_set)
            val_acc_epoch = val_acc_sum.item() / len(validation_set)
            if (epoch + 1) % 1 == 0:
                print(
                    'Epoch [{}/{}],  Validation Loss:{}, Validation Accuracy:{}'
                    .format(epoch + 1, num_epochs, val_loss_epoch,
                            val_acc_epoch))
            logs[key_words[2]].append(val_loss_epoch)
            logs[key_words[3]].append(val_acc_epoch)

        model_saved = str(k) + '_mlp_graph2vec_opt_' + str(stage) + '.pt'
        # Per-class ROC on the last epoch's validation outputs; curves are
        # converted to lists so they can be serialized to JSON below.
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        thresholds = dict()
        for i in range(output_size):
            y_score = np.array(epoch_outproba_val[:, i])
            y_test = np.array(epoch_labels_val[:, i])
            fpr[i], tpr[i], thresholds[i] = metrics.roc_curve(y_test, y_score)
            roc_auc[i] = metrics.auc(fpr[i], tpr[i])
        # report AUC of class 1 (positive class)
        print(roc_auc[1])
        best_val_loss_roc['fpr_' + str(stage)] = fpr[1].tolist()
        best_val_loss_roc['tpr_' + str(stage)] = tpr[1].tolist()
        best_val_loss_roc['thresholds_' + str(stage)] = thresholds[1].tolist()
        best_val_loss_roc['auc_' + str(stage)] = roc_auc[1]
        model_path = 'model/'
        # Fix: the original tested `model_path == None`, which is always
        # False for the hard-coded 'model/' and would have crashed if it
        # WERE None (None + str); use `is None` and keep the working branch.
        if model_path is None:
            torch.save(mlp, model_path + model_saved)
        else:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
            # NOTE(review): removes `model_saved` from the CWD while the
            # model is saved to model_path + model_saved — looks like dead
            # code.
            if os.path.exists(model_saved):
                os.remove(model_saved)
            torch.save(mlp, model_path + model_saved)
            # reload immediately as a sanity check of the saved file
            mm = torch.load(model_path + model_saved)
            print(mm.parameters)

    log_path = 'results/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    with open(log_path + str(k) + '_logs.json', 'w') as fp:
        json.dump(logs, fp)
    with open(log_path + str(k) + '_roc.json', 'w') as f_roc:
        json.dump(best_val_loss_roc, f_roc)
                                  'labels.json')
        ids_path = os.path.join(base_path, data_partition_name, 'ids.json')

        with open(label_path) as json_file:
            labels[data_partition_name] = json.load(json_file)

        with open(ids_path) as json_file:
            ids[data_partition_name] = json.load(json_file)
    exp_details = [
        date.today().strftime("%d/%m/%Y"),
        datetime.now().strftime("H:%M:%S"), experiment_name, no_of_epochs,
        seed_no
    ]

    partitioned_datasets['train'] = Dataset(ids['train'],
                                            labels['train'],
                                            partition='train',
                                            base_path=base_path)
    partitioned_datasets['test'] = Dataset(ids['test'],
                                           labels['test'],
                                           partition='test',
                                           base_path=base_path)
    partitioned_datasets['val'] = Dataset(ids['val'],
                                          labels['val'],
                                          partition='val',
                                          base_path=base_path)
    experiment_details = {}
    experiment_details['exp_name'] = experiment_name
    experiment_details['lamda'] = lamda
    experiment_details['lr'] = learning_rate

    exp_details = [
Example #10 (score: 0)
# Device selection: prefer GPU when available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Input/output resolutions, landmark count (COFW annotations; 29 points),
# and basic training hyperparameters.
in_res = (224, 224)
out_res = (224, 224)
num_landmarks = 29
num_epochs = 5
batch_size = 8

# Fully-convolutional network; presumably predicts one output map per
# landmark — confirm against FCN8sNet's definition.
xnet = FCN8sNet(in_res=in_res, num_landmarks=num_landmarks)
xnet = xnet.to(device, dtype=torch.float)

# Train and validation splits share one annotation file; the is_train
# flag selects which split the Dataset exposes.
train_dataset = Dataset("data/cofw_annotations.json",
                        "data/cofw/images",
                        inres=in_res,
                        outres=out_res,
                        is_train=True)

val_dataset = Dataset("data/cofw_annotations.json",
                      "data/cofw/images",
                      inres=in_res,
                      outres=out_res,
                      is_train=False)

num_train = train_dataset.get_dataset_size()
num_val = val_dataset.get_dataset_size()

print('[INFO] Training size: {}'.format(num_train))
print('[INFO] Validation size: {}'.format(num_val))
Example #11 (score: 0)
def mlp_test(data_path, model_path, output_path):
    """Evaluate a saved MLP on the held-out test set and dump ROC metrics.

    Loads the top-400 permutation-importance features, runs the saved model
    over the pickled test split, computes per-class and micro-averaged ROC
    curves, and pickles roc_auc/fpr/tpr/thresh into `output_path`.

    @param data_path: directory containing the per-sample feature files
    @param model_path: path of the torch-saved model to evaluate
    @param output_path: directory (with trailing separator) for the pickles
    @return: (test_accuracy, AUC of class 1, micro-averaged avg precision)
    """
    batch_size = 1000
    n_classes = 2

    # load the feature-importance ranking and keep the top 400 feature ids
    with open('permu_feature_importance.json') as json_file:
        feature_importance = json.load(json_file)
    feature_list = []
    score_list = []
    for name, improtance in feature_importance.items():
        feature_list.append(name)
        score_list.append(improtance)

    feature_tmp = feature_list[0:400]
    feature_sample = list(map(int, feature_tmp))

    with open('test_list.pkl', 'rb') as f_test:
        new_test_list = pickle.load(f_test)
    print('The size of the test dataset', len(new_test_list))
    with open('test_label.pickle', 'rb') as f_label:
        labels = pickle.load(f_label)

    test_dataset = Dataset(new_test_list, labels, data_path, feature_sample)
    print(data_path)
    test_generator = data.DataLoader(dataset=test_dataset,
                                     batch_size=batch_size,
                                     shuffle=True)
    mlp = torch.load(model_path)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # our_scores: per-class probabilities, our_target: one-hot labels
    our_scores, our_target, test_accuracy = class_probabilities_test(
        mlp, device, test_generator, n_classes)

    # Per-class ROC curves
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    thresh = dict()
    for i in range(n_classes):
        y_score = np.array(our_scores[:, i])
        y_test = np.array(our_target[:, i])
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_score)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area.
    # Fix: the original raveled y_test/y_score, which at this point only
    # hold the LAST class's column from the loop above; the micro average
    # must flatten the full (n_samples, n_classes) arrays.
    all_targets = np.asarray(our_target).ravel()
    all_scores = np.asarray(our_scores).ravel()
    fpr["micro"], tpr["micro"], _ = roc_curve(all_targets, all_scores)
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    precision = average_precision_score(all_targets, all_scores)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # "wb" truncates, so the previous exists/remove dance was redundant;
    # context managers also guarantee the handles close on error.
    with open(output_path + 'roc_auc.pkl', "wb") as f1:
        pickle.dump(roc_auc, f1)
    with open(output_path + "fpr.pkl", "wb") as f2:
        pickle.dump(fpr, f2)
    with open(output_path + "tpr.pkl", "wb") as f3:
        pickle.dump(tpr, f3)
    with open(output_path + "thresh.pkl", "wb") as f4:
        pickle.dump(thresh, f4)

    return test_accuracy, roc_auc[1], precision
def calculate_var_imports_refits(dataset,
                                 param_grid,
                                 cond_layer_sizes,
                                 var_import_idxs=None,
                                 cv=3):
    """
    Estimate variable importance, assumes we need to refit for each set of variable groups

    @param dataset: Dataset
    @param param_grid: dictionary to CV over, contains all values for initializing NeuralNetworkAugMTL
                    (see docs for GridSearchCV from scikit)
    @param cond_layer_sizes: a list of list of network structures, each list of network structures is what we search over
                            for estimating the reduced conditional means, ordering according to param_grid[0]["var_import_idxs"]
    @param var_import_idxs: index groups to drop; defaults to one group per column
    @param cv: number of folds for Cross validation
    @return tuple with:
        1. list of dicts, in the order of the variable groups in param_grid[0]["var_import_idxs"]
            Each dict contains: {
                "std-True": dict corresponding to naive and one-step estimates of var importance (and conf intervals) for standardized variable importance,
                "std-False": dict corresponding to naive and one-step estimates of var importance (and conf intervals) for not-standardized variable importance,
            }
        2. dict of fitted model parameters and CV results for the full fit
           and each reduced (conditional) fit
    """
    # Pick best parameters via cross validation
    best_params, cv_results = _get_best_params(NeuralNetworkBasic,
                                               param_grid,
                                               dataset,
                                               cv=cv)
    logging.info("Best params %s", str(best_params))

    # Fit for the full conditional mean
    final_nn = NeuralNetworkBasic(**best_params)
    final_nn.fit(dataset.x_train, dataset.y_train)

    # R^2 of the full fit on train and test data
    full_fit = final_nn.predict(dataset.x_train)
    r2_full = 1 - np.sum((dataset.y_train - full_fit)**2) / np.sum(
        (dataset.y_train - np.mean(dataset.y_train))**2)
    full_fit_test = final_nn.predict(dataset.x_test)
    r2_full_pred = 1 - np.sum((dataset.y_test - full_fit_test)**2) / np.sum(
        (dataset.y_test - np.mean(dataset.y_test))**2)

    var_imports = []
    fitted_models = {
        "full": final_nn.model_params,
        "cond": {},
        "cv_results": cv_results,
        "cond_cv_results": {}
    }

    # set up which var importance values to calculate if not passed in
    if var_import_idxs is None:
        var_import_idxs = range(dataset.x_train.shape[1])

    # Get the estimated variable importance for each variable group
    for i, del_idx_group in enumerate(var_import_idxs):
        # Prepare dataset without the particular variables
        cond_x_train = np.delete(dataset.x_train, del_idx_group, axis=1)
        cond_x_test = np.delete(dataset.x_test, del_idx_group, axis=1)
        # NOTE(review): the last argument repeats y_train_true where
        # y_test_true looks intended — confirm against Dataset's signature.
        cond_dataset = Dataset(cond_x_train, dataset.y_train,
                               dataset.y_train_true, cond_x_test,
                               dataset.y_test, dataset.y_train_true)
        # Fix: the original aliased the caller's param_grid and mutated it
        # in place, leaking "layer_sizes" out of this function; work on a
        # per-iteration shallow copy of the grid dicts instead.
        cond_param_grid = [dict(grid_entry) for grid_entry in param_grid]
        cond_param_grid[0]["layer_sizes"] = cond_layer_sizes[i]

        # Fit for reduced conditional means
        best_cond_params, cv_results_cond = _get_best_params(
            NeuralNetworkBasic, cond_param_grid, cond_dataset, cv=cv)
        logging.info("Best cond params %s", str(best_cond_params))
        cond_nn = NeuralNetworkBasic(**best_cond_params)

        # Refit on the reduced covariates
        cond_nn.fit(cond_x_train, dataset.y_train)
        fitted_models["cond"][str(del_idx_group)] = cond_nn.model_params
        fitted_models["cond_cv_results"][str(del_idx_group)] = cv_results_cond

        # Get new fitted values
        small_fit = cond_nn.predict(cond_x_train)
        small_fit_test = cond_nn.predict(cond_x_test)

        ## calculate R^2
        r2_small = 1 - np.sum((dataset.y_train - small_fit)**2) / np.sum(
            (dataset.y_train - np.mean(dataset.y_train))**2)

        ## calculate predicted R^2
        r2_small_pred = 1 - np.sum(
            (dataset.y_test - small_fit_test)**2) / np.sum(
                (dataset.y_test - np.mean(dataset.y_test))**2)
        logging.info("==== %s =======", str(del_idx_group))
        logging.info("r2 small: %f", r2_small)
        logging.info("r2 small pred: %f", r2_small_pred)

        ## calculate estimators both standardized and unstandardized
        var_import_ret = {}
        for std in [True, False]:
            ests = vi.variableImportance(full_fit, small_fit, dataset.y_train,
                                         std)
            naive = np.array([ests[0]])
            onestep = np.array([ests[1]])

            ## calculate standard error for one-step
            onestep_se = se.variableImportanceSE(full_fit, small_fit,
                                                 dataset.y_train, std)

            ## calculate CI for one-step
            onestep_ci = ci.variableImportanceCI(onestep,
                                                 onestep_se,
                                                 level=0.95)

            ret = {
                'naive': np.array(naive),  # naive estimate
                'onestep': onestep,  # one-step estimate
                'onestep.se': onestep_se,  # std error of one-step est
                'onestep.ci': onestep_ci,  # conf int for var import
                'r2.full':
                r2_full,  # R^2 for the full conditional mean on train data
                'r2.small':
                r2_small,  # R^2 for the reduced conditional mean on train data
                'r2.test.full':
                r2_full_pred,  # R^2 for the full conditional mean on test data
                'r2.test.small': r2_small_pred
            }  # R^2 for the reduced conditional mean on test data
            var_import_ret["std-%s" % std] = ret
        var_imports.append(var_import_ret)
    return var_imports, fitted_models
Example #13 (score: 0)
def cv_predictiveness(data,
                      S,
                      measure,
                      pred_func,
                      V=5,
                      stratified=True,
                      na_rm=False,
                      type="regression",
                      ensemble=False,
                      run_cv=False):
    """
    Compute a cross-validated measure of predictiveness based on the data
    and the chosen measure

    @param data: dataset
    @param S: the covariates to fit
    @param measure: measure of predictiveness
    @param pred_func: function that fits to the data
    @param V: the number of CV folds
    @param stratified: should the folds be stratified?
    @param na_rm: should we do a complete-case analysis (True) or not (False)
    @param type: is this regression (use predict) or classification (use predict_proba)?
    @param ensemble: is this an ensemble (True) or not (False)?
    @param run_cv: unused in this function's body — confirm intent

    @return cross-validated measure of predictiveness, along with preds and ics
    """
    import numpy as np
    from compute_ic import compute_ic
    import utils as uts
    from data_generator import Dataset
    ## if na_rm = True, do a complete-case analysis
    if na_rm:
        xs = data.x_train[:, S]
        # complete-case mask: rows with no NaN among the selected covariates
        cc = np.sum(np.isnan(xs), axis=1) == 0
        newdata = Dataset(x_train=data.x_train[cc, :],
                          y_train=data.y_train[cc])
    else:
        cc = np.repeat(True, data.x_train.shape[0])
        newdata = data
    ## set up CV folds
    folds = uts.make_folds(newdata, V, stratified=stratified)
    ## do CV
    # preds/ics are sized to the ORIGINAL data; rows dropped by the
    # complete-case filter keep their NaN fill value.
    preds = np.empty((data.y_train.shape[0], ))
    preds.fill(np.nan)
    ics = np.empty((data.y_train.shape[0], ))
    ics.fill(np.nan)
    # preds = np.empty((newdata.y_train.shape[0],))
    # vs holds one predictiveness value per fold
    vs = np.empty((V, ))
    # ics = np.empty((newdata.y_train.shape[0],))
    # cc_cond maps complete-case row positions back to original row indices
    cc_cond = np.flatnonzero(cc)
    for v in range(V):
        # positions (within the complete cases) of this fold's rows
        fold_cond = np.flatnonzero(folds == v)
        x_train, y_train = newdata.x_train[folds != v, :], newdata.y_train[
            folds != v]
        x_test, y_test = newdata.x_train[folds == v, :], newdata.y_train[folds
                                                                         == v]
        pred_func.fit(x_train[:, S], np.ravel(y_train))
        if ensemble:
            # NOTE(review): np.mean collapses to a scalar that is then
            # broadcast over the fold's preds slots — confirm intended.
            preds_v = np.mean(pred_func.transform(x_test[:, S]))
        else:
            if type == "classification":
                preds_v = pred_func.predict_proba(x_test[:, S])[:, 1]
            else:
                preds_v = pred_func.predict(x_test[:, S])
        # scatter this fold's predictions back into original row positions
        preds[cc_cond[fold_cond]] = preds_v
        vs[v] = measure(y_test, preds_v)
        # influence-curve contributions for this fold's rows
        ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
    return np.mean(vs), preds, ics, folds