Example No. 1
def train(args, params):

    print("Loading training set...")
    train = load.load_dataset(params['train'])
    print("Loading dev set...")
    dev = load.load_dataset(params['dev'])
    print("Building preprocessor...")
    preproc = load.Preproc(*train)
    print("train_set_classes:", preproc.classes)
    print("Training size: " + str(len(train[0])) + " examples.")
    print("Dev size: " + str(len(dev[0])) + " examples.")

    save_dir = make_save_dir(params['save_dir'], args.experiment)

    util.save(preproc, save_dir)

    params.update({
        "input_shape": [None, 1],
        "num_categories": len(preproc.classes)
    })

    model = network.build_network(**params)

    stopping = keras.callbacks.EarlyStopping(patience=30)

    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        factor=0.1, patience=2, min_lr=params["learning_rate"] * 0.001)

    checkpointer = keras.callbacks.ModelCheckpoint(
        filepath=get_filename_for_saving(save_dir), save_best_only=False)

    batch_size = params.get("batch_size", 32)

    # summary = str(model.summary(print_fn=lambda x: fh.write(x + '\n')))
    # out = open("/content/ecg/report.txt",'w')
    # out.write(summary)
    # out.close()

    if params.get("generator", False):
        train_gen = load.data_generator(batch_size, preproc, *train)
        dev_gen = load.data_generator(batch_size, preproc, *dev)
        model.fit_generator(train_gen,
                            steps_per_epoch=int(len(train[0]) / batch_size),
                            epochs=MAX_EPOCHS,
                            validation_data=dev_gen,
                            validation_steps=int(len(dev[0]) / batch_size),
                            callbacks=[checkpointer, reduce_lr, stopping])
        # util.learning_curve(history)

    else:
        train_x, train_y = preproc.process(*train)
        dev_x, dev_y = preproc.process(*dev)
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  epochs=MAX_EPOCHS,
                  validation_data=(dev_x, dev_y),
                  callbacks=[checkpointer, reduce_lr, stopping])
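
For orientation, here is a minimal, hypothetical driver for a train(args, params) function like the one above. The config filename, JSON format, and the --experiment flag are assumptions; only the keys passed through ('train', 'dev', 'save_dir', 'learning_rate', 'batch_size', 'generator') mirror what the example actually reads.

import argparse
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("config_file",
                        help="path to a JSON config with train/dev/save_dir keys")
    parser.add_argument("--experiment", default="default",
                        help="experiment name used for the save directory")
    args = parser.parse_args()

    with open(args.config_file, "r") as fid:
        # e.g. {"train": "...", "dev": "...", "save_dir": "...", "learning_rate": 0.001}
        params = json.load(fid)

    train(args, params)
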
Example No. 2
def train(args, params):

    print("Loading training set...")
    train = load.load_dataset(params['train'])
    print("Loading dev set...")
    dev = load.load_dataset(params['dev'])
    print("Building preprocessor...")
    preproc = load.Preproc(*train)
    print("Training size: " + str(len(train[0])) + " examples.")
    print("Dev size: " + str(len(dev[0])) + " examples.")

    save_dir = make_save_dir(params['save_dir'], args.experiment)

    util.save(preproc, save_dir)

    params.update({
        "input_shape": [None, 1],
        "num_categories": len(preproc.classes)
    })
    print(params)

    model = network.build_network(**params)

    stopping = keras.callbacks.EarlyStopping(patience=8)

    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        factor=0.1, patience=2, min_lr=params["learning_rate"] * 0.001)

    checkpointer = keras.callbacks.ModelCheckpoint(
        filepath=get_filename_for_saving(save_dir), save_best_only=False)
    ckpt_best = keras.callbacks.ModelCheckpoint(
        os.path.join(save_dir, 'best.hdf5'), save_best_only=True)

    batch_size = params.get("batch_size", 32)

    if params.get("generator", False):
        train_gen = load.data_generator(batch_size, preproc, *train)
        dev_gen = load.data_generator(batch_size, preproc, *dev)
        model.fit_generator(
            train_gen,
            steps_per_epoch=int(len(train[0]) / batch_size),
            epochs=MAX_EPOCHS,
            validation_data=dev_gen,
            validation_steps=int(len(dev[0]) / batch_size),
            callbacks=[checkpointer, ckpt_best, reduce_lr, stopping])
    else:
        train_x, train_y = preproc.process(*train)
        dev_x, dev_y = preproc.process(*dev)
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  epochs=MAX_EPOCHS,
                  validation_data=(dev_x, dev_y),
                  callbacks=[checkpointer, ckpt_best, reduce_lr, stopping])
Example No. 3
def train(args, params):

    print("Loading training set...")
    train = load.load_dataset(params['train'])
    print("Loading dev set...")
    dev = load.load_dataset(params['dev'])
    print("Building preprocessor...")
    preproc = load.Preproc(*train)
    print("Training size: " + str(len(train[0])) + " examples.")
    print("Dev size: " + str(len(dev[0])) + " examples.")


    save_dir = make_save_dir(params['save_dir'], args.experiment)

    util.save(preproc, save_dir)

    params.update({
        "input_shape": [None, 1],
        "num_categories": len(preproc.classes)
    })

    model = network.build_network(**params)

    stopping = keras.callbacks.EarlyStopping(patience=8)

    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        factor=0.1,
        patience=2,
        min_lr=params["learning_rate"] * 0.001)

    checkpointer = keras.callbacks.ModelCheckpoint(
        filepath=get_filename_for_saving(save_dir),
        save_best_only=False)

    batch_size = params.get("batch_size", 32)

    if params.get("generator", False):
        train_gen = load.data_generator(batch_size, preproc, *train)
        dev_gen = load.data_generator(batch_size, preproc, *dev)
        model.fit_generator(
            train_gen,
            steps_per_epoch=int(len(train[0]) / batch_size),
            epochs=MAX_EPOCHS,
            validation_data=dev_gen,
            validation_steps=int(len(dev[0]) / batch_size),
            callbacks=[checkpointer, reduce_lr, stopping])
    else:
        train_x, train_y = preproc.process(*train)
        dev_x, dev_y = preproc.process(*dev)
        model.fit(
            train_x, train_y,
            batch_size=batch_size,
            epochs=MAX_EPOCHS,
            validation_data=(dev_x, dev_y),
            callbacks=[checkpointer, reduce_lr, stopping])
Example No. 4
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)

    x, y = preproc.process(*dataset)
    y_test = []
    for e, i in enumerate(dataset[1]):
        for j in range(len(i)):
            y_test.append(y[e, j, :])
    y_result = np.array(y_test)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)
    #update start
    y_test = []
    y_predict = []
    for e, i in enumerate(dataset[1]):
        for j in range(len(i)):
            y_test.append(y[e, j, :])
            y_predict.append(probs[e, j, :])
    y_test = np.array(y_test)
    y_predict = np.array(y_predict)
    #update stop

    return y_test, y_predict
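
The arrays returned above hold one-hot targets and predicted probabilities per frame. A hedged post-processing sketch (sklearn.metrics is an addition, not part of the example) that collapses both to class indices and prints a frame-level report:

import numpy as np
from sklearn.metrics import classification_report

def report_frame_level(y_test, y_predict):
    # Both arrays have shape (num_frames, num_classes); argmax yields class indices.
    true_idx = np.argmax(y_test, axis=1)
    pred_idx = np.argmax(y_predict, axis=1)
    print(classification_report(true_idx, pred_idx))
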
Example No. 5
def eval_model(model_path):
    """
    测试model准确率
    """

    eval_data, eval_labels = load_dataset(PATH)

    # Create the model
    cifar10_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                                model_dir=model_path)

    # Evaluate the model and print the results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x=eval_data,
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=True)

    eval_results = cifar10_classifier.evaluate(input_fn=eval_input_fn)

    print("----------------------------------------\n\
    总共训练步数:{g_step}\n\
    测试图片数量: {num}\n\
    loss 值: {loss:0.4f}\n\
    识别准确率: {accuracy:0.2f}%\
    \n----------------------------------------\n".format(
        g_step=eval_results["global_step"],
        loss=eval_results["loss"],
        num=eval_data.shape[0],
        accuracy=eval_results["accuracy"] * 100))
Example No. 6
def do_confusion():
    ''' generate and print a cross-validated confusion matrix'''
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import KFold
    from sklearn.metrics import confusion_matrix
    
    # load data
    features, labels = load_dataset('seeds')
    
    # create a sklearn knn classifier
    classifier = KNeighborsClassifier(n_neighbors = 4)
    
    # create a pipeline with prescaler + classifier
    classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
    
    kf = KFold(len(features), n_folds = 3, shuffle=True)
    
    names = list(set(labels))
    labels = np.array([names.index(ell) for ell in labels])
    preds = labels.copy()
    preds[:] = -1
    for train, test in kf:
        classifier.fit(features[train], labels[train])
        preds[test] = classifier.predict(features[test])
        
    cmat = confusion_matrix(labels, preds)
    print('Confusion matrix [rows represent true outcome, columns = predicted outcome]')
    print(cmat)
    
    # the explicit float() conversion is necessary in Python 2 (otherwise, result is rounded to 0)
    acc = cmat.trace()/float(cmat.sum())
    print('Accuracy: {0:.1%}'.format(acc))
Example No. 7
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    return probs
Example No. 8
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    return probs
Example No. 9
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    # cj: added for debugging
    predicted = np.argmax(probs, axis=2)  # index of the most probable class per frame (last axis)

    return probs
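
The debug line above reduces the (examples, frames, classes) probability tensor to per-frame class indices. A small self-contained sketch of the same idea, with a made-up label list standing in for preproc.classes:

import numpy as np

classes = ["A", "N", "O", "~"]              # hypothetical label order
probs = np.random.rand(2, 5, len(classes))  # (examples, frames, classes)
predicted_idx = probs.argmax(axis=2)        # most probable class per frame
predicted_labels = [[classes[i] for i in row] for row in predicted_idx]
print(predicted_labels)
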
Example No. 10
    def load_data(self):
        data_dir = self.config.data_dir
        batch_size = self.config.batch_size
        x_train, y_train, x_val, y_val, x_test, y_test = l.load_dataset(
            data_dir)
        print("data loaded successfully...")

        # number of iterations to go through entire training set
        self.train_data = {'x': x_train, 'y': y_train}
        self.train_iterations_per_epoch = (x_train.shape[0] + batch_size -
                                           1) // batch_size

        print("x_training shape : ", x_train.shape[0])
        print("y_training shape : ", y_train.shape[0])
        print("num of iterations on training data in one epoch : ",
              self.train_iterations_per_epoch)

        #####################################################

        self.val_data = {'x': x_val, 'y': y_val}

        self.val_iterations_per_epoch = (x_val.shape[0] + batch_size -
                                         1) // batch_size

        print("x_validation shape : ", x_val.shape[0])
        print("y_validation shape : ", y_val.shape[0])
        print("num of iterations on validation data in one epoch : ",
              self.val_iterations_per_epoch)

        ########################################################

        self.test_data = {'x': x_test, 'y': y_test}

        # iterations to go through test data, +1 if data size not divisible by batch size
        self.test_iterations_per_epoch = (x_test.shape[0] + batch_size -
                                          1) // batch_size

        print("x_test shape : ", x_test.shape[0])
        print("y_test shape : ", y_test.shape[0])
        print("num of iterations on test data in one epoch : ",
              self.test_iterations_per_epoch)

        print("data loading complete ...\n")
Example No. 11
def run_evaluation(dataset_path, predictors, additional_roots=None, max_number_of_queries=None, folds_num=5,
                   evaluation_functions=(('precision', 1), ('precision', 3), ('precision', 5), ('ndcg', 1),
                                         ('ndcg', 3), ('ndcg', 5), ('dcg', 1), ('dcg', 3), ('dcg', 5))):

    evaluation_results = [np.zeros(len(evaluation_functions)) for i in range(len(predictors))]

    for fold in load.load_dataset(dataset_path, additional_roots, max_number_of_queries, folds_num):
        (x_train, y_train, id_train), (x_test, y_test, id_test) = fold

        for index_predictor, predictor in enumerate(predictors):
            # sys.stderr.write(predictor.get_name() + '\n')
            # sys.stderr.flush()

            y_pred = predictor.learn_predict(x_train, y_train, x_test)

            for index_function, (func_type, rank) in enumerate(evaluation_functions):
                evaluation_results[index_predictor][index_function] += Evaluate.mean(func_type, rank,
                                                                                     y_test, y_pred, id_test)

    evaluation_results = [result / folds_num for result in evaluation_results]
    return evaluation_results
Example No. 12
def predict(data_json, model_path):
    preproc = util.load(os.path.dirname(model_path))
    dataset = load.load_dataset(data_json)
    x, y = preproc.process(*dataset)

    model = keras.models.load_model(model_path)
    probs = model.predict(x, verbose=1)

    # evaluate the model
    score = model.evaluate(x, y)

    length_frames = []
    length_predicts = len(dataset[1])

    for length_predict in range(length_predicts):
        length_frames.append(len(dataset[1][length_predict]))

    predict_class = evaluate(probs, length_predicts, length_frames)

    print("The model {} is : {:.2%}".format(model.metrics_names[1], score[1]))

    return probs, predict_class
Example No. 13
def do_confusion():
    ''' generate and print a cross-validated confusion matrix'''
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import KFold
    from sklearn.metrics import confusion_matrix

    # load data
    features, labels = load_dataset('seeds')

    # create a sklearn knn classifier
    classifier = KNeighborsClassifier(n_neighbors=4)

    # create a pipeline with prescaler + classifier
    classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])

    kf = KFold(len(features), n_folds=3, shuffle=True)

    names = list(set(labels))
    labels = np.array([names.index(ell) for ell in labels])
    preds = labels.copy()
    preds[:] = -1
    for train, test in kf:
        classifier.fit(features[train], labels[train])
        preds[test] = classifier.predict(features[test])

    cmat = confusion_matrix(labels, preds)
    print(
        'Confusion matrix [rows represent true outcome, columns = predicted outcome]'
    )
    print(cmat)

    # the explicit float() conversion is necessary in Python 2 (otherwise, result is rounded to 0)
    acc = cmat.trace() / float(cmat.sum())
    print('Accuracy: {0:.1%}'.format(acc))
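
sklearn.cross_validation, used in both versions of do_confusion above, was removed in scikit-learn 0.20. A hedged port of the same routine to sklearn.model_selection, assuming the same load_dataset('seeds') helper:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

def do_confusion_modern():
    features, labels = load_dataset('seeds')
    classifier = Pipeline([('norm', StandardScaler()),
                           ('knn', KNeighborsClassifier(n_neighbors=4))])

    names = list(set(labels))
    labels = np.array([names.index(ell) for ell in labels])
    preds = np.full_like(labels, -1)

    # KFold now takes n_splits and yields index pairs from .split()
    kf = KFold(n_splits=3, shuffle=True)
    for train, test in kf.split(features):
        classifier.fit(features[train], labels[train])
        preds[test] = classifier.predict(features[test])

    cmat = confusion_matrix(labels, preds)
    print('Confusion matrix [rows = true outcome, columns = predicted outcome]')
    print(cmat)
    print('Accuracy: {0:.1%}'.format(cmat.trace() / cmat.sum()))
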
Example No. 14
from __future__ import print_function
import numpy as np
from load import load_dataset
# sklearn implementation of knn
from sklearn.neighbors import KNeighborsClassifier

# load data
features, labels = load_dataset('seeds')


def leave_one_out():

    # create a sklearn knn classifier
    classifier = KNeighborsClassifier(n_neighbors=4)

    n = len(features)
    correct = 0.0

    # leave-one-out training
    for ignorefeat in range(n):
        training = np.ones(n, bool)
        # leave out
        training[ignorefeat] = 0
        testing = ~training

        # fit
        classifier.fit(features[training], labels[training])

        # predict
        prediction = classifier.predict(features[ignorefeat])
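
The leave-one-out example is truncated before the accuracy is tallied. A self-contained sketch of the full loop under the same load_dataset('seeds') assumption; the reshape(1, -1) call is added so current scikit-learn accepts a single test sample:

import numpy as np
from load import load_dataset
from sklearn.neighbors import KNeighborsClassifier

def leave_one_out_accuracy():
    features, labels = load_dataset('seeds')
    classifier = KNeighborsClassifier(n_neighbors=4)

    n = len(features)
    correct = 0.0
    for ignorefeat in range(n):
        training = np.ones(n, bool)
        training[ignorefeat] = 0  # hold out a single sample
        classifier.fit(features[training], labels[training])
        prediction = classifier.predict(features[ignorefeat].reshape(1, -1))
        correct += float(prediction[0] == labels[ignorefeat])

    print('Leave-one-out accuracy: {0:.1%}'.format(correct / n))
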
Example No. 15
args = parser.parse_args()

## CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Setup
eval_dir = os.path.join(
    args.model_dir, f'trainsamples{args.trainsamples}'
    f'_testsamples{args.testsamples}'
    f'_translatetrain{args.translatetrain}'
    f'_translatetest{args.translatetest}')
params = utils.load_params(args.model_dir)

## Data
trainset, testset, num_classes = L.load_dataset(params['data'],
                                                data_dir=params['data_dir'])
X_train, y_train = F.get_samples(trainset, args.trainsamples)
X_test, y_test = F.get_samples(testset, args.testsamples)
if args.translatetrain:
    X_train, y_train = F.translate(X_train, y_train, stride=7)
if args.translatetest:
    X_test, y_test = F.translate(X_test, y_test, stride=7)
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

## Architecture
net = L.load_architecture(params['data'], params['arch'])
net = utils.load_ckpt(args.model_dir, 'model', net)
net = net.to(device)

## Forward
Example No. 16
eval.cleanfile("confusion")
eval.cleanfile("measurement")

LOGISTIC_REGRESSION = False
if len(sys.argv) > 1:
    if ('L' in sys.argv[1:]):
        LOGISTIC_REGRESSION = True

if LOGISTIC_REGRESSION:
    print 'LogisticRegression is Active'
else:
    print 'NB is Active'

print 'Begin Loading samples...'
train_samples, train_target = load.load_dataset(fname=load.filename['TRAIN'],
                                                numdocs=None)
#dev_samples,dev_target = load.load_dataset(fname=load.filename['DEV'],numdocs=None);
print 'number of training sample %d' % len(train_target)
print 'Tags for the last train example', train_target[-1]

#Classifier Model
classifyers = []
classes = []
for each in train_target:
    classes.extend(x for x in each)
classes = set(classes)
print 'Total number of classes for this model ', len(classes)

class_example_count = []
for each in classes:
    Y = [1 if each in x else 0 for x in train_target]
Example No. 17
# gpu-1 adam0.001 reg0.001 a1.0
model_folder_path = "./saved_res_bn/cinc17"
arr = os.listdir(model_folder_path)
arr = sorted(arr)
last_folder = arr[-1]
model_folder_path = "{}/{}/*.hdf5".format(model_folder_path, last_folder)
arr_file = sorted(glob.glob(model_folder_path))
print('arr_file', arr_file)
file_name = arr_file[0]
model_path = file_name
print('Model Path : ', model_path)
# exit()
# model_path = "../../../saved_res_nobn/cinc17/1609222106-676/14.899-0.302-001-16.664-0.284.hdf5"

data = load.load_dataset(data_path)
preproc = util.load(os.path.dirname(model_path))
print('preproc window size : ', preproc.window_size)


class ScaleLayer(Layer):
    def __init__(self, alpha=0):
        super(ScaleLayer, self).__init__()
        self.alpha = alpha
        self.scale = K.variable(self.alpha, dtype='float32', name='alpha')

    def get_config(self):
        return {"alpha": self.alpha}

    def call(self, inputs):
        return inputs * self.scale
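
Since ScaleLayer is a custom Keras layer, reloading the checkpoint selected above presumably requires registering it through custom_objects; a hedged sketch (the tf.keras import path is an assumption, as the example's own imports are not shown):

from tensorflow import keras

model = keras.models.load_model(model_path,
                                custom_objects={'ScaleLayer': ScaleLayer})
model.summary()
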
Example No. 18
def run_trained_flow(save_folder, data_folder, data_fname):
    print('--------------------')
    print('Train GLOW model ...')
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch GLOW')

    parser.add_argument(
        '--batch-size',
        type=int,
        default=100,
        help='input batch size for training (default: 100)')

    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1,
        help='input batch size for testing (default: 1)')

    parser.add_argument(
        '--epochs',
        type=int,
        default=2000,
        help='number of epochs to train (default: 2000)')

    parser.add_argument(
        '--lr', type=float, default=1e-5, help='learning rate (default: 1e-5)')

    parser.add_argument(
        '--no-cuda',
        action='store_true',
        default=False,
        help='disables CUDA training')

    parser.add_argument(
        '--num-blocks',
        type=int,
        default=9,
        help='number of invertible blocks (default: 9)')

    parser.add_argument(
        '--num-hidden',
        type=int,
        default=256,
        help='number of hidden layer neurons')

    parser.add_argument(
        '--num-inputs',
        type=int,
        default=24,
        help='look-ahead horizon of forecasting')

    parser.add_argument(
        '--num-cond-inputs',
        type=int,
        default=24,
        help='length of historical data')

    parser.add_argument(
        '--seed', type=int, default=1, help='random seed (default: 1)')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")


    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}

    try:
        os.makedirs(save_folder)
    except OSError:
        pass

    # Load training_subset, valid_set and test_set
    # Just one split: 1-fold
    training_subset, valid_set, test_set = load.load_dataset(data_folder, data_fname)
    print('Training subset size:', training_subset.N)
    print('Validation set size:', valid_set.N)
    print('Test set size:', test_set.N)

    # Load point estimate
    #pred_on_train, pred_on_valid, pred_on_test = load.load_point_estimates(data_folder)

    # Transform to torch.Tensor
    # train_tensor = torch.from_numpy(training_subset.X)
    new_training_subset = np.concatenate((training_subset.y, training_subset.X),-1)
    #new_training_subset = np.concatenate((training_subset.y, pred_on_train),-1)
    mu = new_training_subset.mean()
    std = new_training_subset.std()
    print('Mean of new train set:',mu)
    print('Std of new train set:',std)

    train_tensor = torch.from_numpy((training_subset.X-mu)/std)
    #train_tensor= torch.from_numpy((pred_on_train-mu)/std)
    train_labels = torch.from_numpy((training_subset.y-mu)/std)
    train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels)

    valid_tensor = torch.from_numpy((valid_set.X-mu)/std)
    #valid_tensor = torch.from_numpy((pred_on_valid-mu)/std)
    valid_labels = torch.from_numpy((valid_set.y-mu)/std)
    valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels)

    test_tensor = torch.from_numpy((test_set.X-mu)/std)
    #test_tensor = torch.from_numpy((pred_on_test-mu)/std)
    test_labels = torch.from_numpy((test_set.y-mu)/std)
    test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels)

    train_loader = torch.utils.data.DataLoader(
                                        train_dataset, 
                                        batch_size=args.batch_size, 
                                        shuffle=False, 
                                        **kwargs)

    valid_loader = torch.utils.data.DataLoader(
                                        valid_dataset,
                                        batch_size=args.test_batch_size,
                                        shuffle=False,
                                        drop_last=False,
                                        **kwargs)

    test_loader = torch.utils.data.DataLoader(
                                        test_dataset,
                                        batch_size=args.test_batch_size,
                                        shuffle=False,
                                        drop_last=False,
                                        **kwargs)

    num_inputs = args.num_inputs
    num_cond_inputs = args.num_cond_inputs
    num_hidden = args.num_hidden 

    def build_model():
        modules = []

        mask = torch.arange(0, num_inputs) % 2
        #mask = torch.ones(num_inputs)
        #mask[round(num_inputs/2):] = 0
        mask = mask.to(device).float()

        # build each modules
        for _ in range(args.num_blocks):
            modules += [
                fnn.ActNorm(num_inputs),
                fnn.LUInvertibleMM(num_inputs),
                fnn.CouplingLayer(
                    num_inputs, num_hidden, mask, num_cond_inputs,
                    s_act='tanh', t_act='relu')
                ]
            mask = 1 - mask

        # build model
        model = fnn.FlowSequential(*modules)

        # initialize
        for module in model.modules():
            if isinstance(module, nn.Linear):
                nn.init.orthogonal_(module.weight)
                if hasattr(module, 'bias') and module.bias is not None:
                    module.bias.data.fill_(0)

        model.to(device)

        return model

    model = build_model()
    # Save trained model
    #torch.save(best_model, save_folder+'best_model.pt')
    #model = torch.load(save_folder+'best_model.pt',
        #map_location=lambda storage, loc: storage)
    model = torch.load(save_folder+'best_model.pt')

    def calculate_dist(true, generated):
        distance = {}
        for t in range(generated.shape[1]):
        
            y = true[t]
            y_hat = generated[:,t]
        
            dist = []
            for p in range(50, 101, 1):
                if p == 50:
                    median = stats.scoreatpercentile(y_hat, p)
                    dist.append(np.abs(y - median))
                else:
                    pl = 100 - p 
                    pu = p
                    l = stats.scoreatpercentile(y_hat, pl)
                    u = stats.scoreatpercentile(y_hat, pu)
                    
                    if y <= u and y >= l:
                        dist.append(0.0)
                    elif y < l:
                        dist.append(np.abs(y - l))
                    else:
                        dist.append(np.abs(y - u))
                        
            # distance for each hour t            
            dist = np.array(dist)
            distance[t] = dist

        series = pd.DataFrame.from_dict(distance)
        series = series.mean(axis = 1)

        return series.values

    def test(model, test_loader):
        model.eval()
        median_pred = []
        ground_truth = []
        point_pred = []
        pi_1 = []
        pi_99 = []
        pi_5 = []
        pi_95 = []
        pi_15 = []
        pi_85 = []
        pi_25 = []
        pi_75 = []
        distance = {}
        for index, data in enumerate(test_loader):
            #if index == 2: break
            inputs = data[0]
            cond_inputs = data[1]

            with torch.no_grad():
                cond_inputs_ = cond_inputs.view(-1,num_cond_inputs) * torch.ones([5000,num_cond_inputs])
                yt_hat =  model.sample(5000, cond_inputs = cond_inputs_).detach().cpu().numpy()
                
                #test_data = test_set.X[index,:].flatten()
                input_data = inputs.detach().numpy().flatten()
                cond_data = cond_inputs.detach().numpy().flatten()
        
                input_data = input_data*std + mu
                cond_data = cond_data*std + mu
                synth = yt_hat*std + mu
            median = stats.scoreatpercentile(synth, 50, axis = 0)
            percentile1 = stats.scoreatpercentile(synth, 1, axis = 0) 
            percentile99 = stats.scoreatpercentile(synth, 99, axis = 0) 
            percentile5 = stats.scoreatpercentile(synth, 5, axis = 0) 
            percentile95 = stats.scoreatpercentile(synth, 95, axis = 0)
            percentile15 = stats.scoreatpercentile(synth, 15, axis = 0) 
            percentile85 = stats.scoreatpercentile(synth, 85, axis = 0)
            percentile25 = stats.scoreatpercentile(synth, 25, axis = 0) 
            percentile75 = stats.scoreatpercentile(synth, 75, axis = 0)
            if index == 0:
                median_pred = median
                ground_truth = input_data
                pi_1 = percentile1
                pi_99 = percentile99
                pi_5 = percentile5
                pi_95 = percentile95
                pi_15 = percentile15
                pi_85 = percentile85
                pi_25 = percentile25
                pi_75 = percentile75
            else:
                median_pred = np.concatenate((median_pred, median))
                ground_truth = np.concatenate((ground_truth, input_data))
                pi_1 = np.concatenate((pi_1, percentile1))
                pi_99 = np.concatenate((pi_99, percentile99))
                pi_5 = np.concatenate((pi_5, percentile5))
                pi_95 = np.concatenate((pi_95, percentile95))
                pi_15 = np.concatenate((pi_15, percentile15))
                pi_85 = np.concatenate((pi_85, percentile85))
                pi_25 = np.concatenate((pi_25, percentile25))
                pi_75 = np.concatenate((pi_75, percentile75))
        
            # distance of test data {index} averaged over 24 hours
            distance[index] = calculate_dist(input_data, synth)

        GLOW_pred_dict = {}
        GLOW_pred_dict['median_pred'] = median_pred
        GLOW_pred_dict['ground_truth'] = ground_truth
        GLOW_pred_dict['pi1'] = pi_1
        GLOW_pred_dict['pi99'] = pi_99
        GLOW_pred_dict['pi5'] = pi_5
        GLOW_pred_dict['pi95'] = pi_95
        GLOW_pred_dict['pi15'] = pi_15
        GLOW_pred_dict['pi85'] = pi_85
        GLOW_pred_dict['pi25'] = pi_25
        GLOW_pred_dict['pi75'] = pi_75
        # Save GLOW_pred_dict as .csv file
        GLOW_pred = pd.DataFrame.from_dict(GLOW_pred_dict)
        GLOW_pred.to_csv(save_folder+'GLOW_pred.csv')


        GLOW_distance = pd.DataFrame.from_dict(distance)
        GLOW_distance.to_csv(save_folder+'GLOW_distance.csv')
        #series = series.mean(axis = 1)

        #GLOW_distance = series.values 
        # Save GLOW_distance as an array
        #np.save(save_folder+'GLOW_distance.npy', GLOW_distance)

        return None

    test(model, test_loader)
Example No. 19
        print "File " + filename + " does not exist!"
        sys.exit(0)
    else:
        with f:
            loaded_obj = cPickle.load(f)
    return loaded_obj

parser2 = argparse.ArgumentParser()
parser2.add_argument("teX", help="Provide filename for test dataset you want to use (reads). It should have been in 'media/'"
                                 "directory and filename should end with '-teX.fasta.gz'", type=str)
parser2.add_argument("best_model", help="Provide filename for the best model. Filename must include directory. Must be of"
                                        "format 'best_model_with_params-[timestamp].pkl'.", type=str)
parser2.add_argument("-teY", help="Provide filename for test dataset you want to use (classes). It should have been in 'media/'"
                                   "directory and filename should end with '-teY.fasta.gz'", type=str)
results = parser2.parse_args()
teX = np.asarray(load_dataset(results.teX))
teX = teX.reshape(-1, 1, 1, teX.shape[1])
teY = load_dataset(results.teY)
best_model = results.best_model

# teX_filename = "media/2114bef791b6111f12575439a7bbed73_4_0.200_100_1_0_20-teX.fasta.gz"
# teY_filename = "media/2114bef791b6111f12575439a7bbed73_4_0.200_100_1_0_20-teY.fasta.gz"
# model_filename = "models/best_model_with_params-1468304923-improving-eval.pkl"
# teX = np.asarray(load_dataset(teX_filename))
# teX = teX.reshape(-1, 1, 1, teX.shape[1])
# teY = np.asarray(load_dataset(teY_filename))
# best_model = model_filename

# initialize matrices
X = T.ftensor4()
Y = T.fmatrix()
Example No. 20
import load as ld
from lasagne import layers
from lasagne.nonlinearities import softmax
from nolearn.lasagne import NeuralNet
import cPickle as pickle
from nolearn.lasagne import BatchIterator

path = '/home/tg/Documents/bogo/converted_database/'

X, Y = ld.load_dataset(folder_path = path, filter_input = True, max_number_states = 10)

_, num_features, size, _ = X.shape

print X.shape
# exit(0)

NUM_FILTERS = 32

net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        # ('pad1', layers.shape.PadLayer),
        ('conv1', layers.Conv2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('hidden4', layers.DenseLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape = (None, num_features, size, size),
    conv1_num_filters = NUM_FILTERS,
    conv1_filter_size = (5,5),
    conv2_num_filters = NUM_FILTERS,
Example No. 21
def train_W_flow(save_folder, data_folder, data_fname):
    print('--------------------')
    print('Train GLOW model ...')
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch GLOW')

    parser.add_argument('--batch-size',
                        type=int,
                        default=100,
                        help='input batch size for training (default: 100)')

    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1,
                        help='input batch size for testing (default: 1)')

    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        help='number of epochs to train (default: 2)')

    parser.add_argument('--lr',
                        type=float,
                        default=1e-5,
                        help='learning rate (default: 1e-5)')

    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    parser.add_argument('--num-blocks',
                        type=int,
                        default=9,
                        help='number of invertible blocks (default: 9)')

    parser.add_argument('--num-hidden',
                        type=int,
                        default=256,
                        help='number of hidden layer neurons')

    parser.add_argument('--num-inputs',
                        type=int,
                        default=24,
                        help='look-ahead horizon of forecasting')

    parser.add_argument('--num-cond-inputs',
                        type=int,
                        default=24,
                        help='length of historical data')

    parser.add_argument(
        '--weight',
        type=int,
        default=1,
        help='trade off KL-divergence and Wasserstein distance')

    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}

    try:
        os.makedirs(save_folder)
    except OSError:
        pass

    # Load training_subset, valid_set and test_set
    # Just one split: 1-fold
    training_subset, valid_set, test_set = load.load_dataset(
        data_folder, data_fname)
    print('Training subset size:', training_subset.N)
    print('Validation set size:', valid_set.N)
    print('Test set size:', test_set.N)

    # Load point estimate
    #pred_on_train, pred_on_valid, pred_on_test = load.load_point_estimates(data_folder)

    # Transform to torch.Tensor
    # train_tensor = torch.from_numpy(training_subset.X)
    new_training_subset = np.concatenate(
        (training_subset.y, training_subset.X), -1)
    #new_training_subset = np.concatenate((training_subset.y, pred_on_train),-1)
    mu = new_training_subset.mean()
    std = new_training_subset.std()
    print('Mean of new train set:', mu)
    print('Std of new train set:', std)

    train_tensor = torch.from_numpy((training_subset.X - mu) / std)
    #train_tensor= torch.from_numpy((pred_on_train-mu)/std)
    train_labels = torch.from_numpy((training_subset.y - mu) / std)
    train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels)

    valid_tensor = torch.from_numpy((valid_set.X - mu) / std)
    #valid_tensor = torch.from_numpy((pred_on_valid-mu)/std)
    valid_labels = torch.from_numpy((valid_set.y - mu) / std)
    valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels)

    test_tensor = torch.from_numpy((test_set.X - mu) / std)
    #test_tensor = torch.from_numpy((pred_on_test-mu)/std)
    test_labels = torch.from_numpy((test_set.y - mu) / std)
    test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               **kwargs)

    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.test_batch_size,
                                               shuffle=False,
                                               drop_last=False,
                                               **kwargs)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              drop_last=False,
                                              **kwargs)

    num_inputs = args.num_inputs
    num_cond_inputs = args.num_cond_inputs
    num_hidden = args.num_hidden

    def build_model():
        modules = []

        mask = torch.arange(0, num_inputs) % 2
        #mask = torch.ones(num_inputs)
        #mask[round(num_inputs/2):] = 0
        mask = mask.to(device).float()

        # build each modules
        for _ in range(args.num_blocks):
            modules += [
                fnn.ActNorm(num_inputs),
                fnn.LUInvertibleMM(num_inputs),
                fnn.CouplingLayer(num_inputs,
                                  num_hidden,
                                  mask,
                                  num_cond_inputs,
                                  s_act='tanh',
                                  t_act='relu')
            ]
            mask = 1 - mask

        # build model
        model = fnn.FlowSequential(*modules)

        # initialize
        for module in model.modules():
            if isinstance(module, nn.Linear):
                nn.init.orthogonal_(module.weight)
                if hasattr(module, 'bias') and module.bias is not None:
                    module.bias.data.fill_(0)

        model.to(device)

        return model

    model = build_model()
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6)

    # Build Discriminator
    class Flatten(nn.Module):
        def forward(self, input):
            return input.view(input.size()[0], -1)

    class Reshape(nn.Module):
        def forward(self, input):
            return input.view(input.size()[0], 1, -1)

    class Discriminator(nn.Module):
        def __init__(self):
            super(Discriminator, self).__init__()

            self.layers = nn.Sequential(
                Reshape(),
                nn.Conv1d(in_channels=1,
                          out_channels=8,
                          kernel_size=5,
                          stride=1,
                          dilation=1), nn.BatchNorm1d(8), nn.LeakyReLU(),
                nn.Conv1d(in_channels=8,
                          out_channels=8,
                          kernel_size=5,
                          stride=1,
                          dilation=1), nn.BatchNorm1d(8), nn.LeakyReLU(),
                nn.MaxPool1d(3, stride=2), Flatten(), nn.Linear(19 * 8, 1))

        def forward(self, input):
            return self.layers(input)

    def clamp_weight(model):

        for module in model.modules():
            if isinstance(module, nn.Conv1d) or isinstance(
                    module, nn.Linear) or isinstance(module, nn.BatchNorm1d):
                module.weight.data = torch.clamp(module.weight.data, -1e-2,
                                                 1e-2)
                if module.bias is not None:
                    module.bias.data = torch.clamp(module.bias.data, -1e-2,
                                                   1e-2)

    discriminator = Discriminator()
    disc_optimizer = optim.Adam(discriminator.parameters(),
                                lr=args.lr,
                                weight_decay=1e-6)

    def train_discriminator(data, cond_data):
        for param in discriminator.parameters():
            param.requires_grad = True
        u = torch.Tensor(data.shape[0], data.shape[1]).normal_()
        synth_data, _ = model.forward(u, cond_data, mode='inverse')
        for i in range(5):
            # using detach to freeze the flow-based generative model
            disc_optimizer.zero_grad()
            _data = torch.cat((data, cond_data), 1)
            _synth_data = torch.cat((synth_data, cond_data), 1)
            loss = -discriminator(_data).mean() + discriminator(
                _synth_data.detach()).mean()
            loss.backward()
            disc_optimizer.step()
            clamp_weight(discriminator)

    def calculate_W_distance(data, cond_data, discriminator):

        u = torch.Tensor(data.shape[0], data.shape[1]).normal_()
        synth_data, _ = model.forward(u, cond_data, mode='inverse')

        # Freeze the discriminator for evaluating Wasserstein distance
        best_discriminator = discriminator
        for param in best_discriminator.parameters():
            param.requires_grad = False
        _data = torch.cat((data, cond_data), 1)
        _synth_data = torch.cat((synth_data, cond_data), 1)
        W_distance = best_discriminator(_data).mean() - best_discriminator(
            _synth_data).mean()

        return W_distance

    # Start training generative flow
    train_loss = []

    def train(epoch):
        model.train()

        for batch_idx, data in enumerate(train_loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].float()
                    cond_data = cond_data.to(device)
                else:
                    cond_data = None

                data = data[0]
            data = data.to(device)
            optimizer.zero_grad()
            loss = -model.log_probs(data, cond_data).mean()
            train_loss.append(loss.item())

            # Adding Wasserstein distance as regularizer
            train_discriminator(data, cond_data)
            W_distance = calculate_W_distance(data, cond_data, discriminator)
            # Total loss
            loss += args.weight * W_distance

            loss.backward()
            optimizer.step()

    def validate(epoch, model, loader, prefix='Validation'):
        model.eval()
        val_loss = 0

        for batch_idx, data in enumerate(loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].float()
                    cond_data = cond_data.to(device)
                else:
                    cond_data = None

                data = data[0]
            data = data.to(device)
            with torch.no_grad():
                val_loss += -model.log_probs(data, cond_data).sum().item()

        return val_loss / len(loader.dataset)

    best_validation_loss = float('inf')
    best_validation_epoch = 0
    best_model = model

    valid_loss = []

    for epoch in range(args.epochs):
        print('\nEpoch: {}'.format(epoch))

        train(epoch)
        validation_loss = validate(epoch, model, valid_loader)

        valid_loss.append(validation_loss)

        if epoch - best_validation_epoch >= 30 and epoch > 100:
            #if epoch - best_validation_epoch >= 30:
            break

        if validation_loss < best_validation_loss:
            best_validation_epoch = epoch
            best_validation_loss = validation_loss
            best_model = copy.deepcopy(model)

        print(
            'Best validation at epoch {}: Average Log Likelihood in nats: {:.4f}'
            .format(best_validation_epoch, -best_validation_loss))

    plt.figure(figsize=(10, 10))
    plt.plot(range(len(valid_loss)), valid_loss)
    plt.title('validation loss over epochs')
    plt.savefig(save_folder + 'valid_loss.png')

    # Save trained model
    torch.save(best_model, save_folder + 'best_model.pt')

    def calculate_dist(true, generated):
        distance_of_one_sample = []
        for t in range(generated.shape[1]):

            y = true[t]
            y_hat = generated[:, t]

            dist = []
            for p in range(50, 101, 1):
                if p == 50:
                    median = stats.scoreatpercentile(y_hat, p)
                    dist.append(np.abs(y - median))
                else:
                    pl = 100 - p
                    pu = p
                    l = stats.scoreatpercentile(y_hat, pl)
                    u = stats.scoreatpercentile(y_hat, pu)

                    if y <= u and y >= l:
                        dist.append(0.0)
                    elif y < l:
                        dist.append(np.abs(y - l))
                    else:
                        dist.append(np.abs(y - u))

            dist = np.array(dist)
            if t == 0:
                distance_of_one_sample = dist
            else:
                distance_of_one_sample += dist

        return distance_of_one_sample / 24

    def test(model, test_loader):
        model.eval()
        median_pred = []
        ground_truth = []
        point_pred = []
        pi_1 = []
        pi_99 = []
        pi_5 = []
        pi_95 = []
        pi_15 = []
        pi_85 = []
        pi_25 = []
        pi_75 = []
        distance = {}
        for index, data in enumerate(test_loader):
            if index == 2: break
            inputs = data[0]
            cond_inputs = data[1]

            with torch.no_grad():
                cond_inputs_ = cond_inputs.view(
                    -1, num_cond_inputs) * torch.ones([5000, num_cond_inputs])
                yt_hat = model.sample(
                    5000, cond_inputs=cond_inputs_).detach().cpu().numpy()

                #test_data = test_set.X[index,:].flatten()
                input_data = inputs.detach().numpy().flatten()
                cond_data = cond_inputs.detach().numpy().flatten()

                input_data = input_data * std + mu
                cond_data = cond_data * std + mu
                synth = yt_hat * std + mu
            median = stats.scoreatpercentile(synth, 50, axis=0)
            percentile1 = stats.scoreatpercentile(synth, 1, axis=0)
            percentile99 = stats.scoreatpercentile(synth, 99, axis=0)
            percentile5 = stats.scoreatpercentile(synth, 5, axis=0)
            percentile95 = stats.scoreatpercentile(synth, 95, axis=0)
            percentile15 = stats.scoreatpercentile(synth, 15, axis=0)
            percentile85 = stats.scoreatpercentile(synth, 85, axis=0)
            percentile25 = stats.scoreatpercentile(synth, 25, axis=0)
            percentile75 = stats.scoreatpercentile(synth, 75, axis=0)
            if index == 0:
                median_pred = median
                ground_truth = input_data
                pi_1 = percentile1
                pi_99 = percentile99
                pi_5 = percentile5
                pi_95 = percentile95
                pi_15 = percentile15
                pi_85 = percentile85
                pi_25 = percentile25
                pi_75 = percentile75
            else:
                median_pred = np.concatenate((median_pred, median))
                ground_truth = np.concatenate((ground_truth, input_data))
                pi_1 = np.concatenate((pi_1, percentile1))
                pi_99 = np.concatenate((pi_99, percentile99))
                pi_5 = np.concatenate((pi_5, percentile5))
                pi_95 = np.concatenate((pi_95, percentile95))
                pi_15 = np.concatenate((pi_15, percentile15))
                pi_85 = np.concatenate((pi_85, percentile85))
                pi_25 = np.concatenate((pi_25, percentile25))
                pi_75 = np.concatenate((pi_75, percentile75))

            # distance of test data {index} averaged over 24 hours
            distance[index] = calculate_dist(input_data, synth)

        GLOW_pred_dict = {}
        GLOW_pred_dict['median_pred'] = median_pred
        GLOW_pred_dict['ground_truth'] = ground_truth
        GLOW_pred_dict['pi1'] = pi_1
        GLOW_pred_dict['pi99'] = pi_99
        GLOW_pred_dict['pi5'] = pi_5
        GLOW_pred_dict['pi95'] = pi_95
        GLOW_pred_dict['pi15'] = pi_15
        GLOW_pred_dict['pi85'] = pi_85
        GLOW_pred_dict['pi25'] = pi_25
        GLOW_pred_dict['pi75'] = pi_75
        # Save GLOW_pred_dict as .csv file
        GLOW_pred = pd.DataFrame.from_dict(GLOW_pred_dict)
        GLOW_pred.to_csv(save_folder + 'GLOW_pred.csv')

        GLOW_distance = pd.DataFrame.from_dict(distance)
        GLOW_distance.to_csv(save_folder + 'GLOW_distance.csv')
        #series = series.mean(axis = 1)

        #GLOW_distance = series.values
        # Save GLOW_distance as an array
        #np.save(save_folder+'GLOW_distance.npy', GLOW_distance)

        return None

    test(model, test_loader)
Example No. 22
def test_iris():
    features, labels = load_dataset('iris')
    assert len(features[0]) == 4
    assert len(features)
    assert len(features) == len(labels)
Example No. 23
def test_seeds():
    features, labels = load_dataset('seeds')
    assert len(features[0]) == 7
    assert len(features)
    assert len(features) == len(labels)
Example No. 24
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

features, labels = load_dataset("seeds")


def cross_validate(features, labels):
    error = 0.0
    for fold in range(10):
        training = np.ones(len(features), bool)
        training[fold::10] = 0
        testing = ~training
        model = learn_model(1, features[training], labels[training])
        test_error = accuracy(features[testing], labels[testing], model)
        error += test_error

    return error / 10.0


error = cross_validate(features, labels)
print("Ten fold cross-validated error was {0:.1%}.".format(error))

features -= features.mean(0)
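
The script stops mid-way through a feature-normalization step; a hedged continuation that finishes the z-scoring and repeats the cross-validation with the same helpers imported from knn:

features /= features.std(0)
error = cross_validate(features, labels)
print("Ten fold cross-validated error after z-scoring was {0:.1%}.".format(error))
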
Example No. 25
def test_experiment_projects_with_embeddings():
    libraries, keywords, lib_key_graph, test_domains_libraries, test_domains_keywords, train_domains_libraries, \
    train_domains_keywords = load_dataset(number_of_methods=100000, num_of_keywords_after_dot=0)

    libraries, keywords, idf_dict = load_model_data()
    have_embeddings = "True"
    random_predict = "False"
    similarity_methods = ['cosine']
    proximities = ['both']
    idf_uses = ['True']

    embeddings_first = nx.read_gpickle(
        r'line_algo\data\embedding_first-order.gpickle')
    embeddings_second = nx.read_gpickle(
        r'line_algo\data\embedding_second-order.gpickle')

    results = open('results.csv', mode='w')
    results = csv.writer(results,
                         delimiter=',',
                         quotechar='"',
                         quoting=csv.QUOTE_MINIMAL)
    results.writerow([
        "1st Prox.", "2st Prox.", 'idf', "Similarity", "HitRate@10", "AUC",
        "NDCG", "Coverage"
    ])
    for proximity in proximities:
        for idf_use in idf_uses:
            for similarity_method in similarity_methods:

                embeddings = {}
                for node in lib_key_graph.nodes():
                    if proximity == 'first':
                        embeddings[node] = embeddings_first[node]
                    elif proximity == 'second':
                        embeddings[node] = embeddings_second[node]
                    else:
                        embeddings[node] = np.concatenate(
                            (embeddings_first[node],
                             0.3 * embeddings_second[node]),
                            axis=None)

                results.writerow([proximity, similarity_method, idf_use])
                coverage = []
                hit_rate = []
                auc = []
                ndcg = []

                if similarity_method == 'function':
                    # Create training set for the model of the similarity prediction
                    training_features, training_values = relation_model.create_training_set(
                        lib_key_graph, embeddings, libraries, keywords)
                    # Train the relation model
                    scaler, model = relation_model.train_relation_model(
                        training_features, training_values)

                libraries_predicted_list = []
                # Store the results in a file
                for domain in test_domains_libraries.keys():
                    # Check if libraries was identified in this domain
                    if len(test_domains_libraries[domain]) >= 0 and len(
                            test_domains_libraries[domain]) > 5:

                        path_keywords = test_domains_keywords[domain]
                        path_libraries = test_domains_libraries[domain]
                        print('Predict path: ', domain)
                        print("Number of libraries in this file: ",
                              len(test_domains_libraries[domain]))

                        # Calculate similarity and save it in a dictionary
                        if similarity_method == "function":
                            sim = caclulate_function_similarity(libraries,
                                                                embeddings,
                                                                lib_key_graph,
                                                                path_keywords,
                                                                scaler,
                                                                model,
                                                                idf_dict,
                                                                idf=idf_use)
                        else:
                            sim = calculate_similarity(
                                libraries,
                                embeddings,
                                lib_key_graph,
                                path_keywords,
                                idf_dict,
                                method=similarity_method,
                                idf=idf_use)
                        # print(sim)
                        # Get the largest 5 values
                        predicted_libraries = nlargest(10, sim, key=sim.get)
                        print("Libraries predicted: ", predicted_libraries)
                        print("Path libraries:", path_libraries, "\n")

                        libraries_predicted_list = libraries_predicted_list + predicted_libraries
                        for library in predicted_libraries:
                            if library in path_libraries:
                                print(library)
                        # Hit rate for Top-5 libraries
                        hit_rate_temp = calculate_hit_rate(
                            path_libraries, predicted_libraries)
                        hit_rate.append(hit_rate_temp)
                        print("Hit Rate @", len(predicted_libraries), ": ",
                              hit_rate_temp)
                        # Calculate AUC
                        labels = [
                            1 if library in path_libraries else 0
                            for library in sim.keys()
                        ]
                        conf = list(sim.values())
                        if 1 in labels and 0 in labels:
                            auc_temp = roc_auc_score(np.array(labels),
                                                     np.array(conf))
                            auc.append(auc_temp)
                            print("ROC AUC: ", auc_temp, "\n")
                        # Calculate Normalized Cumulative Score
                        # Relevance score=1 if a library that was predicted is in path's libraries
                        ndcg_temp = ndcg_score([np.array(labels)],
                                               [np.array(conf)])
                        ndcg.append(ndcg_temp)
                        print("Discounted Cumulative Gain: ",
                              ndcg_score([np.array(labels)], [np.array(conf)]),
                              '\n')

                libraries_predicted_list = list(set(libraries_predicted_list))
                results.writerow([
                    sum(hit_rate) / len(hit_rate),
                    sum(auc) / len(auc),
                    sum(ndcg) / len(ndcg),
                    len(libraries_predicted_list) / len(libraries) * 100
                ])
                results.writerow([np.std(hit_rate), np.std(auc), np.std(ndcg)])
                coverage.append(len(libraries_predicted_list) / len(libraries))
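A minimal sketch of the hit-rate helper used above (illustrative only; the repository's calculate_hit_rate may differ). Here it is taken to be the fraction of a path's true libraries that appear in the predicted list.

def calculate_hit_rate(path_libraries, predicted_libraries):
    # Fraction of the path's true libraries recovered by the prediction (assumed definition).
    hits = sum(1 for lib in predicted_libraries if lib in path_libraries)
    return hits / len(path_libraries) if path_libraries else 0.0

# 2 of the 3 true libraries are recovered -> hit rate of about 0.67
print(calculate_hit_rate({'numpy', 'pandas', 'flask'}, ['numpy', 'pandas', 'torch']))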
Exemplo n.º 26
0
# encoding:utf-8
from load import load_dataset, get_samples
import json

raw_train_data = "../data/input/train-data-test"
raw_dev_data = "../data/input/dev-data-test"
raw_test_data = "../data/input/test-data-test"
data_size = 100000
n_prev_sents = 5  # context length (number of previous sentences)
max_n_words = 20  # maximum sentence length in words

if __name__ == "__main__":
    # load dataset
    train_data = load_dataset(raw_train_data, data_size)
    dev_data = load_dataset(raw_dev_data, data_size)
    test_data = load_dataset(raw_test_data, data_size)

    # create_samples
    train_samples = get_samples(threads=train_data,
                                n_prev_sents=n_prev_sents,
                                max_n_words=max_n_words,
                                pad=False)
    with open('train_cand_2.txt', 'a') as fw:
        for sample in train_samples:
            sample_dict = {}
            sample_dict['context'] = sample.context
            sample_dict['response'] = sample.response
            sample_dict['spk_agents'] = sample.spk_agents
            sample_dict['true_adr'] = sample.true_adr
            sample_dict['true_res'] = sample.true_res
            # sample_dict['agent_index_dict'] = sample.agent_index_dict
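The snippet above is cut off before the samples are written out; a hedged, illustrative completion (not the original code) that stores each sample as one JSON object per line could look like this:

import json

def write_samples(samples, path):
    # Illustrative only: serialize each sample as a JSON line.
    with open(path, 'a') as fw:
        for sample in samples:
            record = {
                'context': sample.context,
                'response': sample.response,
                'spk_agents': sample.spk_agents,
                'true_adr': sample.true_adr,
                'true_res': sample.true_res,
            }
            fw.write(json.dumps(record) + '\n')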
Exemplo n.º 27
0
def train_AR(data_folder, data_fname, GLOW_data_folder, GLOW_data_fname, model_save_folder):
    # Training settings
    print('----------------------------------')
    print('Pre-train Autoregressive model ...')
    parser = argparse.ArgumentParser(description='PyTorch GLOW')
    parser.add_argument(
        '--epochs',
        type=int,
        default=2000,
        help='number of epochs to train (default: 2000)')

    parser.add_argument(
        '--lr', type=float, default=1e-5, help='learning rate (default: 1e-5)')

    parser.add_argument(
        '--num-inputs',
        type=int,
        default=24,
        help='look-ahead horizon of forecasting')

    parser.add_argument(
        '--num-cond-inputs',
        type=int,
        default=24,
        help='length of historical data')

    parser.add_argument(
        '--order',
        type=int,
        default=24,
        help='order of Autoregressive model')

    parser.add_argument(
        '--delta',
        type=float,
        default=1e-4,
        help='stopping criterion (minimum validation-loss improvement)')

    args = parser.parse_args()

    #try:
        #os.makedirs(model_save_folder)
    #except OSError:
        #pass

    # Define ARModel class
    class ARModel(nn.Module):

        def __init__(self, input_dim, output_dim):

            super(ARModel, self).__init__() 
            self.linear = nn.Linear(input_dim, output_dim)

        def forward(self, x):
            out = self.linear(x)
            return out

    input_dim = args.num_cond_inputs
    output_dim = 1
    model = ARModel(input_dim,output_dim)

    # Load dataset
    training_subset, valid_set, test_set = load.load_dataset(data_folder, data_fname)
    print('Training subset size:', training_subset.N)
    print('Validation set size:', valid_set.N)
    print('Test set size:', test_set.N)

    # Transform to torch.Tensor
    train_tensor = torch.from_numpy(training_subset.X)
    train_labels = torch.from_numpy(training_subset.y)
    train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels)

    valid_tensor = torch.from_numpy(valid_set.X)
    valid_labels = torch.from_numpy(valid_set.y)
    valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels)

    test_tensor = torch.from_numpy(test_set.X)
    test_labels = torch.from_numpy(test_set.y)
    test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels)
        
    train_loader = torch.utils.data.DataLoader(
                                            train_dataset, 
                                            batch_size = 1, 
                                            shuffle = False)

    valid_loader = torch.utils.data.DataLoader(
                                            valid_dataset,
                                            batch_size = 1,
                                            shuffle = False,
                                            drop_last = False)

    test_loader = torch.utils.data.DataLoader(
                                            test_dataset,
                                            batch_size = 1,
                                            shuffle = False,
                                            drop_last = False)
    # Define loss function and optimizer
    criterion = nn.MSELoss()  # mean squared error loss
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)  # stochastic gradient descent

    # Train AR
    train_loss = []
    def train(epoch):
        model.train()
        for batch_idx, data in enumerate(train_loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].squeeze()
                else:
                    cond_data = None

                input_data = data[0].squeeze()
            optimizer.zero_grad()

            num_inputs = input_data.shape[0]
            history = cond_data
            pred = []
            for t in range(num_inputs):
                yt_hat = model.forward(history)
                history = torch.cat((history, yt_hat))
                history = history[-24:]
                if t == 0:
                    pred = yt_hat

                else:
                    pred = torch.cat((pred, yt_hat))

            loss = criterion(pred, input_data)
            train_loss.append(loss.detach().item()) 

            loss.backward()
            optimizer.step()


    def validate(epoch, model, valid_loader):
        model.eval()
        val_loss = 0

        for batch_idx, data in enumerate(valid_loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].squeeze()
                else:
                    cond_data = None

                input_data = data[0].squeeze()

            with torch.no_grad():
                history = cond_data
                pred = []
                for t in range(input_data.shape[0]):
                    yt_hat = model.forward(history)
                    history = torch.cat((history, yt_hat))
                    history = history[-24:]
                    if t == 0:
                        pred = yt_hat

                    else:
                        pred = torch.cat((pred, yt_hat))
                val_loss +=  criterion(pred, input_data).detach().item()

        return val_loss / valid_set.N


    best_validation_loss = float('inf')
    best_validation_epoch = 0
    best_model = model

    valid_loss = []
    for epoch in range(args.epochs):
        print('\nEpoch: {}'.format(epoch))

        train(epoch)

        validation_loss = validate(epoch, model, valid_loader)
        valid_loss.append(validation_loss)

        if epoch - best_validation_epoch >= 10:
            break

        if validation_loss < best_validation_loss - args.delta:
            best_validation_epoch = epoch
            best_validation_loss = validation_loss
            best_model = copy.deepcopy(model)

        print(
            'Best validation at epoch {}: Average mse: {:.4f}'.
            format(best_validation_epoch, best_validation_loss))

    plt.figure(figsize=(10,10))
    plt.plot(range(len(valid_loss)), valid_loss)
    plt.title('validation loss over epochs')
    plt.savefig(model_save_folder+'pretrain_AR_valid_loss.png')


    # Test pre-trained AR
    def test(model, test_loader):
        model.eval()
        predictions = []
        test_data = []
        for index, data in enumerate(test_loader):
            #if index == 2: break
            input_data = data[0].squeeze()
            cond_data = data[1].squeeze()

            with torch.no_grad():
                history = cond_data
                pred = []
                for t in range(input_data.shape[0]):
                    yt_hat = model.forward(history)
                    history = torch.cat((history, yt_hat))
                    history = history[-24:]

                    if t == 0:
                        pred = yt_hat
                    else:
                        pred = torch.cat((pred, yt_hat))

                if index == 0:
                    predictions = pred
                    test_data = input_data
                else:
                    predictions = torch.cat((predictions, pred))
                    test_data = torch.cat((test_data, input_data))

        return predictions, test_data


    predictions, test_data = test(best_model, test_loader)
    # Calculate MSE, plot predictions versus test_data
    print('Pretrain MSE on test data:', criterion(predictions, test_data).detach().item())

    # Save trained model
    #torch.save(best_model, model_save_folder+'best_model.pt')
    torch.save(best_model.state_dict(), model_save_folder+'pretrained_ARmodel.pt')

    # generate point estimates
    run_ar.generate_point_est(best_model, GLOW_data_folder, GLOW_data_fname)
    
    return best_model 
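The heart of train_AR is the recursive rollout: each one-step prediction is appended to a sliding 24-value history and fed back into the model. A self-contained sketch of that pattern (the linear model below is only a stand-in, not the trained AR model):

import torch
import torch.nn as nn

def rollout(model, history, horizon, order=24):
    # Autoregressively predict `horizon` steps, feeding each prediction
    # back into a sliding window of the last `order` values.
    preds = []
    for _ in range(horizon):
        y_hat = model(history)                        # shape: (1,)
        history = torch.cat((history, y_hat))[-order:]
        preds.append(y_hat)
    return torch.cat(preds)

ar = nn.Linear(24, 1)                                 # untrained stand-in AR model
print(rollout(ar, torch.zeros(24), horizon=6).shape)  # torch.Size([6])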
Exemplo n.º 28
0
def evaluate(models,classes):
	models, fname = zip(*models);
	fname=fname[0]

	print 'Loading Test dataset...'
	dev_samples,gold = load.load_dataset(fname=load.filename['TEST'],numdocs=1000);
	[tp,fp,fn,tn] = [0.0,0.0,0.0,0.0]
	keyword_stats=[]
	confusion=[]
	for each in models:
		confusion.append({e:[[],[]] for e in classes})
		keyword_stats.append({e:[0.0,0.0,len(dev_samples)*1.0,0.0] for e in classes});
		print 'Evaluation Cache for %s is not present' %fname
		pred = each.classify(dev_samples); #a sorted vector of strings
		assert(len(pred) ==len(dev_samples));		
		for no,each in enumerate(pred):
			print '\rVerifying output for example %d' %no,
			assert(type(each) == list);
			p=set(each)&classes; 
			q=set(gold[no])&classes;
			r = p&q;
			tp += len(p&q);
			tn += len(classes)-len(p|q);
			fp += len(p)-len(p&q)
			fn += len(q)-len(p&q)		


			for every in r: keyword_stats[-1][every][0]+=1; #tp
			for every in p-r: keyword_stats[-1][every][1]+=1; #fp
			for every in p|q: keyword_stats[-1][every][2]-=1; #tn
			for every in q-r: keyword_stats[-1][every][3]+=1; #fn

			#for every in r: confusion[-1][every][0].append(exampleno); #tp
			for every in p-r: confusion[-1][every][0].append(no); #fp
			#for every in p|q: keyword_stats[-1][every][2].append(exampleno); #tn
			for every in q-r: confusion[-1][every][1].append(no); #fn

		#write into file	
		#print keyword_stats[-1]
		#print [tp,tn,fp,fn]

		with open(fname, 'wb') as csvfile:
			writer = csv.DictWriter(csvfile, fieldnames=list(classes))	     
			writer.writeheader()
			for each in keyword_stats:
				writer.writerow(each)
		prec,rec = tp/(tp+fp+0.01),tp/(tp+fn+0.01)
		#print prec,rec
		print '\n'
		print '[tp,fp,tn,fn]',keyword_stats[-1]
		print '------tp------tn------fp------fn------pr-------re------f1------'
		print '----------------------------Model %s--------------------------' %fname
		print '------%d------%d------%d------%d------%.2f------%.2f------%.2f------' %(tp,tn,fp,fn,prec,rec,2*prec*rec/(prec+rec+0.01))
		x= '%s \n' %confusion[-1]
		for each in confusion[-1]:
			x+= 'confusion in %s \n' %each
			for no in confusion[-1][each]:
				for examp in no[:3]:
					x+= '%s \n' %examp
					x+= '%s %s\n' %(dev_samples[examp],gold[examp])
			x+= '---------------------------------\n'
		writeintofile(x,"confusion")
Exemplo n.º 29
0
import sys
import load
from normalize import normalize_features
from print_matrices import print_matrices
from numeric_verification import verify
import train

if len(sys.argv) >= 4:
    network_filename = sys.argv[1]
    weights_filename = sys.argv[2]
    dataset_name = sys.argv[3]
else:
    print("\nUsage:\t python backpropagation.py network weights dataset\n")
    sys.exit()

dataset = load.load_dataset(dataset_name)
network = load.load_network_structure(network_filename)
initial_weights = load.load_weights(weights_filename)

normalize_features(dataset)

# Compute gradients using every instance in the dataset.
gradients = train.calculate_gradients(dataset, 0, len(dataset),
                                      initial_weights,
                                      network['regularization'])

print_matrices(gradients)
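Given the numeric_verification.verify import, the script presumably compares these gradients against numerical estimates. A generic central-difference check (the loss_fn and weights below are hypothetical, not the repository's API):

import numpy as np

def numerical_gradient(loss_fn, weights, eps=1e-6):
    # Central-difference estimate of d(loss)/d(w) for every weight.
    grad = np.zeros_like(weights)
    for i in range(weights.size):
        w_plus, w_minus = weights.copy(), weights.copy()
        w_plus.flat[i] += eps
        w_minus.flat[i] -= eps
        grad.flat[i] = (loss_fn(w_plus) - loss_fn(w_minus)) / (2 * eps)
    return grad

# Example: loss(w) = sum(w**2) has gradient 2*w.
w = np.array([1.0, -2.0, 0.5])
print(numerical_gradient(lambda v: np.sum(v ** 2), w))  # approximately [ 2. -4.  1.]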
Exemplo n.º 30
0
def test_iris():
    features, labels = load_dataset('iris')
    assert len(features[0]) == 4
    assert len(features)
    assert len(features) == len(labels)
Exemplo n.º 31
0
import collections
import json

import scipy.stats as sst
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from tensorflow import keras
import sklearn.metrics as skm

import load
import numpy_encoder

default_lr = 0.001
address_request_round = "http://127.0.0.1:8000/round"
address_global_weight = "http://127.0.0.1:8000/weight"
global_round = 0
current_round = 0
max_round = 100
delay_time = 15

train = load.load_dataset("data/train_2.json")
val = load.load_dataset("data/validation_2.json")
preproc = load.preproc(*train)

train_x, train_y = preproc.process(*train)
val_x, val_y = preproc.process(*val)

print("train size : {}, {}".format(len(train_x), len(train_y)))
print("val size : {}, {}".format(len(val_x), len(val_y)))

with open("data/validation_2.json", "rb") as fid:
    val_labels = [json.loads(l)['labels'] for l in fid]

counts = collections.Counter(preproc.class_to_int[l[0]] for l in val_labels)
counts = sorted(counts.most_common(), key=lambda x: x[0])
counts = list(zip(*counts))[1]
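These per-class counts are typically turned into a smoothed prior that later examples divide the model probabilities by. A sketch of that idea (the add-constant smoothing below is illustrative, not necessarily the project's exact formula):

import collections
import numpy as np

def smoothed_prior(labels, num_classes, smooth=500):
    # Count label occurrences and smooth them into a class prior.
    counts = collections.Counter(labels)
    counts = np.array([counts.get(c, 0) for c in range(num_classes)], dtype=float)
    return (counts + smooth) / (counts.sum() + smooth * num_classes)

print(smoothed_prior([0, 0, 1, 2, 2, 2], num_classes=3, smooth=1))
# -> [0.33333333 0.22222222 0.44444444]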
Exemplo n.º 32
0
import simfin as sf
from load import load_dataset, load_shareprices
import pathlib
import os
from dotenv import load_dotenv
from predict import train, predict, predict_similiar

load_dotenv()
SIMFIN_API_KEY = os.getenv('SIMFIN_API_KEY', 'free')
MODELS_DIR = pathlib.Path('./models')
DATA_DIR = pathlib.Path('./data')

# LOAD
shareprices_df = load_shareprices(simfin_api_key=SIMFIN_API_KEY)
general_df = load_dataset(dataset='general',
                          simfin_api_key=SIMFIN_API_KEY,
                          shareprices_df=shareprices_df)
banks_df = load_dataset(dataset='banks',
                        simfin_api_key=SIMFIN_API_KEY,
                        shareprices_df=shareprices_df)
insurance_df = load_dataset(dataset='insurance',
                            simfin_api_key=SIMFIN_API_KEY,
                            shareprices_df=shareprices_df)

# TRAIN
general_model = train(general_df,
                      winsor_quantile=0.01,
                      model_name='general_model',
                      feature_name='general',
                      param=dict(learning_rate=0.01,
                                 max_depth=3,
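The winsor_quantile=0.01 argument suggests features are clipped at symmetric quantiles before fitting; a small sketch of that assumed behaviour on a pandas Series (not the project's exact implementation):

import pandas as pd

def winsorize(series, quantile=0.01):
    # Clip values below the lower quantile and above the upper quantile.
    lower, upper = series.quantile(quantile), series.quantile(1 - quantile)
    return series.clip(lower=lower, upper=upper)

s = pd.Series([1, 2, 3, 4, 1000])
print(winsorize(s, quantile=0.2).tolist())  # the extreme value 1000 is pulled in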
Exemplo n.º 33
0
import numpy as np
from matplotlib.colors import ListedColormap

from main.ch02.utils import CHART_DIR
import load

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]

data = load.load_dataset("seeds")
# np.set_printoptions(threshold=np.nan)

def drawFigure(features, labels, neighbors=1, parameters=[], figName="no_name"):
    names = sorted(set(labels))
    labels = np.array([names.index(ell) for ell in labels])

    idX, idY = parameters[0], parameters[1]

    print("Xaxis :{0} - Yaxis : {1}").format(idX, idY)

    # define lower and upper limit on both axis (x=area, y =compactness)
    x0, y0 = features[:, idX].min() * 0.9, features[:, idY].min() * 0.9
    x1, y1 = features[:, idX].max() * 1.1, features[:, idY].max() * 1.1

    # create a meshgrid from two X/Y linspaces
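The comments above describe the usual decision-boundary recipe: build a meshgrid from two linspaces over the feature ranges, classify every grid point, and colour the plane. A compact, data-independent sketch:

import numpy as np

# Grid over the rectangle [x0, x1] x [y0, y1]
x0, x1, y0, y1 = 0.0, 1.0, 0.0, 1.0
X, Y = np.meshgrid(np.linspace(x0, x1, 100), np.linspace(y0, y1, 100))
grid_points = np.vstack([X.ravel(), Y.ravel()]).T      # shape (10000, 2)

# Stand-in "classifier": label 1 above the diagonal, 0 below.
C = (grid_points[:, 1] > grid_points[:, 0]).astype(int).reshape(X.shape)
print(grid_points.shape, C.shape)                      # (10000, 2) (100, 100)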
Exemplo n.º 34
0
def train_AR(data_folder, data_fname, save_folder):
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch GLOW')

    parser.add_argument('--epochs',
                        type=int,
                        default=2000,
                        help='number of epochs to train (default: 2000)')

    parser.add_argument('--lr',
                        type=float,
                        default=1e-5,
                        help='learning rate (default: 1e-5)')

    parser.add_argument('--num-inputs',
                        type=int,
                        default=24,
                        help='look-ahead horizon of forecasting')

    parser.add_argument('--num-cond-inputs',
                        type=int,
                        default=24,
                        help='length of historical data')

    parser.add_argument('--order',
                        type=int,
                        default=24,
                        help='order of Autoregressive model')

    parser.add_argument('--delta',
                        type=float,
                        default=1e-4,
                        help='stopping criterion (minimum validation-loss improvement)')

    args = parser.parse_args()

    try:
        os.makedirs(save_folder)
    except OSError:
        pass

    # Define ARModel class
    class ARModel(nn.Module):
        def __init__(self, input_dim, output_dim):

            super(ARModel, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)

        def forward(self, x):
            out = self.linear(x)
            return out

    input_dim = args.num_cond_inputs
    output_dim = 1
    model = ARModel(input_dim, output_dim)

    # Load dataset
    training_subset, valid_set, test_set = load.load_dataset(
        data_folder, data_fname)
    print('Training subset size:', training_subset.N)
    print('Validation set size:', valid_set.N)
    print('Test set size:', test_set.N)

    # Transform to torch.Tensor
    train_tensor = torch.from_numpy(training_subset.X)
    train_labels = torch.from_numpy(training_subset.y)
    train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels)

    valid_tensor = torch.from_numpy(valid_set.X)
    valid_labels = torch.from_numpy(valid_set.y)
    valid_dataset = torch.utils.data.TensorDataset(valid_tensor, valid_labels)

    test_tensor = torch.from_numpy(test_set.X)
    test_labels = torch.from_numpy(test_set.y)
    test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               shuffle=False)

    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=1,
                                               shuffle=False,
                                               drop_last=False)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=1,
                                              shuffle=False,
                                              drop_last=False)
    # Define loss function and optimizer
    criterion = nn.MSELoss()  # mean squared error loss
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr)  # stochastic gradient descent

    # Train AR
    train_loss = []

    def train(epoch):
        model.train()
        for batch_idx, data in enumerate(train_loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].squeeze()
                else:
                    cond_data = None

                input_data = data[0].squeeze()
            optimizer.zero_grad()

            num_inputs = input_data.shape[0]
            history = cond_data
            pred = []
            for t in range(num_inputs):
                yt_hat = model.forward(history)
                history = torch.cat((history, yt_hat))
                history = history[-24:]
                if t == 0:
                    pred = yt_hat

                else:
                    pred = torch.cat((pred, yt_hat))

            loss = criterion(pred, input_data)
            train_loss.append(loss.detach().item())

            loss.backward()
            optimizer.step()

    def validate(epoch, model, valid_loader):
        model.eval()
        val_loss = 0

        for batch_idx, data in enumerate(valid_loader):
            if isinstance(data, list):
                if len(data) > 1:
                    cond_data = data[1].squeeze()
                else:
                    cond_data = None

                input_data = data[0].squeeze()

            with torch.no_grad():
                history = cond_data
                pred = []
                for t in range(input_data.shape[0]):
                    yt_hat = model.forward(history)
                    history = torch.cat((history, yt_hat))
                    history = history[-24:]
                    if t == 0:
                        pred = yt_hat

                    else:
                        pred = torch.cat((pred, yt_hat))
                val_loss += criterion(pred, input_data).detach().item()

        return val_loss / valid_set.N

    best_validation_loss = float('inf')
    best_validation_epoch = 0
    best_model = model

    valid_loss = []
    for epoch in range(args.epochs):
        print('\nEpoch: {}'.format(epoch))

        train(epoch)

        validation_loss = validate(epoch, model, valid_loader)
        valid_loss.append(validation_loss)

        if epoch - best_validation_epoch >= 10:
            break

        if validation_loss < best_validation_loss - args.delta:
            best_validation_epoch = epoch
            best_validation_loss = validation_loss
            best_model = copy.deepcopy(model)

        print('Best validation at epoch {}: Average mse: {:.4f}'.format(
            best_validation_epoch, best_validation_loss))

    plt.figure(figsize=(10, 10))
    plt.plot(range(len(valid_loss)), valid_loss)
    plt.title('validation loss over epochs')
    plt.savefig(save_folder + 'valid_loss.png')

    # Save trained model
    torch.save(best_model.state_dict(), save_folder + 'best_model.pt')

    #torch.save(best_model, save_folder+'best_model.pt')

    # Adding Gaussian noise to AR's point estimates to have scenarios
    def test(model, test_loader):
        model.eval()
        predictions = []
        test_data = []
        for index, data in enumerate(test_loader):
            #if index == 2: break
            input_data = data[0].squeeze()
            cond_data = data[1].squeeze()

            with torch.no_grad():
                history = cond_data
                pred = []
                for t in range(input_data.shape[0]):
                    yt_hat = model.forward(history)
                    history = torch.cat((history, yt_hat))
                    history = history[-24:]

                    if t == 0:
                        pred = yt_hat
                    else:
                        pred = torch.cat((pred, yt_hat))

                if index == 0:
                    predictions = pred
                    test_data = input_data
                else:
                    predictions = torch.cat((predictions, pred))
                    test_data = torch.cat((test_data, input_data))

        return predictions, test_data

    predictions, test_data = test(best_model, test_loader)
    res = predictions - test_data
    res_std = res.std()
    noise = np.random.normal(0, res_std.item(), (5000, predictions.shape[0]))
    predictions = predictions.numpy()
    test_data = test_data.numpy()
    scenarios = predictions + noise

    def calculate_dist(true, generated):
        distance = {}
        for t in range(generated.shape[1]):

            y = true[t]
            y_hat = generated[:, t]

            dist = []
            for p in range(50, 101, 1):
                if p == 50:
                    median = stats.scoreatpercentile(y_hat, p)
                    dist.append(np.abs(y - median))
                else:
                    pl = 100 - p
                    pu = p
                    l = stats.scoreatpercentile(y_hat, pl)
                    u = stats.scoreatpercentile(y_hat, pu)

                    if y <= u and y >= l:
                        dist.append(0.0)
                    elif y < l:
                        dist.append(np.abs(y - l))
                    else:
                        dist.append(np.abs(y - u))

            # distance for each hour t
            dist = np.array(dist)
            distance[t] = dist

        series = pd.DataFrame.from_dict(distance)
        series = series.mean(axis=1)

        return series.values

    distance = {}
    start = 0
    stride = 24
    for sample_index in range(test_set.N):
        if start + stride <= scenarios.shape[1]:
            generated = scenarios[:, start:start + stride]
            true = test_data[start:start + stride]
            start += stride
            # Accumulate over samples
            distance[sample_index] = calculate_dist(true, generated)

    #distance = distance/test_set.N
    #np.save(save_folder+'AR_distance.npy',distance)
    AR_distance = pd.DataFrame.from_dict(distance)
    AR_distance.to_csv(save_folder + 'AR_distance.csv')

    AR_pred_dict = {}
    AR_pred_dict['point_pred'] = predictions
    AR_pred_dict['ground_truth'] = test_data
    AR_pred_dict['median'] = stats.scoreatpercentile(scenarios, 50, axis=0)
    AR_pred_dict['pi1'] = stats.scoreatpercentile(scenarios, 1, axis=0)
    AR_pred_dict['pi99'] = stats.scoreatpercentile(scenarios, 99, axis=0)
    AR_pred_dict['pi5'] = stats.scoreatpercentile(scenarios, 5, axis=0)
    AR_pred_dict['pi95'] = stats.scoreatpercentile(scenarios, 95, axis=0)
    AR_pred_dict['pi15'] = stats.scoreatpercentile(scenarios, 15, axis=0)
    AR_pred_dict['pi85'] = stats.scoreatpercentile(scenarios, 85, axis=0)
    AR_pred_dict['pi25'] = stats.scoreatpercentile(scenarios, 25, axis=0)
    AR_pred_dict['pi75'] = stats.scoreatpercentile(scenarios, 75, axis=0)

    AR_pred = pd.DataFrame.from_dict(AR_pred_dict)
    AR_pred.to_csv(save_folder + 'AR_pred.csv')
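The prediction intervals written to AR_pred.csv are simply percentiles of the noise-perturbed scenarios taken across samples; the same bands can be computed with numpy.percentile (sketch with synthetic scenarios):

import numpy as np

scenarios = np.random.normal(loc=10.0, scale=2.0, size=(5000, 24))  # samples x hours

bands = {
    'median': np.percentile(scenarios, 50, axis=0),
    'pi5': np.percentile(scenarios, 5, axis=0),
    'pi95': np.percentile(scenarios, 95, axis=0),
}
print(bands['pi5'].shape, bands['pi95'].shape)  # (24,) (24,)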
Exemplo n.º 35
0
# Nearest neighbor classification:
# When classifying a new element, look through the training data for the object
# closest to it (its nearest neighbor) and return that object's label as the answer.

import numpy as np
from sklearn import model_selection

import load
from load import load_dataset
feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]

data = load_dataset('seeds')
features = data['features']
target = data['target']


from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
knn = KNeighborsClassifier(n_neighbors=1)

kf = model_selection.KFold(n_splits=5, shuffle=False)
means = []
for training, testing in kf.split(features):
    knn.fit(features[training], target[training])
    prediction = knn.predict(features[testing])

    curmean = np.mean(prediction  == target[testing])
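The loop above is cut off right after curmean; a self-contained sketch of the same 5-fold k-NN evaluation on synthetic data (illustrative only, not the book's exact code):

import numpy as np
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
features = rng.rand(100, 4)
target = rng.randint(0, 3, size=100)

knn = KNeighborsClassifier(n_neighbors=1)
kf = model_selection.KFold(n_splits=5, shuffle=False)
means = []
for training, testing in kf.split(features):
    knn.fit(features[training], target[training])
    prediction = knn.predict(features[testing])
    means.append(np.mean(prediction == target[testing]))
print('Mean accuracy: {:.3f}'.format(np.mean(means)))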
Exemplo n.º 36
0
        plt.xlabel('iterations (per fives)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        parameters = sess.run(parameters)
        print("Parameters have been trained!")

        correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        print("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
        print("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))

        return parameters


X_train_orig, Y_train_orig, X_test_orig, Y_test_orig, classes = load.load_dataset(
)

X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1).T
X_test_flatten = X_test_orig.reshape(X_test_orig.shape[0], -1).T

X_train = X_train_flatten / 255.
X_test = X_test_flatten / 255.

Y_train = tools.convert_to_one_hot(Y_train_orig, 6)
Y_test = tools.convert_to_one_hot(Y_test_orig, 6)

parameters = model(X_train, Y_train, X_test, Y_test)
Exemplo n.º 37
0
        np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            plt.plot(features[labels == lab, 0], features[
                     labels == lab, 2], ma, c=(1., 1., 1.))


features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

train_plot(features, labels)
plt.savefig('figure4.png')

features -= features.mean(0)
features /= features.std(0)
train_plot(features, labels)
plt.savefig('figure5.png')
Exemplo n.º 38
0
import util
import load
import network
import keras
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from keras.models import Model
import scipy.io as scio
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

MAX_EPOCHS = 160
batch_size = 32
if __name__ == '__main__':
    params = util.config()
    save_dir = params['save_dir']

    print("Loading training set...")
    train = load.load_dataset(params['train'])
    print("Loading dev set...")
    dev = load.load_dataset(params['dev'])
    print("Building preprocessor...")
    preproc = load.Preproc(*train)
    print("Training size: " + str(len(train[0])) + " examples.")
    print("Dev size: " + str(len(dev[0])) + " examples.")

    params.update({
        "input_shape": [8000, 1],
        "num_categories": len(preproc.classes)
    })

    #create the cl-pcg-net
    model = network.build_network(**params)
Exemplo n.º 39
0
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

# Basic imports
from __future__ import print_function
import numpy as np
from load import load_dataset

# Import sklearn implementation of KNN
from sklearn.neighbors import KNeighborsClassifier

features, labels = load_dataset('aaj_data')
classifier = KNeighborsClassifier(n_neighbors=4)

n = len(features)
correct = 0.0
for ei in range(n):
    training = np.ones(n, bool)
    training[ei] = 0
    testing = ~training
    classifier.fit(features[training], labels[training])
    pred = classifier.predict(features[[ei]])  # predict expects a 2-D array
    correct += float(pred[0] == labels[ei])
print('Result of leave-one-out: {}'.format(correct / n))

# Import KFold object
from sklearn.cross_validation import KFold
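sklearn.cross_validation has since been removed from scikit-learn; the same leave-one-out and k-fold evaluations can be written against sklearn.model_selection (sketch with synthetic features/labels):

import numpy as np
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

features = np.random.rand(30, 7)
labels = np.random.randint(0, 3, size=30)

classifier = KNeighborsClassifier(n_neighbors=4)
loo_scores = cross_val_score(classifier, features, labels, cv=LeaveOneOut())
kfold_scores = cross_val_score(classifier, features, labels, cv=KFold(n_splits=5))
print('Leave-one-out accuracy: {:.3f}'.format(loo_scores.mean()))
print('5-fold accuracy: {:.3f}'.format(kfold_scores.mean()))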
Exemplo n.º 40
0
def predict(parser):
    val = load.load_dataset("data/validation_2.json")
    preproc = load.preproc(*val)

    args = parser.parse_args()
    print("args model : ", args.model)

    model = architecture.build_model()
    model.load_weights(args.model)

    with open("data/validation_2.json", "rb") as fid:
        val_labels = [json.loads(l)['labels'] for l in fid]

    counts = collections.Counter(preproc.class_to_int[l[0]]
                                 for l in val_labels)
    counts = sorted(counts.most_common(), key=lambda x: x[0])
    counts = list(zip(*counts))[1]

    print("counts : ", counts)

    smooth = 500
    counts = np.array(counts)[None, None, :]
    total = np.sum(counts) + counts.shape[1]
    print("total : ", total)
    prior = (counts + smooth) / float(total)  # smoothed class prior
    print("prior : ", prior)

    ecgs, committee_labels = preproc.process(*val)
    m_probs = model.predict(ecgs)

    committee_labels = np.argmax(committee_labels, axis=2)
    committee_labels = committee_labels[:, 0]

    print("===================")
    temp = []
    preds = np.argmax(m_probs / prior, axis=2)
    for i, j in zip(preds, val_labels):
        t = sst.mode(i[:len(j) - 1])[0][0]
        temp.append(t)
        #print(i[:len(j)-1])

    preds = temp

    #print("preds : \n", preds)

    report = skm.classification_report(committee_labels,
                                       preds,
                                       target_names=preproc.classes,
                                       digits=3)
    scores = skm.precision_recall_fscore_support(committee_labels,
                                                 preds,
                                                 average=None)
    print("report : \n", report)

    cm = confusion_matrix(committee_labels, preds)
    print("confusion matrix : \n", cm)

    f1 = f1_score(committee_labels, preds, average='micro')
    #print("f1_score : ", f1)

    # ***roc_auc_score - m_probs***
    s_probs = np.sum(m_probs, axis=1)
    s_probs = s_probs / 71  # normalize: 71 is the maximum number of elements in one record

    #ovo_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovo')
    ovr_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovr')

    print("ovr_auroc : ", ovr_auroc)
    #print("ovo_auroc : ", ovo_auroc)
    '''
        bootstrapping
    '''
    n_bootstraps = 100
    np.random.seed(3033)

    total_precision = []
    total_recall = []
    total_f1 = []
    total_auroc = []

    precision = []
    recall = []
    f1 = []

    total = []

    for j in range(n_bootstraps):
        indices = np.random.randint(0, len(m_probs), 100)  # random_integers is deprecated

        #print("indices : ", len(indices))

        if len(np.unique(committee_labels[indices])) < 2:
            continue

        sub_labels = []
        sub_result = []
        sub_probs = []

        #print(indices)

        for i in indices:
            sub_labels.append(committee_labels[i])
            sub_result.append(preds[i])
            sub_probs.append(m_probs[i])

        s_scores = precision_recall_fscore_support(sub_labels,
                                                   sub_result,
                                                   labels=[0, 1, 2, 3],
                                                   average=None)

        # ***roc_auc_score - m_probs***
        s_p = np.sum(sub_probs, axis=1)
        s_p = s_p / 71  # normalize: 71 is the maximum number of elements in one record

        # ovo_auroc = roc_auc_score(committee_labels, s_probs, multi_class='ovo')
        #print(sub_labels)
        #print(s_p)

        try:
            s_auroc = roc_auc_score(sub_labels, s_p, multi_class='ovr')
        except:
            s_auroc = -1

        #print(s_scores)
        precision.append(np.array(s_scores[0]))
        recall.append(np.array(s_scores[1]))
        f1.append(np.array(s_scores[2]))
        #auroc.append(s_auroc)

        total_precision.append(np.average(s_scores[0]))
        total_recall.append(np.average(s_scores[1]))
        total_f1.append(np.average(s_scores[2]))
        total_auroc.append(s_auroc)

    total_precision.sort()
    total_recall.sort()
    total_f1.sort()
    total_auroc.sort()

    total_auroc = [a for a in total_auroc if a != -1]  # drop bootstrap rounds where AUROC failed
    #print(total_auroc)
    '''
        During bootstrapping, some classes may be missing from a resample
    '''
    precision = np.array(precision)
    precision[precision == .0] = np.nan
    recall = np.array(recall)
    recall[recall == .0] = np.nan
    f1 = np.array(f1)
    f1[f1 == .0] = np.nan

    #print(total_auroc)

    for i in range(4):
        pre = precision[:, i]
        pre.sort()
        rec = recall[:, i]
        rec.sort()
        f = f1[:, i]
        f.sort()

        pre = np.round(pre[int(len(pre) * 0.025):int(len(pre) * 0.975)], 3)
        rec = np.round(rec[int(len(rec) * 0.025):int(len(rec) * 0.975)], 3)
        f = np.round(f[int(len(f) * 0.025):int(len(f) * 0.975)], 3)
        '''
        print(i,
              " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(pre), 3), round(pre[0], 3), round(pre[-1], 3)),
              " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(rec), 3), round(rec[0], 3), round(rec[-1], 3)),
              " : ", "{0} ({1}, {2})".format(np.round(np.nanmean(f), 3), round(f[0], 3), round(f[-1], 3)))
        '''

        item = [
            i, "{0} ({1}, {2})".format(np.round(np.nanmean(pre), 3),
                                       round(np.nanmin(pre), 3),
                                       round(np.nanmax(pre), 3)),
            "{0} ({1}, {2})".format(np.round(np.nanmean(rec), 3),
                                    round(np.nanmin(rec), 3),
                                    round(np.nanmax(rec), 3)),
            "{0} ({1}, {2})".format(np.round(np.nanmean(f), 3),
                                    round(np.nanmin(f), 3),
                                    round(np.nanmax(f), 3))
        ]

        total.append(item)

    total_auroc = np.round(
        total_auroc[int(len(total_auroc) *
                        0.025):int(len(total_auroc) * 0.975)], 3)
    total_precision = np.round(
        total_precision[int(len(total_precision) *
                            0.025):int(len(total_precision) * 0.975)], 3)
    total_recall = np.round(
        total_recall[int(len(total_recall) *
                         .025):int(len(total_recall) * .975)], 3)
    total_f1 = np.round(
        total_f1[int(len(total_f1) * .025):int(len(total_f1) * .975)], 3)

    with open(args.file_name, "w", newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["", "precision", "recall", "f1-score", "auroc"])
        writer.writerow([
            "",
            "{0} ({1}, {2})".format(np.round(np.average(scores[0]), 3),
                                    total_precision[0], total_precision[-1]),
            "{0} ({1}, {2})".format(np.round(np.average(scores[1]), 3),
                                    total_recall[0], total_recall[-1]),
            "{0} ({1}, {2})".format(np.round(np.average(scores[2]), 3),
                                    total_f1[0], total_f1[-1]),
            "{0} ({1}, {2})".format(np.round(ovr_auroc, 3), total_auroc[0],
                                    total_auroc[-1]),
        ])
        for i in total:
            writer.writerow(i)
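The bootstrap above resamples the predictions 100 times and reports the 2.5th to 97.5th percentile range of each metric. A stripped-down sketch of that procedure for a single metric (illustrative; the real code also handles resamples with missing classes):

import numpy as np
from sklearn.metrics import f1_score

rng = np.random.RandomState(3033)
labels = rng.randint(0, 4, size=200)
preds = np.where(rng.rand(200) < 0.7, labels, rng.randint(0, 4, size=200))

scores = []
for _ in range(100):
    idx = rng.randint(0, len(labels), size=len(labels))  # resample with replacement
    if len(np.unique(labels[idx])) < 2:
        continue
    scores.append(f1_score(labels[idx], preds[idx], average='micro'))

scores.sort()
low, high = scores[int(len(scores) * 0.025)], scores[int(len(scores) * 0.975)]
print('micro-F1 95% bootstrap CI: ({:.3f}, {:.3f})'.format(low, high))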
Exemplo n.º 41
0
def test_seeds():
    features, labels = load_dataset('seeds')
    assert len(features[0]) == 7
    assert len(features)
    assert len(features) == len(labels)
Exemplo n.º 42
0
						pred[-1].append(everyclassifyer);
					
			#print 'Tags for this one', pred
			#assert(False)
		print pred
		return pred;



if __name__ == '__main__':

	# A refined form of what we are doing looks very similar to LDA/PGMs.
	# Deep learning that learns features end-to-end would be better.

	print 'Begin Loading samples...'
	train_samples,train_target = load.load_dataset(fname=load.filename['TRAIN'],numdocs=None);
	print 'number of training sample %d'  %len(train_target)
	print 'Tags for the last train example',train_target[-1]
	c=defaultdict(float)
	for each in train_target:
		for everytag in each:
			c[everytag]+=1;
	
	y = filter(lambda x: c[x]>=500.0 ,c.keys());
#y=['java']
	print y
	
	M1 = search_classify(True,y,'bow_bigram');
	M1.train(train_target,train_samples);
	eval.evaluate([(M1,u'tfidf_LR.csv')],set(M1.classifyers.keys()));
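The pipeline above keeps only tags with at least 500 occurrences and trains one classifier per tag on bag-of-bigram features; a compact one-vs-rest sketch with scikit-learn stand-ins for the custom search_classify model:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer

docs = ["public static void main", "def foo(): return 1", "SELECT name FROM users"]
tags = [["java"], ["python"], ["sql"]]

y = MultiLabelBinarizer().fit_transform(tags)            # multi-label indicator matrix
clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)),
                    OneVsRestClassifier(LogisticRegression(max_iter=1000)))
clf.fit(docs, y)
print(clf.predict(["static void main args"]))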