Example #1
        header = ret["header"]
        X = common_utils.extract_features_from_rawdata(chunk, header,
                                                       args.period,
                                                       args.features)
        Xs.append(X)
        ys += y
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)
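
# The function above is visible only from its tail. A minimal sketch of the
# complete helper, assuming common_utils.read_chunk(reader, n) returns a dict
# with "X", "y" and "header" entries (that signature is an assumption; it is
# not shown in this excerpt):
def read_and_extract_features(reader, count):
    read_chunk_size = 1000  # assumed size for incremental reading
    Xs = []
    ys = []
    for i in range(0, count, read_chunk_size):
        ret = common_utils.read_chunk(reader, min(read_chunk_size, count - i))
        X = common_utils.extract_features_from_rawdata(ret["X"], ret["header"],
                                                       args.period,
                                                       args.features)
        Xs.append(X)
        ys += ret["y"]
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)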


print "==> reading data and extracting features"
#chunk_size = 8000  # TODO: bigger chunk_size

(train_X,
 train_y) = read_and_extract_features(train_reader,
                                      train_reader.get_number_of_examples())
del train_reader

(val_X, val_y) = read_and_extract_features(val_reader,
                                           val_reader.get_number_of_examples())
del val_reader

(test_X,
 test_y) = read_and_extract_features(test_reader,
                                     test_reader.get_number_of_examples())
del test_reader

print "==> imputing missing values"
# impute missing values
imputer = Imputer(missing_values=np.nan,
                  strategy='mean',
Example #2
        process_one_chunk("train", chunk_index)
        cnt_trained = chunk_index - n_trained_chunks + 1

        if (cnt_trained % 5 == 0):
            val_loss = process_one_chunk("test", chunk_index)
            if ((cnt_trained // 5) % args.save_every == 0):
                state_name = 'states/%s.chunk%d.test%.8f.state' % (
                    network_name, chunk_index, val_loss)

                print "==> saving ... %s" % state_name
                network.save_params(state_name, chunk_index)

        print "chunk %d took %.3fs" % (chunk_index,
                                       float(time.time()) - start_time)

        chunks_per_epoch = train_reader.get_number_of_examples() // chunk_size
        if (cnt_trained % chunks_per_epoch == 0):
            train_reader.random_shuffle()
            val_reader.random_shuffle()

elif args.mode == 'test':
    # ensure that the code uses test_reader
    del train_reader
    del val_reader

    test_reader = DecompensationReader(
        dataset_dir='../../data/decompensation/test/',
        listfile='../../data/decompensation/test_listfile.csv')

    n_batches = test_reader.get_number_of_examples() // args.batch_size
    y_true = []
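
    # The excerpt stops here. A minimal sketch of how the evaluation loop
    # typically continues, modeled on the loop in Example #5 below;
    # test_data_gen and predict_fn are hypothetical names, since the batch
    # generator and the network's prediction API are not shown here:
    predictions = []
    for i in range(n_batches):
        x, y = next(test_data_gen)  # hypothetical batch generator
        pred = predict_fn(x)        # hypothetical: one probability per example
        predictions += list(pred)
        y_true += list(y)
    metrics.print_metrics_binary(y_true, predictions)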
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(dataset_dir='../../../data/decompensation/train/',
                                        listfile='../../../data/decompensation/train_listfile.csv')

    val_reader = DecompensationReader(dataset_dir='../../../data/decompensation/train/',
                                      listfile='../../../data/decompensation/val_listfile.csv')

    test_reader = DecompensationReader(dataset_dir='../../../data/decompensation/test/',
                                       listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        with open(os.path.join('results', 'train_{}.json'.format(file_name)), "w") as res_file:
            ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        prediction = logreg.predict_proba(test_X)[:, 1]

        with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(test_y, prediction)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        save_results(test_names, test_ts, prediction, test_y, os.path.join('predictions', file_name + '.csv'))
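
# save_results is called above but not defined in this excerpt. A minimal
# sketch, assuming it writes one CSV row per prediction so results can be
# re-scored later; the exact column names here are an assumption:
def save_results(names, ts, predictions, labels, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write("stay,period_length,prediction,y_true\n")
        for name, t, pred, y in zip(names, ts, predictions, labels):
            f.write("{},{:.6f},{:.6f},{}\n".format(name, t, pred, y))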
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')

    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')

    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names,
     train_ts) = read_and_extract_features(train_reader, n_train, args.period,
                                           args.features)

    (val_X, val_y, val_names,
     val_ts) = read_and_extract_features(val_reader, n_val, args.period,
                                         args.features)

    (test_X, test_y, test_names,
     test_ts) = read_and_extract_features(test_reader,
                                          test_reader.get_number_of_examples(),
                                          args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                          C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        with open(os.path.join('results', 'train_{}.json'.format(file_name)),
                  "w") as res_file:
            ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        with open(os.path.join('results', 'val_{}.json'.format(file_name)),
                  'w') as res_file:
            ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        prediction = logreg.predict_proba(test_X)[:, 1]

        with open(os.path.join('results', 'test_{}.json'.format(file_name)),
                  'w') as res_file:
            ret = print_metrics_binary(test_y, prediction)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
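
# Note: sklearn.preprocessing.Imputer, used above, was deprecated in
# scikit-learn 0.20 and removed in 0.22. A minimal sketch of the same
# impute-then-scale preprocessing with the current API; SimpleImputer has no
# axis/verbose arguments and imputes column-wise by default:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
scaler = StandardScaler()
train_X = scaler.fit_transform(imputer.fit_transform(train_X))
val_X = scaler.transform(imputer.transform(val_X))
test_X = scaler.transform(imputer.transform(test_X))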
Example #5
    # ensure that the code uses test_reader
    del train_data_gen
    del val_data_gen

    if args.deep_supervision:
        del train_data_loader
        del val_data_loader
    else:
        del train_reader
        del val_reader

    test_reader = DecompensationReader(
        dataset_dir='../../data/decompensation/test/',
        listfile='../../data/decompensation/test_listfile.csv')

    test_nbatches = test_reader.get_number_of_examples() // args.batch_size
    test_nbatches = 10000  # hard-coded override of the value computed above
    test_data_gen = utils.BatchGen(test_reader, discretizer, normalizer,
                                   args.batch_size, test_nbatches)
    labels = []
    predictions = []
    for i in range(test_nbatches):
        print "\rpredicting {} / {}".format(i, test_nbatches),
        x, y = next(test_data_gen)
        x = np.array(x)
        pred = model.predict_on_batch(x)[:, 0]
        predictions += list(pred)
        labels += list(y)

    metrics.print_metrics_binary(labels, predictions)
    with open("activations.txt", "w") as fout:
Example #6
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)


print "==> reading data and extracting features"
chunk_size = 100000  # TODO: bigger chunk_size

(train_X, train_y) = read_and_extract_features(train_reader, chunk_size)
del train_reader

(val_X, val_y) = read_and_extract_features(val_reader, chunk_size)
del val_reader

(test_X,
 test_y) = read_and_extract_features(test_reader,
                                     test_reader.get_number_of_examples())
del test_reader

print "==> imputing missing values"
# impute missing values
imputer = Imputer(missing_values=np.nan,
                  strategy='mean',
                  axis=0,
                  verbose=0,
                  copy=True)
imputer.fit(train_X)
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

print "==> normalizing data"