Example 1
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import utils


def parse_dermatology(fp):
    with open(fp) as f:
        text = f.read()

    # One-hot encode the 34 attribute columns; the last column is the class.
    enc = OneHotEncoder()

    labels = []
    unencoded_data = []
    for line in text.splitlines():
        splitted = line.split(",")
        assert(len(splitted) == 35)

        # Skip records that contain a missing value (encoded as '?').
        if '?' in splitted:
            continue

        # The last field is the class label; the first 34 fields are attributes.
        labels.append(int(splitted[34]))
        unencoded_data.append(splitted[:34])

    data = enc.fit_transform(unencoded_data).toarray()
    print(data.shape)
    return (data, np.array(labels))

if __name__ == "__main__":
    (data, labels) = parse_dermatology("../data/dermatology/dermatology.data")
    print(data[:10])
    print(labels[:10])
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "dermatology_train")
    utils.save_protobuf(test_data, test_labels, "dermatology_test")
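
These scripts lean on a local utils module that is not shown here. As an illustration only, split_train_test could be a simple shuffle-and-split along the following lines, with the optional third argument read as the test fraction (the real helper may differ):

import numpy as np

def split_train_test(data, labels, test_frac=0.2, seed=0):
    # Hypothetical sketch: shuffle row indices, then carve off a test fraction.
    rng = np.random.RandomState(seed)
    idx = rng.permutation(len(labels))
    n_test = int(len(labels) * test_frac)
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    return (data[train_idx], labels[train_idx],
            data[test_idx], labels[test_idx])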

Example 2
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""

def parse_kddcup(fp):
    all_cols = []
    numeric_cols = []
    nominal_cols = []
    label_col = ['label']
    for line in raw_name.splitlines()[1:]:
        col = line.split(":")[0]
        col_type = line.split(":")[1][1:-1]
        if col_type == 'continuous':
            numeric_cols.append(col)
        elif col_type == 'symbolic':
            nominal_cols.append(col)
        else:
            assert False, "unexpected column type: %s" % col_type
        all_cols.append(col)

    df = pandas.read_csv(fp, names=all_cols+label_col)
    return utils.parse_data_with_pandas(df, [], numeric_cols, label_col, nominal_cols)

if __name__ == "__main__":
    data, labels = parse_kddcup("../data/kddcup/kddcup.data_10_percent")
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels, 0.1)
    utils.save_protobuf(train_data, train_labels, "kddcup_train")
    utils.save_protobuf(test_data, test_labels, "kddcup_test")
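
For reference, a single line of the names block above splits like this (illustrative only; the "[1:-1]" slice drops the leading space and the trailing period):

line = "dst_host_serror_rate: continuous."
col = line.split(":")[0]             # "dst_host_serror_rate"
col_type = line.split(":")[1][1:-1]  # " continuous." -> "continuous"
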
Example 3
        num_correct += torch.sum(torch.eq(preds, genders_batch))
        total += preds.size(0)

        epoch_end = time.time()
        print("Epoch %d took %s" %
              (epoch, utils.sec2str(int(epoch_end - epoch_start))))

    print("acc: ", float(num_correct) / float(total))
    end = time.time()
    print(utils.sec2str(int(end - start)))


if __name__ == "__main__":
    device = "cuda:0"
    with torch.no_grad():
        # vgg16 = load_face_model("../../caffemodel2pytorch/gender.caffemodel.pt").to(device)
        vgg16 = torchvision.models.vgg16(pretrained=True)
        vgg16 = expose_last_fc(vgg16).to(device)

        # evaluate_model(128, 10000, device)
        data, labels = get_features(128, 1024, vgg16, device)
        # normalize each feature to be [0,1)
        data = data - data.min(dim=0)[0]
        data = data / (data.max(dim=0)[0] + 1e-6)
        data = data.to('cpu').numpy()
        labels = labels.to('cpu').numpy()
        (train_data, train_labels, test_data,
         test_labels) = utils.split_train_test(data, labels, 0.1)
        utils.save_protobuf(train_data, train_labels, "imdb_train")
        utils.save_protobuf(test_data, test_labels, "imdb_test")
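
expose_last_fc is defined elsewhere; a plausible reading, assumed here purely for illustration, is that it drops the final classification layer of VGG-16 so the forward pass returns the 4096-dimensional penultimate features:

import torch.nn as nn

def expose_last_fc(model):
    # Hypothetical sketch: keep everything in vgg16.classifier except the last
    # Linear layer, so model(x) yields 4096-d features instead of 1000 logits.
    model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])
    return model
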
Example 4
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import utils

# Map the five nursery class labels (see nursery.names) to integer ids.
label_map = {
    'not_recom': 0,
    'recommend': 1,
    'very_recom': 2,
    'priority': 3,
    'spec_prior': 4,
}

def parse_nursery(fp):
    with open(fp) as f:
        text = f.read()
    
    enc = OneHotEncoder(handle_unknown='error')

    unencoded_data = []
    labels = []
    for line in text.splitlines():
        splitted = line.split(",")
        assert(len(splitted) == 9)
        unencoded_data.append(splitted[:-1])
        labels.append(label_map[splitted[8]])

    data = enc.fit_transform(unencoded_data).toarray()
    assert(data.shape == (len(labels), 3+5+4+4+3+2+3+3))
    return (data, np.array(labels))
    
if __name__ == '__main__':
    (data, labels) = parse_nursery('../data/nursery/nursery.data')
    print(data[:10])
    print(labels[:10])
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "nursery_train")
    utils.save_protobuf(test_data, test_labels, "nursery_test")
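
The shape assertion reflects the cardinalities of the eight nominal nursery attributes (3+5+4+4+3+2+3+3 = 27 one-hot columns). A toy illustration of how OneHotEncoder arrives at a column count, using values drawn from the first two nursery attributes:

from sklearn.preprocessing import OneHotEncoder

toy = [["usual", "proper"],
       ["pretentious", "less_proper"],
       ["great_pret", "improper"]]
enc = OneHotEncoder()
print(enc.fit_transform(toy).toarray().shape)  # (3, 6): 3 + 3 categories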

Example 5
import pandas
import utils
import numpy as np


def parse_creditcard(fp):
    df = pandas.read_excel(fp, sheet_name='Data')
    # The first row under the X1..X23/Y header repeats the column descriptions; drop it.
    df = df[1:]
    drop_cols = []
    numeric_cols = ["X%d" % i for i in range(1, 24)]
    label_col = ["Y"]
    return utils.parse_data_with_pandas(df, drop_cols, numeric_cols, label_col,
                                        [])


if __name__ == "__main__":
    data, labels = parse_creditcard('../data/creditcard/creditcard.xls')
    (train_data, train_labels, test_data,
     test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "creditcard_train")
    utils.save_protobuf(test_data, test_labels, "creditcard_test")
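
The list comprehension above produces the 23 feature-column names used in the UCI sheet, X1 through X23:

numeric_cols = ["X%d" % i for i in range(1, 24)]
print(numeric_cols[0], numeric_cols[-1], len(numeric_cols))  # X1 X23 23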
Example 6
import utils
import numpy as np
import struct


def parse_images(fp):
    with open(fp, "rb") as f:
        # IDX header: big-endian uint32 magic number and image count,
        # followed by the row and column counts.
        magic, size = struct.unpack(">II", f.read(8))
        nrows, ncols = struct.unpack(">II", f.read(8))
        # The remaining bytes are pixel values, one unsigned byte per pixel.
        data = np.fromfile(f, dtype=np.dtype(np.uint8).newbyteorder(">"))
        data = data.reshape((size, nrows * ncols)).astype(float)
        return data


def parse_labels(fp):
    with open(fp, "rb") as f:
        magic, size = struct.unpack(">II", f.read(8))
        data = np.fromfile(f, dtype=np.dtype(
            np.uint8).newbyteorder(">")).astype(int)
        return data


if __name__ == "__main__":
    train_data = parse_images("../raw_data/mnist/mnist500k-patterns")
    train_labels = parse_labels("../raw_data/mnist/mnist500k-labels")
    utils.save_protobuf(train_data, train_labels, "mnist500k_train")

    # test_data = parse_images("../data/mnist/test10k-patterns")
    # test_labels = parse_labels("../data/mnist/test10k-labels")
    # utils.save_protobuf(test_data, test_labels, "mnist60k_test")
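
parse_images and parse_labels read the magic number but never check it. Assuming the mnist500k files follow the standard IDX encoding (magic 2051 for image files, 2049 for label files), a defensive variant could assert it:

def parse_images_checked(fp):
    # Same layout as parse_images above, with the header fields validated.
    with open(fp, "rb") as f:
        magic, size = struct.unpack(">II", f.read(8))
        assert magic == 2051, "not an IDX image file"
        nrows, ncols = struct.unpack(">II", f.read(8))
        data = np.fromfile(f, dtype=np.dtype(np.uint8).newbyteorder(">"))
        assert data.size == size * nrows * ncols
        return data.reshape((size, nrows * ncols)).astype(float)
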
Example 7
    df = df.drop(drop_cols, axis=1)
    numeric_df = df[numeric_cols]
    label_df = df[label_col]
    nominal_df = df.drop(numeric_cols + label_col, axis=1)

    # .as_matrix() was removed in pandas 1.0; .values is the portable equivalent.
    numeric = np.array(numeric_df.values)
    nominal = OneHotEncoder().fit_transform(nominal_df.values).toarray()
    labels = np.array(LabelEncoder().fit_transform(label_df.values.ravel()))

    # The first 8 columns are numeric and the rest are nominal, for a total of 155 columns.
    # Observed value ranges of the numeric columns:
    # 0: 1-14
    # 1: 0-6
    # 2: 1-132
    # 3: 1-81
    # 4: 0-42
    # 5: 0-76
    # 6: 0-21
    # 7: 1-16
    data = np.concatenate((numeric, nominal), axis=1)
    print("data shape: ", data.shape)
    print("labels shape: ", labels.shape)
    return data, labels

if __name__ == "__main__":
    data, labels = parse_diabetes('../data/diabetes/diabetic_data.csv')
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "diabetes_train")
    utils.save_protobuf(test_data, test_labels, "diabetes_test")
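
LabelEncoder maps the class strings to consecutive integers in sorted order; with the readmission labels used in this dataset, for example:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(["NO", ">30", "<30", "NO"]))  # [2 1 0 2]
print(le.classes_)                                   # ['<30' '>30' 'NO']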