# NOTE(review): the statements up to the `return` continue a function whose
# header lies above the reviewed span (presumably `parse_dermatology`, which
# the script below calls); indentation is reconstructed from the statement
# structure -- confirm against the full file.
    labels = []
    unencoded_data = []
    for line in text.splitlines():
        splitted = line.split(",")
        # Every record must have exactly 35 comma-separated fields.
        assert(len(splitted) == 35)
        # Records containing a '?' field are dropped entirely.
        missing_val = False
        for attr in splitted:
            if attr == '?':
                missing_val = True
                break
        if missing_val:
            continue
        # The last field is the integer label; the first 34 are the attributes.
        labels.append(int(splitted[34]))
        unencoded_data.append(splitted[:34])
    # `enc` is created above the visible span; its output is converted with
    # `.toarray()` before being returned.
    data = enc.fit_transform(unencoded_data).toarray()
    print(data.shape)
    return (data, np.array(labels))


if __name__ == "__main__":
    # Parse the raw file, print the first ten rows and labels, then split
    # the data and save each partition.
    (data, labels) = parse_dermatology("../data/dermatology/dermatology.data")
    print(data[:10])
    print(labels[:10])
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "dermatology_train")
    utils.save_protobuf(test_data, test_labels, "dermatology_test")
dst_host_serror_rate: continuous. dst_host_srv_serror_rate: continuous. dst_host_rerror_rate: continuous. dst_host_srv_rerror_rate: continuous. """ def parse_kddcup(fp): all_cols = [] numeric_cols = [] nominal_cols = [] label_col = ['label'] for line in raw_name.splitlines()[1:]: col = line.split(":")[0] col_type = line.split(":")[1][1:-1] if (col_type == 'continuous'): numeric_cols.append(col) elif (col_type == 'symbolic'): nominal_cols.append(col) else: assert(False) all_cols.append(col) df = pandas.read_csv(fp, names=all_cols+label_col) return utils.parse_data_with_pandas(df, [], numeric_cols, label_col, nominal_cols) if __name__ == "__main__": data, labels = parse_kddcup("../data/kddcup/kddcup.data_10_percent") (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels, 0.1) utils.save_protobuf(train_data, train_labels, "kddcup_train") utils.save_protobuf(test_data, test_labels, "kddcup_test")
# NOTE(review): the next line is the tail of a function whose header and loops lie above the reviewed span; its statement nesting cannot be recovered from here, so it is kept exactly as found.
num_correct += torch.sum(torch.eq(preds, genders_batch)) total += preds.size(0) epoch_end = time.time() print("Epoch %d took %s" % (epoch, utils.sec2str(int(epoch_end - epoch_start)))) print("acc: ", float(num_correct) / float(total)) end = time.time() print(utils.sec2str(int(end - start)))


if __name__ == "__main__":
    # Extract features with a pretrained VGG16, rescale them per feature,
    # then split and save them.
    device = "cuda:0"
    # NOTE(review): the extent of the `no_grad` block is reconstructed; the
    # statements after `get_features` may originally sit outside it -- confirm.
    with torch.no_grad():
        # vgg16 = load_face_model("../../caffemodel2pytorch/gender.caffemodel.pt").to(device)
        vgg16 = torchvision.models.vgg16(pretrained=True)
        vgg16 = expose_last_fc(vgg16).to(device)
        # evaluate_model(128, 10000, device)
        data, labels = get_features(128, 1024, vgg16, device)

        # normalize each feature to be [0,1)
        # `min`/`max` along dim 0 return (values, indices); [0] keeps the values.
        data = data - data.min(dim=0)[0]
        # The 1e-6 keeps the division defined when a feature's maximum is zero.
        data = data / (data.max(dim=0)[0] + 1e-6)

        # Move everything to host memory as NumPy arrays before splitting.
        data = data.to('cpu').numpy()
        labels = labels.to('cpu').numpy()
        (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels, 0.1)
        utils.save_protobuf(train_data, train_labels, "imdb_train")
        utils.save_protobuf(test_data, test_labels, "imdb_test")
'priority': 3, 'spec_prior': 4, } def parse_nursery(fp): with open(fp) as f: text = f.read() enc = OneHotEncoder(handle_unknown='error') unencoded_data = [] labels = [] for line in text.splitlines(): splitted = line.split(",") assert(len(splitted) == 9) unencoded_data.append(splitted[:-1]) labels.append(label_map[splitted[8]]) data = enc.fit_transform(unencoded_data).toarray() assert(data.shape == (len(labels), 3+5+4+4+3+2+3+3)) return (data, np.array(labels)) if __name__ == '__main__': (data, labels) = parse_nursery('../data/nursery/nursery.data') print(data[:10]) print(labels[:10]) (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels) utils.save_protobuf(train_data, train_labels, "nursery_train") utils.save_protobuf(test_data, test_labels, "nursery_test")
import pandas
import utils
import numpy as np


def parse_creditcard(fp):
    """Load the credit-card spreadsheet at `fp` and encode it.

    Reads the sheet named ``Data``, discards its first row, and passes the
    rest to `utils.parse_data_with_pandas` with columns ``X1``..``X23`` as
    numeric features, ``Y`` as the label, and no dropped or nominal columns.

    Args:
        fp: path of the Excel workbook.

    Returns:
        The result of `utils.parse_data_with_pandas`; the script below
        unpacks it as ``(data, labels)``.
    """
    sheet = pandas.read_excel(fp, sheet_name='Data')
    # Skip the first row of the sheet before parsing.
    records = sheet[1:]
    feature_names = ["X" + str(idx) for idx in range(1, 24)]
    return utils.parse_data_with_pandas(records, [], feature_names, ["Y"], [])


if __name__ == "__main__":
    # Parse the workbook, split it, and save both partitions.
    features, targets = parse_creditcard('../data/creditcard/creditcard.xls')
    train_x, train_y, test_x, test_y = utils.split_train_test(features, targets)
    utils.save_protobuf(train_x, train_y, "creditcard_train")
    utils.save_protobuf(test_x, test_y, "creditcard_test")
import utils
import numpy as np
import struct


def parse_images(fp):
    """Read a binary image file into a float matrix with one row per image.

    The file starts with four big-endian unsigned 32-bit integers (a magic
    value, the image count, the row count and the column count); every byte
    after that header is read as one unsigned 8-bit pixel.

    Args:
        fp: path of the binary image file.

    Returns:
        A float NumPy array of shape ``(count, rows * cols)``.
    """
    byte_dtype = np.dtype(np.uint8).newbyteorder(">")
    with open(fp, "rb") as stream:
        _magic, count = struct.unpack(">II", stream.read(8))
        height, width = struct.unpack(">II", stream.read(8))
        pixels = np.fromfile(stream, dtype=byte_dtype)
    return pixels.reshape((count, height * width)).astype(float)


def parse_labels(fp):
    """Read a binary label file into an integer vector.

    The first eight bytes (two big-endian unsigned 32-bit integers) are a
    header whose values are not used; every following byte is one label.

    Args:
        fp: path of the binary label file.

    Returns:
        A one-dimensional integer NumPy array of labels.
    """
    byte_dtype = np.dtype(np.uint8).newbyteorder(">")
    with open(fp, "rb") as stream:
        # The header is unpacked only to consume and validate its eight bytes.
        _magic, _count = struct.unpack(">II", stream.read(8))
        raw = np.fromfile(stream, dtype=byte_dtype)
    return raw.astype(int)


if __name__ == "__main__":
    # Only the training set is converted; the test-set conversion is disabled.
    images = parse_images("../raw_data/mnist/mnist500k-patterns")
    targets = parse_labels("../raw_data/mnist/mnist500k-labels")
    utils.save_protobuf(images, targets, "mnist500k_train")
    # test_data = parse_images("../data/mnist/test10k-patterns")
    # test_labels = parse_labels("../data/mnist/test10k-labels")
    # utils.save_protobuf(test_data, test_labels, "mnist60k_test")
df = df.drop(drop_cols, axis=1) numeric_df = df[numeric_cols] label_df = df[label_col] nominal_df = df.drop(numeric_cols + label_col, axis=1) numeric = np.array(numeric_df.as_matrix()) nominal = OneHotEncoder().fit_transform(nominal_df.as_matrix()).toarray() labels = np.array(LabelEncoder().fit_transform(label_df.as_matrix().ravel())) # first 8 columns are numeric, and the rest are nominal for total of 155 columns # 0: 1-14 # 1: 0-6 # 2: 1-132 # 3: 1-81 # 4: 0-42 # 5: 0-76 # 6: 0-21 # 7: 1-16 data = np.concatenate((numeric, nominal), axis=1) print("data shape: ", data.shape) print("labels shape: ", labels.shape) return data, labels if __name__ == "__main__": data, labels = parse_diabetes('../data/diabetes/diabetic_data.csv') (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels) utils.save_protobuf(train_data, train_labels, "diabetes_train") utils.save_protobuf(test_data, test_labels, "diabetes_test")