示例#1
0
def split_all_data():
    """Shuffle the dataset and write 10 normalized train/test folds to disk.

    Features (``feats.17``) and labels (``time``) are read from
    ``../data/DATASET`` through ``util.read_data``. The rows are shuffled
    in place with a fixed NumPy seed so that repeated runs yield the same
    folds. For every fold the training part is passed through
    ``util.normalize_train_data`` and the scaler it returns is reused on the
    test part via ``util.normalize_test_data``. The results are written to
    ``SPLIT_DIR/DATASET/<fold>/train`` and ``.../test`` with five decimals.

    Raises:
        OSError: if an output directory cannot be created and does not
            already exist.
    """
    # READ DATA
    input_dir = os.path.join('..', 'data', DATASET)
    feats_file = os.path.join(input_dir, 'feats.17')
    labels_file = os.path.join(input_dir, 'time')
    data = util.read_data(feats_file, labels_file)

    # SHUFFLE DATA
    # Fixed seed: the split must be reproducible between runs.
    np.random.seed(1000)
    np.random.shuffle(data)

    # BUILD FOLDER STRUCTURE
    dataset_dir = os.path.join(SPLIT_DIR, DATASET)
    try:
        os.makedirs(dataset_dir)
    except OSError:
        # Only an already-existing directory is harmless; anything else
        # (permissions, a file in the way, ...) would make the writes below
        # fail with a less helpful error, so surface it here.
        if not os.path.isdir(dataset_dir):
            raise
        print("skipping folder creation")

    # SPLIT TRAIN/TEST AND SAVE
    fold_indices = cross_validation.KFold(data.shape[0], n_folds=10)
    for fold, index in enumerate(fold_indices):
        # index[0] selects the training rows, index[1] the test rows.
        print(index[0].shape)
        train_data = data[index[0]]
        test_data = data[index[1]]
        train_data, scaler = util.normalize_train_data(train_data)
        test_data = util.normalize_test_data(test_data, scaler)

        fold_dir = os.path.join(dataset_dir, str(fold))
        try:
            os.makedirs(fold_dir)
        except OSError:
            if not os.path.isdir(fold_dir):
                raise
            print("skipping fold dir")
        np.savetxt(os.path.join(fold_dir, 'train'), train_data, fmt="%.5f")
        np.savetxt(os.path.join(fold_dir, 'test'), test_data, fmt="%.5f")
def split_all_data():
    """Shuffle the dataset and write 10 train/test folds, including text.

    The feature matrix (``feats.17``), the labels (``time``) and the
    tab-delimited ``source``, ``target`` and ``target_postedited`` files are
    read from ``../data/DATASET`` and joined column-wise, so that a single
    seeded shuffle keeps every row aligned across all of them.

    For every fold, under ``SPLIT_DIR/DATASET/<fold>/``:

    * ``train`` / ``test`` hold the numeric columns (features followed by
      labels) converted to float. The training part goes through
      ``util.normalize_train_data`` and the scaler it returns is reused on
      the test part via ``util.normalize_test_data``.
    * ``train_src`` / ``test_src``, ``train_tgt`` / ``test_tgt`` and
      ``train_pe`` / ``test_pe`` hold the matching text columns, unchanged.

    Raises:
        OSError: if an output directory cannot be created and does not
            already exist.
    """
    # READ DATA
    input_dir = os.path.join('..', 'data', DATASET)
    feats_file = os.path.join(input_dir, 'feats.17')
    labels_file = os.path.join(input_dir, 'time')
    source_file = os.path.join(input_dir, 'source')
    target_file = os.path.join(input_dir, 'target')
    pe_file = os.path.join(input_dir, 'target_postedited')

    # Everything is loaded as ``object`` so numeric and text columns can live
    # in one array; ndmin=2 keeps single-column files as (rows, 1).
    # NOTE(review): np.loadtxt treats '#' as a comment marker by default, so
    # a sentence containing '#' would be truncated -- confirm the text files
    # never contain it, or pass ``comments=None`` if the NumPy in use allows.
    feats = np.loadtxt(feats_file, dtype=object, ndmin=2)
    labels = np.loadtxt(labels_file, dtype=object, ndmin=2)
    src = np.loadtxt(source_file, dtype=object, delimiter='\t', ndmin=2)
    tgt = np.loadtxt(target_file, dtype=object, delimiter='\t', ndmin=2)
    pe = np.loadtxt(pe_file, dtype=object, delimiter='\t', ndmin=2)

    data = np.concatenate((feats, labels, src, tgt, pe), axis=1)

    # Column offsets inside ``data``, derived from what was actually loaded
    # instead of assuming 17 features + 1 label.
    n_numeric = feats.shape[1] + labels.shape[1]
    src_col = n_numeric
    tgt_col = src_col + src.shape[1]
    pe_col = tgt_col + tgt.shape[1]
    text_columns = (('src', src_col), ('tgt', tgt_col), ('pe', pe_col))

    # SHUFFLE DATA
    # Fixed seed: the split must be reproducible between runs.
    np.random.seed(1000)
    np.random.shuffle(data)

    # BUILD FOLDER STRUCTURE
    dataset_dir = os.path.join(SPLIT_DIR, DATASET)
    try:
        os.makedirs(dataset_dir)
    except OSError:
        # Only an already-existing directory is harmless; anything else
        # (permissions, a file in the way, ...) would make the writes below
        # fail with a less helpful error, so surface it here.
        if not os.path.isdir(dataset_dir):
            raise
        print("skipping folder creation")

    # SPLIT TRAIN/TEST AND SAVE
    fold_indices = cross_validation.KFold(data.shape[0], n_folds=10)
    for fold, index in enumerate(fold_indices):
        # index[0] selects the training rows, index[1] the test rows.
        print(index[0].shape)
        train = data[index[0]]
        test = data[index[1]]
        train_data = np.array(train[:, :n_numeric], dtype=float)
        test_data = np.array(test[:, :n_numeric], dtype=float)
        train_data, scaler = util.normalize_train_data(train_data)
        test_data = util.normalize_test_data(test_data, scaler)

        fold_dir = os.path.join(dataset_dir, str(fold))
        try:
            os.makedirs(fold_dir)
        except OSError:
            if not os.path.isdir(fold_dir):
                raise
            print("skipping fold dir")
        np.savetxt(os.path.join(fold_dir, 'train'), train_data, fmt="%.5f")
        np.savetxt(os.path.join(fold_dir, 'test'), test_data, fmt="%.5f")
        for suffix, col in text_columns:
            np.savetxt(os.path.join(fold_dir, 'train_' + suffix),
                       train[:, col], fmt="%s")
            np.savetxt(os.path.join(fold_dir, 'test_' + suffix),
                       test[:, col], fmt="%s")