示例#1
0
def wd_train_get_batch(title_len=30, batch_size=128):
    """Pad, shuffle and split the word-level training data, then batch it.

    Loads the word-level title and content arrays from ``../data``, pads
    them through ``pad_X30`` / ``wd_pad_cut_docs`` in a ``Pool(6)``, joins
    title and content features column-wise, shuffles features and labels
    with a fixed seed, keeps the first 100000 shuffled samples as the
    validation set and passes both splits to ``train_batch``.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train_title and train_content, this should cost minutes, please wait.')
    raw_titles = np.load('../data/wd_train_title.npy')
    raw_docs = np.load('../data/wd_train_content.npy')

    workers = Pool(6)
    title_feats = np.asarray(list(workers.map(pad_X30, raw_titles)))
    doc_feats = np.asarray(list(workers.map(wd_pad_cut_docs, raw_docs)))
    workers.close()
    workers.join()

    # Collapse every padded document into a single row of 30 * 10 columns
    # so it can be stacked next to the title features.
    doc_feats = doc_feats.reshape(-1, 30 * 10)
    features = np.hstack([title_feats, doc_feats])
    labels = np.load('../data/y_tr.npy')

    # Split off the validation set.
    np.random.seed(13)  # fixed seed -> the same split on every run
    holdout = 100000
    order = np.random.permutation(features.shape[0])
    features = features[order]
    labels = labels[order]
    X_valid, X_train = features[:holdout], features[holdout:]
    y_valid, y_train = labels[:holdout], labels[holdout:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set first, then the training set.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, wd_train_path, batch_size)
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Pad, shuffle and split the char-level training data, then batch it.

    Loads the char-level title and content arrays from ``../data``, pads
    them through ``pad_X52`` / ``pad_X300`` in a default-sized ``Pool``,
    stacks title and content features column-wise, shuffles with a fixed
    seed, keeps the first 100000 shuffled samples for validation and hands
    both splits to ``train_batch``.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        content_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    raw_titles = np.load('../data/ch_train_title.npy')
    raw_contents = np.load('../data/ch_train_content.npy')

    workers = Pool()
    title_feats = np.asarray(workers.map(pad_X52, raw_titles))
    content_feats = np.asarray(workers.map(pad_X300, raw_contents))
    workers.close()
    workers.join()

    features = np.hstack([title_feats, content_feats])
    labels = np.load('../data/y_tr.npy')

    # Split off the validation set.
    np.random.seed(13)  # fixed seed -> the same split on every run
    holdout = 100000
    order = np.random.permutation(features.shape[0])
    features = features[order]
    labels = labels[order]
    X_valid, X_train = features[:holdout], features[holdout:]
    y_valid, y_train = labels[:holdout], labels[holdout:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set first, then the training set.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, ch_train_path, batch_size)
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Create char-level validation and training batches from ``../data``.

    Steps: load the raw title/content arrays, pad them with ``pad_X52`` and
    ``pad_X300`` inside a ``Pool``, concatenate the two feature blocks
    horizontally, permute samples and labels with seed 13, reserve the
    first 100000 permuted samples for validation, and call ``train_batch``
    once per split.

    Args:
        title_len: Unused here; present only as part of the signature.
        content_len: Unused here; present only as part of the signature.
        batch_size: Passed straight through to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    titles = np.load('../data/ch_train_title.npy')
    contents = np.load('../data/ch_train_content.npy')

    pool = Pool()
    padded_titles = np.asarray(pool.map(pad_X52, titles))
    padded_contents = np.asarray(pool.map(pad_X300, contents))
    pool.close()
    pool.join()

    samples = np.hstack([padded_titles, padded_contents])
    targets = np.load('../data/y_tr.npy')

    # Carve out the validation set from a seeded permutation.
    total = samples.shape[0]
    np.random.seed(13)
    n_valid = 100000
    shuffled = np.random.permutation(total)
    samples = samples[shuffled]
    targets = targets[shuffled]
    X_valid = samples[:n_valid]
    y_valid = targets[:n_valid]
    X_train = samples[n_valid:]
    y_train = targets[n_valid:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    print('creating batch data.')
    # Validation batches.
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    # Training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, ch_train_path, batch_size)
def wd_train_get_batch(title_len=30, batch_size=128):
    """Create word-level validation and training batches from ``../data``.

    Steps: load the raw title/content arrays, pad them with ``pad_X30`` and
    ``wd_pad_cut_docs`` inside a ``Pool(6)``, flatten each padded document
    to 30 * 10 columns, concatenate title and content features
    horizontally, permute samples and labels with seed 13, reserve the
    first 100000 permuted samples for validation, and call ``train_batch``
    once per split.

    Args:
        title_len: Unused here; present only as part of the signature.
        batch_size: Passed straight through to ``train_batch``.
    """
    print('loading word train_title and train_content, this should cost minutes, please wait.')
    titles = np.load('../data/wd_train_title.npy')
    docs = np.load('../data/wd_train_content.npy')

    pool = Pool(6)
    padded_titles = np.asarray(pool.map(pad_X30, titles))
    padded_docs = np.asarray(pool.map(wd_pad_cut_docs, docs))
    pool.close()
    pool.join()

    # One row of 30 * 10 columns per document, so hstack can join it with
    # the title block.
    padded_docs = padded_docs.reshape(-1, 30 * 10)
    samples = np.hstack([padded_titles, padded_docs])
    targets = np.load('../data/y_tr.npy')

    # Carve out the validation set from a seeded permutation.
    total = samples.shape[0]
    np.random.seed(13)
    n_valid = 100000
    shuffled = np.random.permutation(total)
    samples = samples[shuffled]
    targets = targets[shuffled]
    X_valid = samples[:n_valid]
    y_valid = targets[:n_valid]
    X_train = samples[n_valid:]
    y_train = targets[n_valid:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    print('creating batch data.')
    # Validation batches.
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    # Training batches.
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, wd_train_path, batch_size)
def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
    """Build char-level validation and training batches, freeing memory early.

    Loads the char-level title and content arrays from ``../data``, pads
    them with ``pad_X52`` / ``pad_X300`` in a ``Pool``, stacks the two
    feature blocks column-wise, shuffles samples and labels with seed 13,
    keeps the first 100000 shuffled samples for validation and passes each
    split to ``train_batch``. Intermediate arrays are deleted as soon as
    they are no longer needed to keep peak memory down.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        content_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    train_title = np.load('../data/ch_train_title.npy')
    train_content = np.load('../data/ch_train_content.npy')
    print('data loaded, start to pad_X52,X300')
    p = Pool()
    X_title = np.asarray(p.map(pad_X52, train_title))
    X_content = np.asarray(p.map(pad_X300, train_content))
    p.close()
    p.join()
    print('Pool finished!')
    # The raw (unpadded) inputs are never read again. Dropping them here
    # lowers the memory footprint before the hstack copy and the shuffled
    # copy below, which is where the original code ran out of memory.
    del train_title, train_content
    gc.collect()
    X = np.hstack([X_title, X_content])
    del X_title, X_content
    gc.collect()
    print('del X_title, X_content')
    y = np.load('../data/y_tr.npy')
    print('y label loaded ...')
    # Split off the validation set.
    sample_num = X.shape[0]
    np.random.seed(13)  # fixed seed -> the same split on every run
    valid_num = 100000
    new_index = np.random.permutation(sample_num)
    # Fancy indexing materialises a full shuffled copy of X; the unshuffled
    # array is released as soon as the name is rebound.
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    # Batch the validation set.
    print('creating batch data.')
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    print('release space, deleting X_valid, y_valid')
    del X_valid, y_valid
    gc.collect()
    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, ch_train_path, batch_size)
    print('release space, deleting X_train, y_train')
    del X_train, y_train
    gc.collect()
def wd_train_get_batch(title_len=30, content_len=150, batch_size=128):
    """Build word-level validation and training batches, generating labels.

    Loads the word-level title and content arrays from ``../data``, pads
    them with ``pad_X30`` / ``pad_X150`` in a ``Pool``, stacks the two
    feature blocks column-wise, obtains the labels from ``get_labels()``
    and saves them to ``../data/y_tr.npy``. Samples and labels are then
    shuffled with seed 13, the first 10000 shuffled samples are kept for
    validation, and each split is passed to ``train_batch``.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        content_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train title and content...')
    train_title = np.load('../data/wd_train_title.npy')
    train_content = np.load('../data/wd_train_content.npy')
    p = Pool()
    title = np.asarray(p.map(pad_X30, train_title))
    content = np.asarray(p.map(pad_X150, train_content))
    p.close()
    p.join()
    X = np.hstack([title, content])
    print('getting labels, this should cost several minutes, please wait...')

    y = get_labels()
    print('y.shape=', y.shape)
    # Persist the labels in their original (unshuffled) order.
    np.save('../data/y_tr.npy', y)

    sample_num = X.shape[0]
    np.random.seed(13)  # fixed seed -> the same split on every run
    valid_num = 10000
    new_index = np.random.permutation(sample_num)
    X = X[new_index]
    y = y[new_index]
    X_valid = X[:valid_num]
    # Labels must be sliced from y; the original sliced X here, which fed
    # the features to train_batch as if they were the targets.
    y_valid = y[:valid_num]
    X_train = X[valid_num:]
    y_train = y[valid_num:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
    print('creating batch data.')
    # Batch the validation set.
    sample_num = len(X_valid)
    print('valid_sample_num=%d' % sample_num)
    train_batch(X_valid, y_valid, wd_valid_path, batch_size)
    print('release space, deleting X_valid, y_valid')
    del X_valid, y_valid
    gc.collect()
    # Batch the training set.
    sample_num = len(X_train)
    print('train_sample_num=%d' % sample_num)
    train_batch(X_train, y_train, wd_train_path, batch_size)
    print('release space, deleting X_train, y_train')
    del X_train, y_train
    gc.collect()
def ch_train_get_batch(title_len=30, content_len=150, batch_size=128):
    """Build char-level validation and training batches, generating labels.

    Loads the char-level title and content arrays from ``../data``, pads
    or truncates them with ``pad_X30`` / ``pad_X150`` in a ``Pool``, stacks
    the two feature blocks column-wise, obtains the labels and saves them
    to ``../data/y_tr.npy``. Samples and labels are then shuffled with
    seed 13, the first 10000 shuffled samples are kept for validation, and
    each split is passed to ``train_batch``.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        content_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading char train_title and train_content.')
    raw_titles = np.load('../data/ch_train_title.npy')
    raw_contents = np.load('../data/ch_train_content.npy')

    # Pad / truncate in parallel.
    workers = Pool()
    title_feats = np.asarray(workers.map(pad_X30, raw_titles))
    content_feats = np.asarray(workers.map(pad_X150, raw_contents))
    workers.close()
    workers.join()

    features = np.hstack([title_feats, content_feats])
    print('getting labels, this should cost minutes, please wait.')
    # NOTE(review): spelled `get_lables` here, while other code in this
    # file calls `get_labels` -- confirm which name is actually defined.
    labels = get_lables()
    print('y.shape=', labels.shape)
    # Persist the labels in their original (unshuffled) order.
    np.save('../data/y_tr.npy', labels)

    # Split off the validation set.
    total = features.shape[0]
    print(total)
    np.random.seed(13)  # fixed seed -> the same split on every run
    holdout = 10000
    order = np.random.permutation(total)
    features = features[order]
    labels = labels[order]
    X_valid, X_train = features[:holdout], features[holdout:]
    y_valid, y_train = labels[:holdout], labels[holdout:]
    print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
    print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)

    # Batch the validation set first, then the training set.
    print('creating batch data.')
    print('valid_sample_num=%d' % len(X_valid))
    train_batch(X_valid, y_valid, ch_valid_path, batch_size)
    print('train_sample_num=%d' % len(X_train))
    train_batch(X_train, y_train, ch_train_path, batch_size)
def wd_train_get_batch(title_len=30, content_len=200, batch_size=128):
    """Pad, shuffle and batch the word-level training data from ``../data_new``.

    Loads titles, contents and labels, prints the first raw sample of each
    for inspection, pads the texts with ``pad_X30`` / ``pad_X200`` in a
    ``Pool``, stacks the two feature blocks column-wise, shuffles samples
    and labels together, and passes everything to ``train_batch``. No
    validation split is made here, and no seed is set inside this
    function, so the shuffle order depends on the global NumPy RNG state.

    Args:
        title_len: Not read by this function; kept for caller compatibility.
        content_len: Not read by this function; kept for caller compatibility.
        batch_size: Forwarded unchanged to ``train_batch``.
    """
    print('loading word train_title and train_content.')
    raw_titles = np.load('../data_new/wd_train_title.npy')
    raw_contents = np.load('../data_new/wd_train_content.npy')

    labels = np.load('../data_new/y_tr.npy')
    print('y.shape=', labels.shape)

    # Show the first raw sample of each input as a sanity check.
    print(" raw titles:", raw_titles[0], 'title.shape:', raw_titles.shape)
    print(" raw contents:", raw_contents[0], 'contents.shape:',
          raw_contents.shape)
    print("y:", labels[0], 'y.shape:', labels.shape)

    # Pad / truncate in parallel.
    workers = Pool()
    title_feats = np.asarray(workers.map(pad_X30, raw_titles))
    content_feats = np.asarray(workers.map(pad_X200, raw_contents))
    workers.close()
    workers.join()

    print("padding 20 X_title:", title_feats[0], "shape:", title_feats.shape)
    print("padding 100 X_contents:", content_feats[0], "shape:",
          content_feats.shape)

    # Concatenate title and content features, then shuffle rows and labels
    # with the same permutation.
    features = np.hstack([title_feats, content_feats])
    total = len(features)
    print('sample_num=%d' % total)
    order = np.random.permutation(total)
    features = features[order]
    labels = labels[order]
    print("X_train.shape:", features.shape, 'y_train.shape=', labels.shape)

    # Write the batches.
    print('creating batch data.')
    train_batch(features, labels, wd_train_path, batch_size)
示例#9
0
def _jieba_batch_split(data_dir, split, batch_size):
    """Batch one jieba split (``'train'`` or ``'valid'``) for every label task.

    Loads ``<data_dir><split>_data.npy`` once, then for each label task
    loads the matching label file and calls ``train_batch`` with the output
    directory ``<data_dir>batch/<task>/``.
    """
    # (output sub-directory, label file stem). The 'timelog' file really is
    # named '<split>_time_labellog.npy', hence the explicit stems.
    label_files = (
        ('law', 'law_label'),
        ('accu', 'accu_label'),
        ('time', 'time_label'),
        ('timelog', 'time_labellog'),
    )
    batch_root = data_dir + 'batch/'
    features = np.load(data_dir + split + '_data.npy')
    print('%s_sample_num=%d' % (split, len(features)))
    for task, stem in label_files:
        labels = np.load(data_dir + split + '_' + stem + '.npy')
        print('%s_sample_num_%s=%d' % (split, task, len(labels)))
        train_batch(features, labels, batch_root + task + '/', batch_size)
        # Release this task's labels before the next file is loaded.
        del labels


def jieba_train_get_batch(batch_size=config.BATCH_SIZE):
    """Create batches for all jieba label tasks, training split then validation.

    For each of the ``law``, ``accu``, ``time`` and ``timelog`` label sets,
    the features under ``jieba_train_path`` (then ``jieba_valid_path``) are
    paired with the corresponding labels and passed to ``train_batch``,
    which receives ``<path>batch/<task>/`` as its output location.

    Args:
        batch_size: Forwarded unchanged to ``train_batch``; defaults to
            ``config.BATCH_SIZE``.
    """
    print('loading word train_title and train_content.')
    # Batch the training split.
    _jieba_batch_split(jieba_train_path, 'train', batch_size)
    # Batch the validation split.
    _jieba_batch_split(jieba_valid_path, 'valid', batch_size)