def split_train_test(train_dir, vocab_dir, test_size, n_labeled, wordslength):
    """Build train/test/validation ``Dataset`` splits for active learning.

    Reads the labeled corpus, converts one-hot labels to integer class ids,
    and splits into a partially-labeled training set (only the first
    ``n_labeled`` labels are kept; the rest are ``None`` so an active learner
    can query them) plus test/validation sets.

    Args:
        train_dir: path to the labeled training text file.
        vocab_dir: path to the vocabulary file (built with 1000 words if missing).
        test_size: fraction of samples held out from training.
        n_labeled: number of training samples whose labels stay visible.
        wordslength: sequence length passed to ``process_file``.

    Returns:
        (trn_ds, tst_ds, y_train, fully_labeled_trn_ds, fully_tst_ds, val_ds)
    """
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, 1000)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    x, y = process_file(train_dir, word_to_id, cat_to_id, wordslength)

    # y rows are one-hot (produced by process_file), so argmax recovers the
    # class index directly — replaces the original O(n*k) Python double loop.
    listy = np.argmax(y, axis=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(x, listy, test_size=test_size)

    # Hide all labels beyond the first n_labeled so the active learner must
    # query them; dtype becomes object because of the None entries.
    trn_ds = Dataset(
        X_train,
        np.concatenate([y_train[:n_labeled],
                        [None] * (len(y_train) - n_labeled)]))

    fully_tst_ds = Dataset(X_test, y_test)

    # Split the held-out data 50/50 into validation and "real" test sets.
    X_val, X_real_test, y_val, y_real_test = \
        train_test_split(X_test, y_test, test_size=0.5)
    tst_ds = Dataset(X_real_test, y_real_test)
    val_ds = Dataset(X_val, y_val)

    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds, fully_tst_ds, val_ds
def split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size, val_size,
                         n_labeled, wordslength, categories_class):
    """Build parallel active-learning (bag-of-words) and RNN (padded-sequence)
    ``Dataset`` splits from the same corpus.

    The "al" datasets use per-document token-count vectors; the "rnn" datasets
    use sequences padded/truncated to ``wordslength``. Training labels beyond
    the first ``n_labeled`` are hidden (``None``) for active learning.

    Args:
        train_dir: path to the labeled training text file.
        vocab_dir: path to the vocabulary file (built with ``vocab_size`` words
            if missing).
        vocab_size: vocabulary size used when building the vocab file.
        test_size: fraction of samples held out for testing.
        val_size: fraction of the RNN training split held out for validation.
        n_labeled: number of training samples whose labels stay visible.
        wordslength: sequence length for padding and count-vector dimension.
        categories_class: category set passed to ``read_category``.

    Returns:
        (trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al,
         trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn)
    """
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, vocab_size)
    categories, cat_to_id = read_category(categories_class)
    words, word_to_id = read_vocab(vocab_dir)
    data_id, label_id = process_file_rnn(train_dir, word_to_id, cat_to_id,
                                         wordslength)

    # label_id -> one-hot -> argmax recovers the integer class index;
    # replaces the original O(n*k) Python double loop.
    y = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    listy = np.argmax(y, axis=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(data_id, listy, test_size=test_size)

    def _count_vectors(docs):
        # Bag-of-words: occurrence count of each token id per document.
        # The original `if a > 0: append(a) else: append(0)` branch was
        # redundant — list.count is always >= 0.
        # NOTE(review): the count-vector dimension is `wordslength`, not the
        # vocabulary size; this mirrors the original code but looks like it
        # may have been intended to be `vocab_size` — confirm upstream.
        return np.array([[doc.count(tok) for tok in range(wordslength)]
                         for doc in docs])

    X_train_al = _count_vectors(X_train)
    X_test_al = _count_vectors(X_test)

    # Hide labels beyond n_labeled so the active learner must query them.
    trn_ds_al = Dataset(
        X_train_al,
        np.concatenate([y_train[:n_labeled],
                        [None] * (len(y_train) - n_labeled)]))
    tst_ds_al = Dataset(X_test_al, y_test)

    # RNN view: fixed-length padded token-id sequences.
    X_train_rnn = kr.preprocessing.sequence.pad_sequences(X_train, wordslength)
    X_test_rnn = kr.preprocessing.sequence.pad_sequences(X_test, wordslength)
    X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = \
        train_test_split(X_train_rnn, y_train, test_size=val_size)

    trn_ds_rnn = Dataset(
        X_train_rnn,
        np.concatenate([y_train_rnn[:n_labeled],
                        [None] * (len(y_train_rnn) - n_labeled)]))
    val_ds_rnn = Dataset(X_val_rnn, y_val_rnn)
    tst_ds_rnn = Dataset(X_test_rnn, y_test)

    fully_labeled_trn_ds_al = Dataset(X_train_al, y_train)
    fully_labeled_trn_ds_rnn = Dataset(X_train_rnn, y_train_rnn)

    return trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al, \
        trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn