def vectorize_data(filenames, maxlen=100, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="boundary", return_chars=False):
    """Vectorize sentence-level sequences into padded index/label arrays.

    Iterates every document in *filenames* (via ``pp.get_documents``) and every
    sequence within each document (via ``pp.get_sequences``), producing one
    training example per sequence.

    Parameters
    ----------
    filenames : iterable of str
        Paths handed to ``pp.get_documents``.
    maxlen : int
        Length each token sequence is padded/truncated to.
    max_charlen : int
        Per-token character-row length (used only when ``return_chars``).
    output_label_size : int
        Number of classes for the one-hot encoding of Y.
    output_label_dict : dict
        Maps label strings to integer indices; must be provided.
    output_type : str
        ``"category"`` selects ``token.c_label``; any other value selects
        ``token.b_label`` (boundary labels).
    return_chars : bool
        Additionally build and return the padded character-index tensor.

    Returns
    -------
    ``(X, Y)`` — or ``(X, Y, X_char)`` when ``return_chars`` is true — where
    X is (n_seqs, maxlen), Y is one-hot (n_seqs, maxlen, output_label_size).
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    # Hoist the output_type decision out of the token loop: it is invariant.
    label_attr = "c_label" if output_type == "category" else "b_label"
    for filename in filenames:
        for docid, doc in pp.get_documents(filename):
            for seq in pp.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    # All indices are shifted by 1 so index 0 is reserved for padding.
                    x.append(1 + token.word_index)
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())
                    # NOTE(review): an unknown label yields -1 + 1 == 0, which
                    # collides with the padding index — confirm this is intended.
                    y.append(1 + output_label_dict.get(getattr(token, label_attr), -1))
                X.append(x)
                if return_chars:
                    # Left-pad with empty char rows up to maxlen, then append the
                    # real rows (truncated to maxlen tokens), each padded to
                    # max_charlen characters.
                    # `range` (not py2-only `xrange`) keeps this Python-3 safe.
                    filler = pad_sequences([[] for _ in range(max(0, maxlen - len(x_char)))],
                                           maxlen=max_charlen).tolist()
                    X_char.append(filler + pad_sequences(x_char[:maxlen],
                                                         maxlen=max_charlen).tolist())
                Y.append(y)
    X = np.array(pad_sequences(X, maxlen=maxlen))
    Y = vtu.to_onehot(pad_sequences(Y, maxlen=maxlen), output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="hybrid", return_chars=False):
    """Vectorize one example per document for document-level training.

    Using histogram of document lengths 2000 is a reasonable number train on.

    Supports three labelings: ``"hybrid"`` (``b_label-c_label`` pairs),
    ``"category"`` (``c_label``), otherwise boundary (``b_label``).  All
    word/char/label indices are shifted by one so index 0 stays free for
    padding.  Returns ``(X, Y)`` or ``(X, Y, X_char)`` when ``return_chars``.

    NOTE(review): unlike the sequence-level variant, this iterates the result
    of ``pp.get_sequences(doc)`` directly as a flat token stream — confirm
    that is the intended contract for document-level data.
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for filename in filenames:
        for docid, doc in pp.get_documents(filename):
            seq = pp.get_sequences(doc)
            words = []
            chars = []
            labels = []
            for token in seq:
                # Shift by 1: index 0 is reserved for padding.
                words.append(token.word_index + 1)
                if return_chars:
                    chars.append((np.array(token.char_seq) + 1).tolist())
                if output_type == "hybrid":
                    key = "%s-%s" % (token.b_label, token.c_label)
                elif output_type == "category":
                    key = token.c_label
                else:
                    key = token.b_label
                labels.append(output_label_dict.get(key, -1) + 1)
            X.append(words)
            if return_chars:
                # All-zero filler rows first, then the real char rows (truncated
                # to maxlen tokens), each padded to max_charlen characters.
                filler = pad_sequences([[] for k in xrange(maxlen - len(chars))],
                                       maxlen=max_charlen).tolist()
                X_char.append(filler + pad_sequences(chars[:maxlen],
                                                     maxlen=max_charlen).tolist())
            Y.append(labels)
    X = np.array(pad_sequences(X, maxlen=maxlen))
    Y = vtu.to_onehot(pad_sequences(Y, maxlen=maxlen), output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
def vectorize_data_old(filenames, maxlen=100, max_charlen=20, output_label_size=6, output_label_dict=None, output_type="boundary", return_chars=False):
    """Legacy vectorizer backed by the old preprocessing module (``pp_old``).

    Produces one example per sequence yielded by ``pp_old.get_sequences``.
    Word, character and label indices are shifted by one so that index 0 is
    reserved for padding.  ``output_type == "category"`` uses ``c_label``;
    anything else uses the boundary label ``b_label``.  Returns ``(X, Y)``,
    or ``(X, Y, X_char)`` when ``return_chars`` is true.
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp_old.get_documents(filename):
            for seq in pp_old.get_sequences(doc):
                word_ids = []
                char_rows = []
                label_ids = []
                for token in seq:
                    word_ids.append(token.word_index + 1)
                    if return_chars:
                        char_rows.append((np.array(token.char_seq) + 1).tolist())
                    if output_type == "category":
                        label = token.c_label
                    else:
                        label = token.b_label
                    label_ids.append(output_label_dict.get(label, -1) + 1)
                X.append(word_ids)
                if return_chars:
                    # Zero filler rows up to maxlen, then the real (truncated)
                    # char rows, each padded to max_charlen.
                    filler = pad_sequences([[] for k in xrange(maxlen - len(char_rows))],
                                           maxlen=max_charlen).tolist()
                    X_char.append(filler + pad_sequences(char_rows[:maxlen],
                                                         maxlen=max_charlen).tolist())
                Y.append(label_ids)
    X = np.array(pad_sequences(X, maxlen=maxlen))
    Y = vtu.to_onehot(pad_sequences(Y, maxlen=maxlen), output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
# NOTE(review): the six chunks below are fragments of the train/load driver
# script, each cut mid-statement at both edges (the opening of the first list
# comprehension, the `if`/`else` headers they belong to, and the trailing
# `model.fit(...` continuation are outside this view).  They are left
# byte-identical; only review comments are added between them.
# BUG(review): in this fragment the "Loaded data shapes" log formats
# `[k.shape for k in Y_train]` twice — the second occurrence should be Y_test.
for k in CONFIG["data_vectors"][::3] ] X_char_train, X_char_test = [ np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][1::3] ] logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape)) X_char_train = X_char_train.reshape( (X_char_train.shape[0], X_char_train.shape[1] * X_char_train.shape[2])) X_char_test = X_char_test.reshape( (X_char_test.shape[0], X_char_test.shape[1] * X_char_test.shape[2])) Y_train = [ vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][2:3], [hybrid_size]) ] Y_test = [ vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][5:6], [hybrid_size]) ] if model_type == "brnn_cnn_multitask": logger.info( "Loaded data shapes:\nX_train: %s, X_char_train: %s, Y_train: %s\nX_test: %s, X_char_test: %s, Y_test: %s" % (X_train.shape, X_char_train.shape, [k.shape for k in Y_train], X_test.shape, X_char_test.shape, [k.shape for k in Y_train])) if model_type == "brnn_cnn_multitask": model, output_names, _temp_models = gen_model_brnn_cnn_multitask( vocab_size=vocab_size,
# NOTE(review): save/else-vectorize/save branch for the single-task path;
# begins inside an `if` whose header is not visible here.
np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), X_test) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][4]), vtu.onehot_to_idxarr(Y_test[0])) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][5]), vtu.onehot_to_idxarr(Y_test[1])) else: X_train, Y_train = vectorize_data(train_files, maxlen=maxlen, output_label_size=labels_size, output_label_dict=labels_dict, output_type=label_type) X_test, Y_test = vectorize_data(test_files, maxlen=maxlen, output_label_size=labels_size, output_label_dict=labels_dict, output_type=label_type) logger.info("Saving preprocessed vectors for faster computation next time in %s files." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]]) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][0]), X_train) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][1]), vtu.onehot_to_idxarr(Y_train)) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][2]), X_test) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), vtu.onehot_to_idxarr(Y_test)) else: logger.info("Preprocessed vectors exist. Loading from files %s." 
# NOTE(review): cached-vector loading with per-model-type stride into
# CONFIG["data_vectors"] (::3 / ::4 / ::2 depending on model_type).
# BUG(review): the "Loaded data shapes" log again repeats Y_train shapes where
# the second list should be built from Y_test.
% ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]]) if model_type == "brnn_multitask": X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::3]] Y_train = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][1:3], [boundary_size, category_size])] Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][4:6], [boundary_size, category_size])] elif model_type == "brnn_cnn_multitask": X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::4]] X_char_train, X_char_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][1::4]] logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape)) X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1]*X_char_train.shape[2])) X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1]*X_char_test.shape[2])) Y_train = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][2:4], [boundary_size, category_size])] Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][6:8], [boundary_size, category_size])] else: X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::2]] Y_train, Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k)), labels_size) for k in CONFIG["data_vectors"][1::2]] if model_type == "brnn_multitask": logger.info("Loaded data shapes:\nX_train: %s, Y_train: %s\nX_test: %s, Y_test: %s" % (X_train.shape, [k.shape for k in Y_train], X_test.shape, [k.shape for k in Y_train])) elif model_type == "brnn_cnn_multitask":
# NOTE(review): hybrid-model save/load branch plus model construction start.
# BUG(review): same duplicated `[k.shape for k in Y_train]` in the shapes log.
np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), X_test) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][4]), X_char_test) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][5]), vtu.onehot_to_idxarr(Y_test[0])) # Reshape arrays after saving X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1]*X_char_train.shape[2])) X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1]*X_char_test.shape[2])) logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape)) else: logger.info("Preprocessed vectors exist. Loading from files %s." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]]) if model_type == "brnn_cnn_multitask": X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::3]] X_char_train, X_char_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][1::3]] logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape)) X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1]*X_char_train.shape[2])) X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1]*X_char_test.shape[2])) Y_train = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][2:3], [hybrid_size])] Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][5:6], [hybrid_size])] if model_type == "brnn_cnn_multitask": logger.info("Loaded data shapes:\nX_train: %s, X_char_train: %s, Y_train: %s\nX_test: %s, X_char_test: %s, Y_test: %s" % (X_train.shape, X_char_train.shape, [k.shape for k in Y_train], X_test.shape, X_char_test.shape, [k.shape for k in Y_train])) if model_type == "brnn_cnn_multitask": model, output_names, _temp_models = gen_model_brnn_cnn_multitask(vocab_size=vocab_size, char_vocab_size = char_vocab_size, embedding_size=embedding_size, 
# NOTE(review): tail of the gen_model_brnn_cnn_multitask(...) kwargs, weight
# restore, and the head of the epoch loop; ends with a `\` continuation into a
# model.fit(...) call not visible here.
char_embedding_size = char_embedding_size, nb_filters = nb_filters, maxlen=maxlen, max_charlen=max_charlen, output_size=[hybrid_size], hidden_layer_size=hidden_layer_size, num_hidden_layers = num_hidden_layers, RNN_LAYER_TYPE=RNN_LAYER_TYPE) logger.error("Feature under development.") if weights_file is not None: logger.info("Loading model weights from %s. Will continue training model from %s epochs." % (weights_file, base_epochs)) model.load_weights(weights_file) for epoch in xrange(base_epochs, n_epochs, save_every): logger.info("Starting Epochs %s to %s" % (epoch, epoch + save_every)) start_time = time.time() if model_type == "brnn_cnn_multitask": model.fit({"input1": X_train, "input2": X_char_train, output_names[0]: Y_train[0]},\
# NOTE(review): another save/load variant (multitask boundary+category path);
# begins inside a logger.info(...) call whose opening is not visible.
"Saving preprocessed vectors for faster computation next time in %s files." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]] ) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][0]), X_train) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][1]), vtu.onehot_to_idxarr(Y_train)) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][2]), X_test) np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), vtu.onehot_to_idxarr(Y_test)) else: logger.info( "Preprocessed vectors exist. Loading from files %s." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]] ) if model_type == "brnn_multitask": X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::3]] Y_train = [ vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][1:3], [boundary_size, category_size]) ] Y_test = [ vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][4:6], [boundary_size, category_size]) ] elif model_type == "brnn_cnn_multitask": X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::4]] X_char_train, X_char_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][1::4]] logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape)) X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1] * X_char_train.shape[2])) X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1] * X_char_test.shape[2])) Y_train = [ vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1]) for k in zip(CONFIG["data_vectors"][2:4], [boundary_size, category_size])