def __init__(self, data, model, sess):
    """
    :param data: list of tree roots to evaluate on
    :param model: expect an initialized and trained model
    :param sess: an open TensorFlow session
    :return:
    """
    # Sort trees by size before prediction
    roots_size = [tree_util.size_of_tree(root) for root in data]
    data = helper.sort_by(data, roots_size)

    probs, labels = model.predict_and_label(data, sess)
    labels = get_prediction(labels)
    predictions = get_prediction(probs)
    self.acc = get_accuracy(labels, predictions)
    if len(data) < 1500:
        print(model.accuracy(data, sess))

    # Confusion matrix and derived metrics
    self.TP, self.FP, self.TN, self.FN = get_confusion_matrix(labels, predictions)
    self.precision = self.TP / (self.TP + self.FP)
    self.recall = self.TP / (self.TP + self.FN)
    self.F1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)

    # ROC curve and area under it
    self.TPR_list, self.FPR_list = get_roc_values(labels, probs)
    self.auc = metrics.auc(self.FPR_list, self.TPR_list)
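# Minimal, self-contained sketch of how the derived metrics above follow from the
# confusion matrix, using made-up counts so the formulas can be checked in
# isolation (these numbers are illustrative only, not model output).
TP, FP, TN, FN = 40, 10, 45, 5
precision = TP / (TP + FP)                      # 0.8
recall = TP / (TP + FN)                         # ~0.889
f1 = 2 * (precision * recall) / (precision + recall)
accuracy = (TP + TN) / (TP + FP + TN + FN)      # 0.85
print(precision, recall, f1, accuracy)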
def build_feed_dict(self, roots, sort=True, train=False):
    if sort:
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots = helper.sort_by(roots, roots_size)
    roots_size = [tree_util.size_of_tree(root) for root in roots]
    # Pack trees of similar size into bins; permutation maps outputs back to input order
    roots_list, permutation = helper.greedy_bin_packing(roots, roots_size, np.max(roots_size))

    node_list_list = []
    root_indices = []
    internal_nodes_array = []
    lstm_prev_list = []
    for i, roots in enumerate(roots_list):
        node_list = []
        root_index = 0
        leaf_index = 0
        lstm_prev = [0]
        lstm_prev_count = 0
        for root in roots:
            tree_util.depth_first_traverse(
                root, node_list,
                lambda node, node_list: node_list.append(node))
            leaf_count = tree_util.leafs_in_tree(root)
            root_index += leaf_count
            root_indices.append([i, root_index])
            for j in range(0, leaf_count):
                leaf_index += 1
                internal_nodes_array.append([i, leaf_index])
            # First leaf of a tree has no previous LSTM state (index 0);
            # later leaves reference the preceding leaf's state
            for x in range(0, leaf_count):
                if x == 0:
                    lstm_prev.append(0)
                else:
                    lstm_prev.append(lstm_prev_count)
                lstm_prev_count += 1
        node_list_list.append(node_list)
        lstm_prev_list.append(lstm_prev)

    feed_dict = {
        self.dropout_rate: FLAGS.dropout_prob if train else 0,
        self.lstm_prev_array: helper.lists_pad(lstm_prev_list, 0),
        self.leaf_word_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value) for node in node_list if node.is_leaf]
             for node_list in node_list_list], 0),
        self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
        self.root_array: root_indices,
        self.label_array: helper.lists_pad(
            [[[0, 0]] + [node.label for node in node_list if node.is_leaf]
             for node_list in node_list_list], [0, 0])
    }
    return feed_dict, permutation
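# Self-contained sketch of the ragged-list padding that helper.lists_pad is used
# for above: every inner list is padded with a fill value up to the longest list's
# length so the batch can be fed as a rectangular tensor. pad_lists below is a
# plain-Python stand-in for illustration, not the project's helper.lists_pad.
def pad_lists(lists, fill):
    width = max(len(l) for l in lists)
    return [l + [fill] * (width - len(l)) for l in lists]

print(pad_lists([[1, 2, 3], [4], [5, 6]], 0))
# [[1, 2, 3], [4, 0, 0], [5, 6, 0]]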
from utils import data_util, tree_util, helper

_data_util = data_util.DataUtil()
data = _data_util.get_data()

roots_size = [tree_util.size_of_tree(root) for root in data.train_trees]
roots = helper.sort_by(data.train_trees, roots_size)

for root in roots[-5:]:
    print(root.label)
    print(root.to_sentence())
    print()
def build_feed_dict(self, roots, sort=True, train=False):
    if sort:
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots = helper.sort_by(roots, roots_size)
    roots_size = [tree_util.size_of_tree(root) for root in roots]
    # Pack trees of similar size into bins; permutation maps outputs back to input order
    roots_list, permutation = helper.greedy_bin_packing(roots, roots_size, np.max(roots_size))

    node_list_list = []
    node_to_index_list = []
    root_indices = []
    lstm_idx_list = []
    internal_nodes_array = []
    for i, roots in enumerate(roots_list):
        node_list = []
        lstm_idx = [0]
        root_index = 0
        start = 0
        for root in roots:
            tree_util.depth_first_traverse(
                root, node_list,
                lambda node, node_list: node_list.append(node))
            _, start = tree_util.get_preceding_lstm_index(root, start, start, lstm_idx)
            root_index += tree_util.size_of_tree(root)
            root_indices.append([i, root_index])
        node_list_list.append(node_list)
        node_to_index = helper.reverse_dict(node_list)
        node_to_index_list.append(node_to_index)
        lstm_idx_list.append(lstm_idx)
        for node in node_list:
            if not node.is_leaf:
                internal_nodes_array.append([i, node_to_index[node] + 1])
    # Make sure the loss array is never empty
    internal_nodes_array = internal_nodes_array if len(internal_nodes_array) > 0 else [[0, 0]]

    feed_dict = {
        self.dropout_rate: FLAGS.dropout_prob if train else 0,
        self.leaf_word_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value) for node in node_list if node.is_leaf]
             for node_list in node_list_list], 0),
        self.lstm_index_array: helper.lists_pad(lstm_idx_list, 0),
        self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
        self.root_array: root_indices,
        self.is_leaf_array: helper.lists_pad(
            [[0] + helper.to_int([node.is_leaf for node in node_list])
             for node_list in node_list_list], 0),
        self.word_index_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value) for node in node_list]
             for node_list in node_list_list],
            self.word_embed.get_idx("ZERO")),
        # Child indices are shifted by one so index 0 can serve as the "no child" / padding slot
        self.left_child_array: helper.lists_pad(
            [[0] + helper.add_one(
                [node_to_index[node.left_child] if node.left_child is not None else -1
                 for node in node_list])
             for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.right_child_array: helper.lists_pad(
            [[0] + helper.add_one(
                [node_to_index[node.right_child] if node.right_child is not None else -1
                 for node in node_list])
             for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.label_array: helper.lists_pad(
            [[[0, 0]] + [node.label for node in node_list]
             for node_list in node_list_list], [0, 0])
    }
    return feed_dict, permutation
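# Self-contained sketch of the child-index shift used above: node indices are moved
# up by one so that 0 can act as the padding / "no child" sentinel, which is why a
# missing child (-1) ends up as 0. shift_children is a plain stand-in for
# helper.add_one, not the project's implementation.
def shift_children(indices):
    return [i + 1 for i in indices]

children = [3, -1, 0, 2]             # -1 marks a missing child
print(shift_children(children))      # [4, 0, 1, 3] -> 0 is now the sentinel slot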
def get_data():
    if not os.path.exists(directories.CLASSIFIER_DATA_DIR):
        os.mkdir(directories.CLASSIFIER_DATA_DIR)
    if not os.path.exists(directories.CLASSIFIER_DATA(FLAGS.model_name)):
        os.mkdir(directories.CLASSIFIER_DATA(FLAGS.model_name))

    if os.path.exists(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train.npy'):
        # Cached representations exist; load them instead of rebuilding the model
        x_train = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train.npy')
        y_train = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_train.npy')
        x_val = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_val.npy')
        y_val = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_val.npy')
        x_test = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_test.npy')
        y_test = np.load(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_test.npy')
    else:
        _data_util = data_util.DataUtil()
        data = _data_util.get_data()

        # Sort each split by tree size
        roots_size = [tree_util.size_of_tree(root) for root in data.train_trees]
        data.train_trees = helper.sort_by(data.train_trees, roots_size)
        roots_size = [tree_util.size_of_tree(root) for root in data.val_trees]
        data.val_trees = helper.sort_by(data.val_trees, roots_size)
        roots_size = [tree_util.size_of_tree(root) for root in data.test_trees]
        data.test_trees = helper.sort_by(data.test_trees, roots_size)

        # NOTE: config is built from FLAGS.use_gpu, but the session below explicitly forces CPU
        if FLAGS.use_gpu:
            config = None
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        if FLAGS.word_embed_model == constants.WORD2VEC:
            word_embeddings = Word2Vec(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        elif FLAGS.word_embed_model == constants.FASTTEXT:
            word_embeddings = FastText(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        else:  # FLAGS.word_embed_model == constants.GLOVE
            word_embeddings = GloVe(mode=FLAGS.word_embed_mode,
                                    dimensions=FLAGS.word_embedding_size)

        g_tree = tf.Graph()
        with g_tree.as_default():
            model = None
            if FLAGS.model == constants.DEEP_RNN:
                model = deepRNN(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.BATCH_TREE_RNN:
                model = treeRNN_batch(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.NEERBEK_TREE_RNN:
                model = treeRNN_neerbek(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TREE_LSTM:
                model = treeLSTM(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_RNN:
                model = treeRNN_tracker(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_LSTM:
                model = treeLSTM_tracker(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.LSTM:
                model = LSTM(data, word_embeddings, FLAGS.model_name)

            with tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})) as sess:
                saver = tf.train.Saver()
                model.load_best(sess, saver, "validation")

                # Extract sentence representations as classifier features
                x_train = np.array(model.get_representation(data.train_trees, sess))
                y_train = np.array(get_labels(data.train_trees))
                x_val = np.array(model.get_representation(data.val_trees, sess))
                y_val = np.array(get_labels(data.val_trees))
                x_test = np.array(model.get_representation(data.test_trees, sess))
                y_test = np.array(get_labels(data.test_trees))

        # Cache the representations for later runs
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train', x_train)
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_train', y_train)
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_val', x_val)
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_val', y_val)
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_test', x_test)
        np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_test', y_test)

    return {
        'train': (x_train, y_train),
        'val': (x_val, y_val),
        'test': (x_test, y_test)
    }
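# Self-contained sketch of the caching pattern above: representations are computed
# once, written out with np.save, and later runs short-circuit by loading the .npy
# files instead of rebuilding the model. Paths and array contents are illustrative.
import os
import numpy as np

cache = 'x_train.npy'
if os.path.exists(cache):
    x_train = np.load(cache)
else:
    x_train = np.random.rand(8, 4)    # stand-in for model.get_representation(...)
    np.save('x_train', x_train)       # np.save appends the .npy extension itself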
def select_data(self, data, cut_off, cluster_predictions=None):
    roots_size = [tree_util.size_of_tree(root) for root in data]
    data = np.array(helper.sort_by(data, roots_size))
    t = time()

    if cluster_predictions is None:
        # Get representations
        representations, predictions, labels, permutations = [], [], [], []
        batch_size = 500
        batches = helper.batches(data, batch_size, perm=False)
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
            total=len(batches))
        for i, batch in enumerate(batches):
            feed_dict, permuts = self.model.build_feed_dict(batch, sort=True)
            reps, labs = self.session.run(
                [self.model.sentence_representations, self.model.labels],
                feed_dict=feed_dict)
            representations.extend(reps)
            labels.extend(labs)
            permutations.extend(list(i * batch_size + np.array(permuts)))
            pbar.update(1)
        pbar.close()
        print()

        self.representations = np.array(representations)[permutations]
        self.labels = np.array(performance.get_prediction(np.array(labels)))[permutations]

        # Get clusters: retry until no single cluster holds more than 80% of the
        # points, or the retry budget is spent
        try_cluster = True
        tries = 10
        while try_cluster:
            tries -= 1
            self.cluster_predictions = self.cluster_model.cluster(self.representations)
            if np.bincount(self.cluster_predictions).max() <= 0.8 * len(self.representations) or tries <= 0:
                try_cluster = False
    else:
        self.cluster_predictions = cluster_predictions
        self.labels = tree_util.get_labels(data)

    # Get MFO (most frequent outcome) score of each cluster
    cluster_mfo = []
    cluster_mfo_labels = []
    for i in range(self.num_clusters):
        mfo, l = self.mfo(i)
        cluster_mfo.append((i, mfo))
        cluster_mfo_labels.append((i, l))

    # Report clusters in descending MFO order; look labels up by cluster id so
    # they stay aligned after sorting
    cluster_mfo.sort(key=lambda el: el[1], reverse=True)
    mfo_labels = dict(cluster_mfo_labels)
    helper._print('Cluster MFO scores:')
    for k, mfo in cluster_mfo:
        l = mfo_labels[k]
        helper._print(
            f'\tCluster {k}: {mfo}, highest label: {l}, size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}')

    # Keep only the data from clusters whose MFO score is below the cut-off
    removed_percent = 0
    data_to_use = []
    for cluster, acc in cluster_mfo:
        new_percent = removed_percent + len(data[self.cluster_predictions == cluster]) / len(data)
        removed_percent = new_percent
        if acc < cut_off:
            data_to_use.extend(data[self.cluster_predictions == cluster])

    helper._print(
        f'Done selecting data for training. Overall time used for selection is {int((time() - t) / 60)} minutes and {int((time() - t) % 60)} seconds')
    return data_to_use, self.cluster_predictions
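# Hedged sketch of what a per-cluster MFO ("most frequent outcome") score could look
# like: the share of the dominant label within one cluster. self.mfo is not shown in
# this snippet, so mfo_score below is an assumption about its intent, not the
# project's implementation.
import numpy as np

def mfo_score(labels, cluster_predictions, cluster_id):
    cluster_labels = labels[cluster_predictions == cluster_id]
    counts = np.bincount(cluster_labels)
    return counts.max() / len(cluster_labels), counts.argmax()

labels = np.array([0, 0, 1, 1, 1, 0])
clusters = np.array([0, 0, 0, 1, 1, 1])
print(mfo_score(labels, clusters, 0))   # (0.666..., 0): label 0 dominates cluster 0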
def parse_trees(dataset="small", type='train', remove=False):  # todo maybe change input param
    """
    https://github.com/erickrf/treernn/blob/master/tree.py
    :param dataset: which dataset to load trees from
    :param type: which split to load (e.g. 'train')
    :param remove: whether to drop sentences longer than 90 words
    :return: a list of trees
    """
    file = directories.TREES_DIRS[dataset] + '%s.txt' % type

    if not os.path.isdir(directories.TREES_DIRS[dataset]):
        os.makedirs(directories.TREES_DIRS[dataset])

    if not os.path.isfile(file):
        if dataset == 'all':
            # Build the combined file by merging every smaller dataset
            helper._print(f'Creating new {file}...')
            with open(file, 'w+') as f:
                for l in directories.TREES_ZIP_PATHS:
                    smaller_tree_file = directories.TREES_DIRS[l] + '%s.txt' % type
                    helper._print(f'Merging from {smaller_tree_file}...')
                    if not os.path.isfile(smaller_tree_file):
                        helper._print(f'Extracting {directories.TREES_ZIP_PATHS[l]}...')
                        with zipfile.ZipFile(directories.TREES_ZIP_PATHS[l], 'r') as zip_file:
                            zip_file.extractall(path=directories.TREES_DIRS[l])
                        correct_labels(constants.TREE_LABELS[l], l)
                    with open(smaller_tree_file, 'r+') as sf:
                        for tree in sf:
                            f.write(tree)
        elif dataset == 'small':
            helper._print('No small dataset. Try pulling from Git... Or make your own you lazy bastard!')
        else:
            helper._print(f'Extracting {directories.TREES_ZIP_PATHS[dataset]}...')
            with zipfile.ZipFile(directories.TREES_ZIP_PATHS[dataset], 'r') as zip_file:
                zip_file.extractall(path=directories.TREES_DIRS[dataset])
            correct_labels(constants.TREE_LABELS[dataset], dataset)

    helper._print("Loading %s trees.." % type)
    with open(file, 'r') as fid:
        trees = []
        lines = fid.readlines()
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(lines))
        for i, l in enumerate(lines):
            if (i + 1) % 1000 == 0:
                pbar.update(1000)
            trees.append(parse_tree(l))
        pbar.update(len(lines) % 1000)
        pbar.close()
        print()

    sentence_length = np.array([count_leaf(tree) for tree in trees])
    helper._print("Avg length:", np.average(sentence_length))

    trees = np.array(trees)
    if remove:
        keep = sentence_length <= 90
        helper._print("Shorter than 90 words:",
                      int(np.sum(keep) / len(sentence_length) * 100), "%")
        helper._print("Ratio of removed labels:",
                      ratio_of_labels(trees[~keep]))
        trees = np.array(helper.sort_by(trees[keep], sentence_length[keep]))
    return trees
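# Self-contained sketch of the length filter applied when remove=True above: boolean
# masks over numpy arrays drop sentences longer than 90 words while keeping lengths
# aligned with the surviving trees. The values below are illustrative only.
import numpy as np

trees = np.array(['t1', 't2', 't3', 't4'], dtype=object)
sentence_length = np.array([12, 95, 40, 120])
keep = sentence_length <= 90
print(trees[keep], sentence_length[keep])   # ['t1' 't3'] [12 40]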