def get_network_mask(timestamp, exp_label, input_dir):
    """Returns the mask of a given timestamp from the network's output."""
    network_dir = RESULTS_DIR + '/' + exp_label + '/'
    args = read_parameters(network_dir)
    step_version = read_last_iteration_number(network_dir)
    layer_info = args['Layer info'].split()
    # Rebuild the network graph from the saved layer description.
    _, _, saver, _, x, y, _, _ = build_net(layer_info)
    with tf.Session() as sess:
        # Restore the latest checkpoint and run a single forward pass.
        saver.restore(sess, network_dir + 'weights-' + str(step_version))
        img = load_inputs([timestamp], input_dir)
        mask = out_to_image(y.eval(feed_dict={x: img}))[0]
    return mask
def process_network_masks(timestamps, exp_label, input_dir):
    """Processes images corresponding to a list of timestamps.

    Saves each mask in the network directory. Does NOT check that the input
    image exists; the caller must verify this before calling this method.
    """
    network_dir = RESULTS_DIR + '/' + exp_label + '/'
    args = read_parameters(network_dir)
    step_version = read_last_iteration_number(network_dir)
    layer_info = args['Layer info'].split()
    _, _, saver, _, x, y, _, _ = build_net(layer_info)
    with tf.Session() as sess:
        saver.restore(sess, network_dir + 'weights-' + str(step_version))
        for t in timestamps:
            inputs = load_inputs([t], input_dir)
            result = out_to_image(y.eval(feed_dict={x: inputs}))[0]
            save_network_mask(t, exp_label, result)
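# Usage sketch (an illustration, not part of the original module): the timestamp,
# experiment label, and input directory below are hypothetical placeholders, and
# RESULTS_DIR plus the helpers used above (read_parameters, build_net, load_inputs,
# out_to_image, save_network_mask) are assumed to be defined elsewhere in this file.
if __name__ == '__main__':
    example_timestamps = [20160415093000]   # hypothetical timestamp values
    example_exp_label = 'e70-00'            # hypothetical experiment label
    example_input_dir = 'data/input'        # hypothetical input directory

    # Single mask, returned in memory.
    single_mask = get_network_mask(example_timestamps[0], example_exp_label, example_input_dir)

    # Batch of masks, written to the network directory via save_network_mask.
    process_network_masks(example_timestamps, example_exp_label, example_input_dir)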
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False

    # Eval mode.
    trainer.net.eval()

    # Topk predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to parse.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Rather than skipping short sentences, just log their trees
            # (they are trivially easy to find).
            if length <= 2:
                for j in range(batch_size):
                    example_id = batch_map['example_ids'][j]
                    tokens = sentences[j].tolist()
                    words = [idx2word[idx] for idx in tokens]
                    if length == 2:
                        o = dict(example_id=example_id, tree=(words[0], words[1]))
                    elif length == 1:
                        o = dict(example_id=example_id, tree=words[0])
                    print(json.dumps(o))
                continue

            trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                o = dict(example_id=example_id, tree=tr)
                print(json.dumps(o))
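# replace_leaves is imported from elsewhere in the project. A minimal sketch of the
# behavior the code above relies on (an assumption, not the project's implementation):
# it walks a tree of nested tuples whose leaves stand for token positions and swaps
# in the corresponding word strings, left to right. `replace_leaves_sketch` is a
# hypothetical name used only for illustration.
def replace_leaves_sketch(tree, words):
    def helper(node, pos):
        if not isinstance(node, (list, tuple)):
            # Leaf: consume one token and return the word at this position.
            return 1, words[pos]
        consumed = 0
        children = []
        for child in node:
            n, new_child = helper(child, pos + consumed)
            consumed += n
            children.append(new_child)
        return consumed, tuple(children)

    _, new_tree = helper(tree, 0)
    return new_tree

# Example: replace_leaves_sketch(((0, 1), 2), ['the', 'quick', 'fox'])
# returns (('the', 'quick'), 'fox').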
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.encoder

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    # trainer.net.encoder.outside = False

    # Eval mode.
    trainer.net.eval()

    # Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    # NOTE: both parses are currently written to output_path1; output_path2 is unused.
    output_path1 = os.path.abspath(os.path.join(options.experiment_path, 'parse_mnli1.jsonl'))
    output_path2 = os.path.abspath(os.path.join(options.experiment_path, 'parse_mnli2.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path1))
    logger.info('Writing output to = {}'.format(output_path2))

    f = open(output_path1, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences1 = batch_map['sentences_1']
            sentences2 = batch_map['sentences_2']
            batch_size = sentences1.shape[0]
            length = sentences1.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees1 = parse_predictor.parse_batch(sentences1)
            trees2 = parse_predictor.parse_batch(sentences2)

            for ii, (tr1, tr2) in enumerate(zip(trees1, trees2)):
                example_id = batch_map['example_ids'][ii]
                s1 = [idx2word[idx] for idx in sentences1[ii].tolist()]
                s2 = [idx2word[idx] for idx in sentences2[ii].tolist()]
                tr1 = replace_leaves(tr1, s1)
                tr2 = replace_leaves(tr2, s2)
                if options.postprocess:
                    tr1 = postprocess(tr1, s1)
                    tr2 = postprocess(tr2, s2)
                o = collections.OrderedDict(example_id=example_id, sentence1=tr1, sentence2=tr2)
                f.write(json.dumps(o) + '\n')

    f.close()
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    tree_helper = TreeHelper(diora, word2idx)
    tree_helper.init(options)

    csv_helper = CSVHelper()

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    meta_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.csv'))
    vec_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.npy'))

    logger.info('Beginning.')
    logger.info('Writing vectors to = {}'.format(vec_output_path))
    logger.info('Writing metadata to = {}'.format(meta_output_path))

    f_csv = open(meta_output_path, 'w')
    f_vec = open(vec_output_path, 'ab')
    csv_helper.write_header(f_csv)

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            if options.parse_mode == 'all-spans':
                # Write metadata for every span of every sentence in the batch.
                for ii in range(batch_size):
                    example_id = batch_map['example_ids'][ii]
                    for level in range(length):
                        size = level + 1
                        for pos in range(length - level):
                            csv_helper.write_row(f_csv, collections.OrderedDict(
                                example_id=example_id,
                                position=str(pos),
                                size=str(size)))
                # Export the full inside/outside charts.
                inside_vectors = diora.inside_h.view(-1, options.hidden_dim)
                outside_vectors = diora.outside_h.view(-1, options.hidden_dim)
            else:
                trees, spans = tree_helper.get_trees_for_batch(batch_map, options)

                batch_index = []
                cell_index = []
                offset_cache = diora.index.get_offset(length)

                for ii, sp_lst in enumerate(spans):
                    example_id = batch_map['example_ids'][ii]
                    for pos, size in sp_lst:
                        # Metadata.
                        csv_helper.write_row(f_csv, collections.OrderedDict(
                            example_id=example_id,
                            position=str(pos),
                            size=str(size)))
                        # Chart cell for the vectors.
                        level = size - 1
                        cell = offset_cache[level] + pos
                        batch_index.append(ii)
                        cell_index.append(cell)

                inside_vectors = diora.inside_h[batch_index, cell_index]
                assert inside_vectors.shape == (len(batch_index), options.hidden_dim)
                outside_vectors = diora.outside_h[batch_index, cell_index]
                assert outside_vectors.shape == (len(batch_index), options.hidden_dim)

            vectors = np.concatenate([inside_vectors, outside_vectors], axis=1)
            np.savetxt(f_vec, vectors)

    f_csv.close()
    f_vec.close()
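# The chart indexing assumed above, sketched for clarity (an assumption about what
# diora.index.get_offset returns, not the project's code): the inside/outside charts
# flatten one cell per (level, position), level by level, so level `l` of a
# length-L sentence contributes (L - l) cells.
def get_offset_sketch(length):
    offsets = {}
    total = 0
    for level in range(length):
        offsets[level] = total
        total += length - level
    return offsets

# Example: get_offset_sketch(4) -> {0: 0, 1: 4, 2: 7, 3: 9}, so the span at
# level=2, pos=1 (size 3 starting at token 1) maps to flat cell 7 + 1 = 8.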
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False

    # Eval mode.
    trainer.net.eval()

    # Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    output_path = os.path.abspath(os.path.join(options.experiment_path, 'parse.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path))

    f = open(output_path, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                if options.postprocess:
                    tr = postprocess(tr, s)
                o = collections.OrderedDict(example_id=example_id, tree=tr)
                f.write(json.dumps(o) + '\n')

    f.close()
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            entity_labels = batch_map['entity_labels']
            if len(entity_labels) == 0:
                continue
            try:
                batch_index, positions, sizes, labels = get_cell_index(entity_labels)
            except Exception:
                continue

            # Skip short phrases.
            batch_index = [x for x, y in zip(batch_index, sizes) if y >= 2]
            positions = [x for x, y in zip(positions, sizes) if y >= 2]
            labels = [x for x, y in zip(labels, sizes) if y >= 2]
            sizes = [y for y in sizes if y >= 2]

            cell_index = (batch_index, positions, sizes)

            batch_result = {}
            batch_result['example_ids'] = [batch_map['example_ids'][idx] for idx in cell_index[0]]
            batch_result['labels'] = labels
            batch_result['positions'] = cell_index[1]
            batch_result['sizes'] = cell_index[2]
            batch_result['phrases'] = get_many_phrases(sentences, *cell_index)
            batch_result['inside'] = get_many_cells(diora, diora.inside_h, *cell_index)
            batch_result['outside'] = get_many_cells(diora, diora.outside_h, *cell_index)

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    normalize_L2(vectors)

    index = Index(dim=vectors.shape[1])
    index.add(vectors)
    index.cache(vectors, options.k_candidates)

    # 3. Print a summary.
    example_ids = result['example_ids']
    phrases = result['phrases']
    labels = result['labels']

    assert len(example_ids) == len(phrases)
    assert len(example_ids) == vectors.shape[0]

    def stringify(phrase):
        return ' '.join([idx2word[idx] for idx in phrase])

    # Precision@k of the query's label among the retrieved neighbors.
    prec_1 = []
    prec_10 = []
    prec_100 = []

    for i in range(vectors.shape[0]):
        topk = []
        corr_lab = 0
        for j, score in index.topk(i, options.k_candidates):
            # Skip same example.
            if example_ids[i] == example_ids[j]:
                continue
            # Skip string match.
            if phrases[i] == phrases[j]:
                continue
            topk.append((j, score))
            corr_lab += 1. * (labels[i] == labels[j])
            if len(topk) == 1:
                prec_1.append(corr_lab)
            elif len(topk) == 10:
                prec_10.append(corr_lab)
            elif len(topk) == 100:
                prec_100.append(corr_lab)
            if len(topk) == options.k_top:
                break
        assert len(topk) == options.k_top, 'Did not find enough valid candidates.'

        # Print.
        # print('[query] example_id={} phrase={} lab={}'.format(
        #     example_ids[i], stringify(phrases[i]), labels[i]))
        # for rank, (j, score) in enumerate(topk[:2]):
        #     print('rank={} score={:.3f} example_id={} phrase={} lab={}'.format(
        #         rank, score, example_ids[j], stringify(phrases[j]), labels[j]))

    # Report P@1 and (normalized) P@10.
    print(np.mean(prec_1), np.mean(prec_10) / 10)
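# normalize_L2 and Index are helpers defined elsewhere in the project. One way they
# could be implemented on top of faiss is sketched below (an assumption about the
# backend; `normalize_L2_sketch` and `FaissIndexSketch` are hypothetical names).
# Cosine similarity is obtained by L2-normalizing the vectors and searching with
# inner product.
import faiss


def normalize_L2_sketch(vectors):
    # In-place L2 normalization; faiss expects a contiguous float32 array.
    faiss.normalize_L2(vectors)


class FaissIndexSketch(object):
    def __init__(self, dim):
        self.index = faiss.IndexFlatIP(dim)
        self.scores = None
        self.ids = None

    def add(self, vectors):
        self.index.add(vectors)

    def cache(self, vectors, k):
        # Precompute the k nearest neighbors of every query vector.
        self.scores, self.ids = self.index.search(vectors, k)

    def topk(self, i, k):
        # Yield (neighbor_index, similarity) pairs for query i, best first.
        for j, score in zip(self.ids[i][:k], self.scores[i][:k]):
            yield int(j), float(score)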
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue

            # Join tokens with spaces so the sentence strings are readable keys.
            strings.extend([
                ' '.join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])

            trainer.step(batch_map, train=False, compute_loss=False)

            # The last chart cell (root) summarizes the full sentence.
            batch_result = {}
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)

    print(len(strings), vectors.shape)

    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))
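# BatchRecorder is provided elsewhere in the project. A minimal sketch of the
# behavior the two scripts above rely on (an assumption, not the project's code):
# values are accumulated batch by batch and then flattened, with 'torch' fields
# concatenated into one numpy array and 'list' fields concatenated into one list.
import torch


class BatchRecorderSketch(object):
    def __init__(self, dtype):
        self.dtype = dtype
        self.storage = {key: [] for key in dtype}

    def record(self, **kwargs):
        for key, value in kwargs.items():
            self.storage.setdefault(key, []).append(value)

    def get_flattened_result(self):
        result = {}
        for key, batches in self.storage.items():
            if not batches:
                continue
            if self.dtype.get(key) == 'torch':
                # Concatenate along the batch dimension and move to numpy
                # so downstream code can index and np.concatenate the result.
                result[key] = torch.cat(batches, dim=0).cpu().numpy()
            else:
                result[key] = [item for batch in batches for item in batch]
        return result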