Example #1
File: process.py Project: PeterDrake/sky
def get_network_mask(timestamp, exp_label, input_dir):
    """Returns the mask of a given timestamp from the network's output."""
    network_dir = RESULTS_DIR + '/' + exp_label + '/'
    args = read_parameters(network_dir)
    step_version = read_last_iteration_number(network_dir)
    layer_info = args['Layer info'].split()
    _, _, saver, _, x, y, _, _ = build_net(layer_info)
    with tf.Session() as sess:
        saver.restore(sess, network_dir + 'weights-' + str(step_version))
        img = load_inputs([timestamp], input_dir)
        mask = out_to_image(y.eval(feed_dict={x: img}))[0]
    return mask
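A minimal usage sketch for the function above (not from the project): the timestamp, experiment label, and input directory below are hypothetical placeholders, and RESULTS_DIR is assumed to be configured inside process.py.

# Hypothetical call; timestamp format, label, and directory are placeholders.
from process import get_network_mask

mask = get_network_mask('20170501120000', exp_label='e70-00',
                        input_dir='data/validation')
# mask is the decoded network output image for that timestamp.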
Example #2
File: process.py Project: PeterDrake/sky
def process_network_masks(timestamps, exp_label, input_dir):
    """Processes images corresponding to a list of timestamps. Saves each mask in the network directory. Does NOT
	check to make sure that the image exists. This must be done by the user before calling this method."""
    network_dir = RESULTS_DIR + '/' + exp_label + '/'
    args = read_parameters(network_dir)
    step_version = read_last_iteration_number(network_dir)
    layer_info = args['Layer info'].split()
    _, _, saver, _, x, y, _, _ = build_net(layer_info)
    with tf.Session() as sess:
        saver.restore(sess, network_dir + 'weights-' + str(step_version))
        for t in timestamps:
            inputs = load_inputs([t], input_dir)
            result = out_to_image(y.eval(feed_dict={x: inputs}))[0]
            save_network_mask(t, exp_label, result)
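Unlike get_network_mask above, this variant restores the checkpoint once and reuses the same tf.Session for every timestamp. A minimal driver sketch with placeholder values (not from the project):

# Hypothetical driver; timestamps, label, and directory are placeholders,
# and each timestamp's input image must already exist.
from process import process_network_masks

timestamps = ['20170501120000', '20170501120030']
process_network_masks(timestamps, exp_label='e70-00', input_dir='data/validation')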
Example #3
File: parse.py Project: stephantul/diora
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False
    # Eval mode.
    trainer.net.eval()

    # Topk predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)
    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to parse.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Rather than skipping, just log the trees
            # (they are trivially easy to find).
            if length <= 2:
                for ii in range(batch_size):
                    example_id = batch_map['example_ids'][ii]
                    tokens = sentences[ii].tolist()
                    words = [idx2word[idx] for idx in tokens]
                    if length == 2:
                        o = dict(example_id=example_id,
                                 tree=(words[0], words[1]))
                    elif length == 1:
                        o = dict(example_id=example_id, tree=words[0])
                    print(json.dumps(o))
                continue

            trainer.step(batch_map, train=False, compute_loss=False)
            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                o = dict(example_id=example_id, tree=tr)

                print(json.dumps(o))
Example #4
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse

    diora = trainer.net.encoder

    ## Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    ## Turn off outside pass.
    #trainer.net.encoder.outside = False

    ## Eval mode.
    trainer.net.eval()

    ## Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    output_path1 = os.path.abspath(os.path.join(options.experiment_path, 'parse_mnli1.jsonl'))
    output_path2 = os.path.abspath(os.path.join(options.experiment_path, 'parse_mnli2.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path1))
    logger.info('Writing output to = {}'.format(output_path2))

    f = open(output_path1, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences1 = batch_map['sentences_1']
            sentences2 = batch_map['sentences_2']
            batch_size = sentences1.shape[0]
            length = sentences1.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees1 = parse_predictor.parse_batch(sentences1)
            trees2 = parse_predictor.parse_batch(sentences2)
            for ii, (tr1, tr2) in enumerate(zip(trees1, trees2)):
                example_id = batch_map['example_ids'][ii]
                s1 = [idx2word[idx] for idx in sentences1[ii].tolist()]
                s2 = [idx2word[idx] for idx in sentences2[ii].tolist()]
                tr1 = replace_leaves(tr1, s1)
                tr2 = replace_leaves(tr2, s2)
                if options.postprocess:
                    tr1 = postprocess(tr1, s1)
                    tr2 = postprocess(tr2, s2)
                o = collections.OrderedDict(example_id=example_id, sentence1=tr1, sentence2=tr2)

                f.write(json.dumps(o) + '\n')
  
    f.close()
Example #5
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora
    tree_helper = TreeHelper(diora, word2idx)
    tree_helper.init(options)
    csv_helper = CSVHelper()

    ## Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    meta_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.csv'))
    vec_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.npy'))

    logger.info('Beginning.')
    logger.info('Writing vectors to = {}'.format(vec_output_path))
    logger.info('Writing metadata to = {}'.format(meta_output_path))

    f_csv = open(meta_output_path, 'w')
    f_vec = open(vec_output_path, 'ab')
    csv_helper.write_header(f_csv)

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            if options.parse_mode == 'all-spans':
                for ii in range(batch_size):
                    example_id = batch_map['example_ids'][ii]
                    for level in range(length):
                        size = level + 1
                        for pos in range(length - level):
                            # metadata
                            csv_helper.write_row(f_csv,
                                collections.OrderedDict(
                                    example_id=example_id,
                                    position=str(pos),
                                    size=str(size)
                            ))
                inside_vectors = diora.inside_h.view(-1, options.hidden_dim)
                outside_vectors = diora.outside_h.view(-1, options.hidden_dim)

            else:
                trees, spans = tree_helper.get_trees_for_batch(batch_map, options)

                batch_index = []
                cell_index = []
                offset_cache = diora.index.get_offset(length)

                for ii, sp_lst in enumerate(spans):
                    example_id = batch_map['example_ids'][ii]
                    for pos, size in sp_lst:
                        # metadata
                        csv_helper.write_row(f_csv,
                            collections.OrderedDict(
                                example_id=example_id,
                                position=str(pos),
                                size=str(size)
                        ))
                        # for vectors
                        level = size - 1
                        cell = offset_cache[level] + pos
                        batch_index.append(ii)
                        cell_index.append(cell)

                inside_vectors = diora.inside_h[batch_index, cell_index]
                assert inside_vectors.shape == (len(batch_index), options.hidden_dim)
                outside_vectors = diora.outside_h[batch_index, cell_index]
                assert outside_vectors.shape == (len(batch_index), options.hidden_dim)

            vectors = np.concatenate([inside_vectors, outside_vectors], axis=1)
            np.savetxt(f_vec, vectors)

    f_csv.close()
    f_vec.close()
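A sketch of reading the dumped files back, assuming CSVHelper writes the example_id/position/size columns shown above and that pandas is available; note that the vectors are written with np.savetxt, i.e. as plain text rows despite the .npy extension. Paths are placeholders.

import numpy as np
import pandas as pd  # assumption: pandas is installed for the metadata CSV

vectors = np.loadtxt('experiment/vectors.npy')  # text rows; each is the concatenated inside/outside vector for one span
meta = pd.read_csv('experiment/vectors.csv')    # columns assumed: example_id, position, size
assert vectors.shape[0] == len(meta)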
Example #6
File: parse.py Project: osagha/diora
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse

    diora = trainer.net.diora

    ## Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    ## Turn off outside pass.
    trainer.net.diora.outside = False

    ## Eval mode.
    trainer.net.eval()

    ## Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    output_path = os.path.abspath(os.path.join(options.experiment_path, 'parse.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path))

    f = open(output_path, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                if options.postprocess:
                    tr = postprocess(tr, s)
                o = collections.OrderedDict(example_id=example_id, tree=tr)

                f.write(json.dumps(o) + '\n')

    f.close()
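A minimal sketch of calling run() directly; only the option fields this snippet reads are shown (experiment_path, seed, postprocess), with placeholder values. The real diora entry point builds options with its own argument parser, and get_validation_dataset/build_net will expect additional fields not listed here.

from argparse import Namespace

# Placeholder values; fields required by the dataset and model helpers are omitted.
options = Namespace(
    experiment_path='experiments/demo',  # parse.jsonl is written here
    seed=11,
    postprocess=False,
)
run(options)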
Example #7
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.

    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    ## Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skips very short examples.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            entity_labels = batch_map['entity_labels']
            if len(entity_labels) == 0:
                continue
            try:
                batch_index, positions, sizes, labels = get_cell_index(entity_labels)
            except Exception:
                # Skip batches where get_cell_index fails.
                continue
            # Skip short phrases.
            batch_index = [x for x, y in zip(batch_index, sizes) if y >= 2]
            positions = [x for x, y in zip(positions, sizes) if y >= 2]
            labels = [x for x, y in zip(labels, sizes) if y >= 2]
            sizes = [y for y in sizes if y >= 2]

            cell_index = (batch_index, positions, sizes)

            batch_result = {}
            batch_result['example_ids'] = [batch_map['example_ids'][idx] for idx in cell_index[0]]
            batch_result['labels'] = labels
            batch_result['positions'] = cell_index[1]
            batch_result['sizes'] = cell_index[2]
            batch_result['phrases'] = get_many_phrases(sentences, *cell_index)
            batch_result['inside'] = get_many_cells(diora, diora.inside_h, *cell_index)
            batch_result['outside'] = get_many_cells(diora, diora.outside_h, *cell_index)

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.

    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    normalize_L2(vectors)

    index = Index(dim=vectors.shape[1])
    index.add(vectors)
    index.cache(vectors, options.k_candidates)

    # 3. Print a summary.

    example_ids = result['example_ids']
    phrases = result['phrases']
    labels = result['labels']

    assert len(example_ids) == len(phrases)
    assert len(example_ids) == vectors.shape[0]

    def stringify(phrase):
        return ' '.join([idx2word[idx] for idx in phrase])

    prec_1 = []
    prec_10 = []
    prec_100 = []
    for i in range(vectors.shape[0]):
        topk = []
        corr_lab = 0
        for j, score in index.topk(i, options.k_candidates):
            # Skip same example.
            if example_ids[i] == example_ids[j]:
                continue
            # Skip string match.
            if phrases[i] == phrases[j]:
                continue
            topk.append((j, score))

            corr_lab += 1. * (labels[i] == labels[j])

            if len(topk) == 1:
                prec_1.append(corr_lab)
            elif len(topk) == 10:
                prec_10.append(corr_lab)
            elif len(topk) == 100:
                prec_100.append(corr_lab)

            if len(topk) == options.k_top:
                break
        assert len(topk) == options.k_top, 'Did not find enough valid candidates.'

        # Print.
        # print('[query] example_id={} phrase={} lab={}'.format(
        #   example_ids[i], stringify(phrases[i]),labels[i]))
        # for rank, (j, score) in enumerate(topk[:2]):
        #   print('rank={} score={:.3f} example_id={} phrase={} lab={}'.format(
        #       rank, score, example_ids[j], stringify(phrases[j]), labels[j]))
    # Report precision@1 and precision@10 averaged over the queried phrases
    # (prec_100 is collected above but not reported here).
    print(np.mean(prec_1), np.mean(prec_10) / 10)
Example #8
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)
    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []
    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skips very short examples.
            if length <= 2:
                continue
            strings.extend([
                "".join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])
            trainer.step(batch_map, train=False, compute_loss=False)

            batch_result = {}
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]
            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    print(len(strings), vectors.shape)
    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))