Example #1
    def _build_indices(cls, seq_embedder, sentences, n_trees, save_dir):
        makedirs(save_dir)  # make sure directory exists
        batch_size = 128  # number of sentences to embed at a time
        batches_per_index = 4096  # number of batches per index
        # 128 * 4096 = 524288 sentences per index

        batches = list(chunks(sentences, n=batch_size))
        sharded_batches = list(chunks(batches, n=batches_per_index))
        num_shards = len(sharded_batches)
        embed_dim = seq_embedder.embed_dim

        for s, batches_s in enumerate(sharded_batches):
            print 'Building shard {}/{}'.format(s + 1, num_shards)
            index = cls._init_index(embed_dim)
            i = 0
            for batch in verboserate(
                    batches_s,
                    desc='Embedding sentences (batch_size={})'.format(
                        batch_size)):
                sent_embeds = seq_embedder.embed(batch)
                sent_embeds = sent_embeds.data.cpu().numpy()  # (batch_size, embed_dim)
                for sent_embed in sent_embeds:
                    # sent_embed has shape (embed_dim,)
                    index.add_item(i, sent_embed)
                    i += 1

            with timer('Constructing trees'):
                index.build(n_trees)

            with timer('Saving shard to disk'):
                index.save(cls._shard_path(save_dir, s))
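# Every example in this collection relies on the chunks helper from gtd.utils, whose source is not
# shown here. Judging from the call sites (chunks(sentences, n=batch_size) and
# chunks(examples, batch_size)), it yields successive fixed-size slices of a sequence; the sketch
# below assumes that behavior and is not the original implementation.
from itertools import islice

def chunks(iterable, n):
    """Yield successive lists of at most n items from iterable (assumed behavior)."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk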
Example #2
def similar_size_batches(examples, batch_size, size=lambda x: len(x.target_words)):
    """Create similar-sized batches of EditExamples.

    By default, elements with similar len(ex.target_words) are batched together.
    See editor.py / EditExample.

    Args:
        examples (list[EditExample])
        batch_size (int)
        size (Callable[[EditExample], int])

    Returns:
        list[list[EditExample]]
    """
    assert batch_size >= 1
    sorted_examples = sorted(examples, key=size)
    batches = list(chunks(sorted_examples, batch_size))
    random.shuffle(batches)  # in-place

    # report savings
    suboptimal_batches = list(chunks(examples, batch_size))

    total_cost = lambda batch_list: batch_size * sum(max(size(ex) for ex in batch) for batch in batch_list)
    naive_cost = total_cost(suboptimal_batches)
    improved_cost = total_cost(batches)
    optimal_cost = sum(size(ex) for ex in examples)

    print 'Optimized batches: reduced cost from {naive} (naive) to {improved} ({reduction:.1f}% reduction).\n' \
          'Optimal (batch_size=1) would be {optimal}.'.format(naive=naive_cost, improved=improved_cost,
                                                              reduction=100.0 * (naive_cost - improved_cost) / naive_cost,
                                                              optimal=optimal_cost)

    return batches
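# Toy illustration (not from the original code) of why sorting by size reduces the padded cost
# reported above: with batch_size = 2, the cost of a batch is batch_size * max(length in batch).
lengths = [1, 9, 2, 8]                      # hypothetical sequence lengths
naive_batches = [[1, 9], [2, 8]]            # batched in arrival order
sorted_batches = [[1, 2], [8, 9]]           # batched after sorting by length
cost = lambda bs: sum(2 * max(b) for b in bs)
assert cost(naive_batches) == 34            # 2*9 + 2*8
assert cost(sorted_batches) == 22           # 2*2 + 2*9
assert sum(lengths) == 20                   # the optimal (batch_size=1) cost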
Example #3
    def _compute_metrics(cls,
                         model,
                         ts,
                         examples,
                         eval_size=1000,
                         batch_size=256):

        examples_ = sample_if_large(examples, max_size=eval_size)

        losses, weights = [], []
        for batch in chunks(examples_, batch_size):
            # compute loss
            batch_loss = model.loss(batch, ts)
            losses.append(batch_loss.data[0])
            weights.append(len(batch))
        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)

        # compute perplexity
        entropy = 0.0
        num_words = 0
        for batch in chunks(examples_, batch_size):
            # change base: per-instance losses are natural-log losses (-log_e p)
            losses = model.per_instance_losses(batch)
            losses = losses.data.cpu().numpy()
            losses_log_2 = losses / np.log(2.0)

            # accumulate total entropy (in bits) and the word count; per-word normalization happens below
            lengths = np.array([len(ex) + 1 for ex in batch])
            entropy += np.sum(losses_log_2)
            num_words += sum(lengths)

        pp = 2.0**(1.0 / num_words * entropy)

        return round(loss, 5), round(pp, 5)
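# Sanity check (not from the original code) for the change of base above: converting natural-log
# losses to bits and exponentiating with base 2 gives the same per-word perplexity as
# exponentiating the average natural-log loss with base e.
import numpy as np
nats = np.array([3.2, 1.7, 2.5])            # hypothetical per-instance -log_e p values
num_words = 6                               # hypothetical total word count
pp_base2 = 2.0 ** (np.sum(nats / np.log(2.0)) / num_words)
pp_base_e = np.exp(np.sum(nats) / num_words)
assert np.isclose(pp_base2, pp_base_e)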
Example #4
    def edit(self, examples, max_seq_length=150, beam_size=5, batch_size=64, constrain_vocab=False, verbose=False):
        """Performs edits on a batch of source sentences.

        Args:
            examples (list[EditExample])
            max_seq_length (int): max # timesteps to generate for
            beam_size (int): for beam decoding
            batch_size (int): max number of examples to pass into the RNN decoder at a time.
                The total # examples decoded in parallel = batch_size / beam_size.
            constrain_vocab (bool): default is False

        Returns:
            beam_list (list[list[list[unicode]]]): a batch of beams.
            edit_traces (list[EditTrace])
        """
        self.eval()  # set to evaluation mode, for dropout to work correctly
        beam_list = []
        edit_traces = []

        batches = chunks(examples, batch_size / beam_size)
        batches = verboserate(batches, desc='Decoding examples') if verbose else batches
        for batch in batches:
            beams, traces = self._edit_batch(batch, max_seq_length, beam_size, constrain_vocab)
            beam_list.extend(beams)
            edit_traces.extend(traces)
        self.train()  # set back to train mode
        return beam_list, edit_traces
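# Note on the chunk size above (illustrative, not from the original code): each example expands
# into beam_size rows inside the decoder, so chunking the input into groups of
# batch_size / beam_size examples keeps the number of decoder rows at or below batch_size.
batch_size, beam_size = 64, 5
examples_per_chunk = batch_size // beam_size      # 12 (integer division, as in the Python 2 code above)
assert examples_per_chunk * beam_size <= batch_size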
Example #5
    def _compute_metrics(cls, editor, examples, num_evaluate_examples, noiser, batch_size=256, edit_dropout=False, draw_samples=False):
        with random_seed(0):
            sample = sample_if_large(examples, num_evaluate_examples, replace=False)
        if edit_dropout:
            noised_sample = noiser(sample)
        else:
            noised_sample = sample

        # compute loss and log to TensorBoard
        # need to break the sample into batches, in case the sample is too large to fit in GPU memory
        losses, weights = [], []
        for batch in chunks(noised_sample, batch_size):
            weights.append(len(batch))
            loss_var, _, _ = editor.loss(batch, draw_samples)
            losses.append(loss_var.data[0])
        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)  # weighted average

        # compute BLEU score and log to TensorBoard
        outputs, edit_traces = editor.edit(noised_sample)
        bleus = []
        for ex, output in izip(noised_sample, outputs):
            # outputs is a list (over examples) [ list (over beam hypotheses) [ list (over tokens) [ unicode ] ] ] object.
            bleus.append(bleu(ex.target_words, output[0]))
        avg_bleu = np.mean(bleus)
        return loss, avg_bleu, edit_traces
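# Tiny illustration (hypothetical data, not from the original code) of the structure consumed by
# the BLEU loop above: outputs[i] is the beam for example i, and outputs[i][0] is its top hypothesis.
outputs = [[[u'the', u'cat'], [u'a', u'cat']]]    # one example, two beam candidates
assert outputs[0][0] == [u'the', u'cat']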
Example #6
def output_file(pickle_path):
    # for pickle_path in tqdm(tr_files, total = len(tr_files)):
    #     with open(str(pickle_path), 'rb') as f:
    #         result = pickle.load(f)# result: {(name_of_file, total_line_num) : [ExampleLines]}
    #     f.close()
    write_dir = pathlib2.Path.cwd() / 'github_data' / 'neural_ret_files' / 'train'

    df = pd.read_csv(pickle_path, skiprows=2, header=None, names=[0, 1], dtype=str).fillna(NO_CONTEXT_WORD)
    df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
    # df[0] = df[0].apply(lambda x: preprocess_tokens(x, MAX_LINE_LENGTH))
    df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
    max_seq_length = lambda ex: max(max(len(seq) for seq in ex.input_words), len(ex.target_words))

    try:
        ex = list(map(lambda x: EditExample(x[0], x[1]), zip(df[0].tolist(), df[1].tolist())))
        # skip sequences that are too long, because they use up memory

        ex = list(ifilterfalse(lambda x: max_seq_length(x) > 150, ex))
        # examples[(str(line).split('/')[-1], len(ex))] = ex
        result = {(str(pickle_path).split('/')[-1], len(ex)): ex}
        k = str(pickle_path).split('/')[-1].split('.')[0]

        k = list(result.keys())
        val = ex
        name, l = k[0]

        # try:
        new_vecs = None
        for batch in chunks(val, 32):  # loop over line numbers in the file (batches are taken in order)
            # NOTE: preprocessing (tokenize_fine_grained) happens above; if a line is bad, it should also be
            # removed from `val` so that the index used below stays aligned with the rows of new_vecs
            encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
            # for vec in encin:
            #     new_vecs.append(vec)
            new_vecs = np.vstack([new_vecs, encin]) if new_vecs is not None else encin  # stack batch encodings; neighbors are searched within this matrix

        ne = NearestNeighbors(n_neighbors=10, n_jobs=32, metric='minkowski')
        ne.fit(new_vecs)
        neighbors = ne.kneighbors()[1]
        new_repo = pd.DataFrame(np.array([int(l)] + [None] * 11).reshape(1, -1))
        for idx, row in enumerate(neighbors):
            filtered_idx = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
            retrieved_lines = list(pd.DataFrame([(' '.join(val[ret_idx].input_words[0]),
                                                  ' '.join(val[ret_idx].target_words)) for ret_idx in
                                                 filtered_idx]).values.flatten())  # .reshape(1, -1)

            full_line = pd.DataFrame(np.array(
                [' '.join(val[idx].input_words[0]), ' '.join(val[idx].target_words)] + retrieved_lines).reshape(1, -1))
            new_repo = pd.concat([new_repo, full_line], axis=0)
        # new_repo.head()

        new_repo.to_csv(str(write_dir / pickle_path), header=None, index=None)

        # total_threads[0] = total_threads[0] - 1

    except Exception as e:
        print e
        print 'bad formatting in file ' + str(pickle_path).split('/')[-1]
        print pickle_path
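# Toy illustration (hypothetical indices, not from the original code) of the neighbor filtering
# above: neighbors within +/- 2 lines of the query line idx are dropped (they tend to be
# near-duplicates of the query), and the 5 closest remaining neighbors are kept.
import numpy as np
idx = 10
row = np.array([9, 11, 3, 12, 40, 8, 7, 25])      # neighbor indices, nearest first
filtered = row[np.where((row < (idx - 2)) | (row > (idx + 2)))[0]][:5]
assert filtered.tolist() == [3, 40, 7, 25]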
Example #7
    def ret_and_make_ex(self, input, lsh, ex_list, startat, train_mode=True):
        ret_list = []
        for batch in chunks(input, 128):
            idxlist = self.ret_idx(batch, lsh, train_mode=train_mode)
            ret_tmp = [ex_list[idx[startat]] for idx in idxlist]
            ret_list.extend(ret_tmp)
        return self.make_editexamples(ret_list, input)
Example #8
    def batch_embed(self, exes, train_mode=True):
        ret_list = []
        for batch in chunks(exes, 128):
            encin = self.encode(batch, train_mode).data.cpu().numpy()
            for vec in encin:
                ret_list.append(vec)
        return ret_list
Example #9
    def per_instance_losses(self, examples, draw_samples=False, batch_size=128):
        """Compute per-instance losses."""
        per_instance_loss_list = []
        for batch in chunks(examples, batch_size):
            editor_input = self.preprocess(batch)
            encoder_output = self.encoder(editor_input.encoder_input, draw_samples)
            ilosses = self.train_decoder.per_instance_losses(encoder_output, editor_input.train_decoder_input)
            per_instance_loss_list.extend([loss.data.cpu().numpy()[0] for loss in ilosses])
        return per_instance_loss_list
Example #10
    def edit(self, examples, max_seq_length=35, beam_size=5, batch_size=1024):
        """Add one argument, random_edit_vector, which enforces editing with a random vector."""
        logging.debug("Performing an edit on {} examples:\n {}".format(len(examples), examples))
        beam_list = []
        edit_traces = []
        for batch in chunks(examples, batch_size / beam_size):
            beams, traces = self._edit_batch(batch, max_seq_length, beam_size)
            beam_list.extend(beams)
            edit_traces.extend(traces)
        return beam_list, edit_traces
Example #11
    def _recover_sequences(cls, states_over_time, beam_size, top_k):
        # create decoder_traces
        ex_idx_to_beam_traces = defaultdict(list)
        for t, states in enumerate(states_over_time):
            assert len(states) % beam_size == 0
            beams = list(chunks(states, beam_size))
            for ex_idx, beam in enumerate(beams):
                trace = BeamTrace(beam, top_k)
                ex_idx_to_beam_traces[ex_idx].append(trace)

        decoder_traces = []
        for ex_idx in range(max(ex_idx_to_beam_traces.keys()) + 1):
            beam_traces = ex_idx_to_beam_traces[ex_idx]
            decoder_traces.append(BeamDecoderTrace(beam_traces))

        final_state_beams = list(chunks(states_over_time[-1], beam_size))
        output_beams = [[state.token_sequence for state in state_beam] for state_beam in final_state_beams]

        return output_beams, decoder_traces
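# Small illustration (not from the original code) of how the flat list of decoder states is
# regrouped above: with beam_size = 3, six states become two per-example beams of three.
beam_size = 3
states = ['s0', 's1', 's2', 's3', 's4', 's5']     # hypothetical flat states for two examples
beams = [states[i:i + beam_size] for i in range(0, len(states), beam_size)]
assert beams == [['s0', 's1', 's2'], ['s3', 's4', 's5']]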
Example #12
    def ret_and_make_ex(self, input, lsh, ex_list, startat, train_mode=True):
        ret_list = []
        dist_list = []
        for batch in chunks(input, 128):
            idxlist, dist = self.ret_idx(batch, lsh, train_mode=train_mode)
            ret_tmp = [ex_list[idx[startat]] for idx in idxlist]
            dist_tmp = [d[0] for d in dist]
            ret_list.extend(ret_tmp)
            dist_list.extend(dist_tmp)

        edit_examples = self.make_editexamples(ret_list, input)
        for i, ex in enumerate(edit_examples):
            ex.dist = dist_list[i]
        return edit_examples
Example #13
    def launch(self, example_uids):
        """Launch task.

        Args:
            example_uids (list[str]): list of example_uids to launch the task with

        """
        batches = list(chunks(example_uids, self._batch_size))

        total_hits = len(batches)
        assert isinstance(self._price_per_hit, float)
        total_cost = total_hits * self._price_per_hit

        print('Launching {} HITs (${}). Press Enter to continue.'.format(total_hits, total_cost))
        input()

        parallel_call(self.create_hit, batches)
Example #14
    def get_vectors(self, tset):
        """
        :param tset: list of training examples
        :return: vec_list (joint encoding) and vec_list_in (context encoding)
        """
        vec_list = []
        vec_list_in = []
        for titem in chunks(tset, 128):
            edit_proc = self.preprocess(titem, volatile=True)
            agenda_out = self.encoder.target_out(edit_proc.encoder_input)
            agenda_in, _ = self.encoder.ctx_code_out(edit_proc.encoder_input)
            amat = agenda_out.data.cpu().numpy()
            amat_in = agenda_in.data.cpu().numpy()
            for i in range(amat.shape[0]):
                avec = amat[i] + amat_in[i]
                anorm = np.linalg.norm(avec)
                vec_list.append(avec / anorm)
                vec_list_in.append(amat_in[i] / np.linalg.norm(amat_in[i]))
        return vec_list, vec_list_in
Example #15
    def edit(self, examples, max_seq_length=35, beam_size=10, batch_size=256):
        """Performs edits on a batch of source sentences.

        Args:
            examples (list[EditExample])
            max_seq_length (int): max # timesteps to generate for
            beam_size (int): for beam decoding
            batch_size (int): max number of examples to pass into the RNN decoder at a time.
                The total # examples decoded in parallel = batch_size / beam_size.

        Returns:
            beam_list (list[list[list[unicode]]]): a batch of beams.
            edit_traces (list[EditTrace])
        """
        beam_list = []
        edit_traces = []
        for batch in chunks(examples, batch_size / beam_size):
            beams, traces = self._edit_batch(batch, max_seq_length, beam_size)
            beam_list.extend(beams)
            edit_traces.extend(traces)
        return beam_list, edit_traces
Example #16
    def from_sentences(self, query_sentences, k):
        query_embeds = self.seq_embedder.embed(
            query_sentences)  # (num_queries, embed_dim)
        query_embeds_normed = self.normalize(query_embeds)

        neighbors_dict = defaultdict(list)

        batch_size = 128
        target_batches = list(chunks(self.sentences, n=batch_size))
        for target_batch in verboserate(
                target_batches, desc='Embedding target sentences (batched)'):
            target_embeds = self.seq_embedder.embed(
                target_batch)  # (batch_size, embed_dim)
            target_embeds_normed = self.normalize(target_embeds)

            # NOTE: we are actually computing sqrt(2 - 2 * cos(theta)), not theta

            # <a, b> = ||a|| ||b|| cos(theta) = cos(theta)
            cos_thetas_batch = torch.mm(query_embeds_normed,
                                        target_embeds_normed.transpose(
                                            0, 1))  # (num_queries, batch_size)
            scores_batch = torch.sqrt(2 - 2 * cos_thetas_batch)

            scores_batch = scores_batch.data.cpu().numpy()

            for i, query in enumerate(query_sentences):
                for j, target in enumerate(target_batch):
                    score = scores_batch[i, j]
                    neighbors_dict[tuple(query)].append((target, score))

        neighbors_batch = []
        for query in query_sentences:
            neighbors = neighbors_dict[tuple(query)]
            # sort by score ascending: the score is a distance, so smaller means more similar
            neighbors = sorted(neighbors, key=lambda pair: pair[1])
            neighbors = neighbors[:k]
            neighbors_batch.append(neighbors)

        return neighbors_batch
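# Sanity check (not from the original code) for the score above: for unit vectors a and b,
# ||a - b||^2 = 2 - 2 * <a, b>, so sqrt(2 - 2 * cos(theta)) is the Euclidean distance between the
# normalized embeddings, and a smaller score means a more similar sentence.
import numpy as np
rng = np.random.RandomState(0)
a, b = rng.randn(4), rng.randn(4)
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)
assert np.isclose(np.linalg.norm(a - b), np.sqrt(2 - 2 * np.dot(a, b)))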
Example #17
    def _compute_metrics(cls, editor, examples, num_evaluate_examples,
                         batch_size):
        """

        Args:
            editor (Editor)
            examples (list[EditExample])
            num_evaluate_examples (int)
            batch_size (int)

        Returns:
            stats (dict[str, float])
            edit_traces (list[EditTrace]): of length num_evaluate_examples
            loss_traces (list[LossTrace]): of length num_evaluate_examples

        """
        sample = sample_if_large(examples,
                                 num_evaluate_examples,
                                 replace=False)

        # compute loss
        # need to break the sample into batches, in case the sample is too large to fit in GPU memory
        losses, loss_traces, weights, enc_losses = [], [], [], []

        for batch in verboserate(chunks(sample, batch_size),
                                 desc='Computing loss on examples'):
            weights.append(len(batch))
            loss_var, loss_trace_batch, enc_loss = editor.loss(batch)

            # convert loss Variable into float
            loss_val = loss_var.data[0]
            assert isinstance(loss_val, float)
            losses.append(loss_val)
            enc_losses.append(enc_loss)

            loss_traces.extend(loss_trace_batch)

        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
        enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

        punct_table = dict.fromkeys(
            i for i in xrange(sys.maxunicode)
            if unicodedata.category(unichr(i)).startswith('P'))

        def remove_punct(s):
            new_s = []
            for t in s:
                t = unicode(t).translate(punct_table)
                if t != '':
                    new_s.append(t)
            return new_s

        metrics = {
            'bleu': (bleu, max),
            'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s)
                          if len(s) > 0 else len(t), min),
            'exact_match':
            (lambda s, t: 1.0
             if remove_punct(s) == remove_punct(t) else 0.0, max)
        }

        top_results = defaultdict(list)
        top5_results = defaultdict(list)

        # compute predictions
        beams, edit_traces = editor.edit(sample,
                                         batch_size=batch_size,
                                         max_seq_length=150,
                                         verbose=True)
        for ex, beam in izip(sample, beams):
            top = beam[0]
            top5 = beam[:5]
            target = ex.target_words
            for name, (fxn, best) in metrics.items():
                top_results[name].append(fxn(top, target))
                top5_results[name].append(
                    best(fxn(predict, target) for predict in top5))

        # compute averages
        stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
        stats_top5 = {
            '{}_top5'.format(name): np.mean(vals)
            for name, vals in top5_results.items()
        }

        # combine into a single stats object
        stats = {'loss': loss, 'enc_loss': enc_loss}
        stats.update(stats_top)
        stats.update(stats_top5)

        return stats, edit_traces, loss_traces
Example #18
src_dir = os.environ['COPY_EDIT_DATA'] + 'edit_runs/7'  # for codalab
load_expt = RetrieveEditTrainingRun(config, src_dir)

###
# retedit model
import numpy as np

ret_model = load_expt.editor.ret_model
# edit_model = load_expt.editor.edit_model # since we only care about the retriever here
examples = load_expt._examples

from gtd.utils import chunks
from tqdm import tqdm

new_vecs = []
for batch in tqdm(chunks(examples.train, 32), total=len(examples.train) / 32):
    encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
    for vec in encin:
        new_vecs.append(vec)
    del encin

new_lsh = ret_model.make_lsh(new_vecs)

eval_num = 500
validation_files = list((pathlib2.Path.cwd() / 'github_data' /
                         'processed_repo_pkl').glob('*.pickle'))

# valid_eval = ret_model.ret_and_make_ex(examples.valid[0:eval_num], new_lsh, examples.train, 0, train_mode=False)
# # beam_list, edit_traces = edit_model.edit(valid_eval) # since we only care about ret
#
# ### other
Example #19
config = Config.from_file('editor_code/configs/editor/github.txt')
src_dir = os.environ['COPY_EDIT_DATA'] + '/edit_runs/0'  #for codalab
load_expt = RetrieveEditTrainingRun(config, src_dir)

import numpy as np

vae_editor = load_expt.editor.vae_model
ret_model = load_expt.editor.ret_model
edit_model = load_expt.editor.edit_model
examples = load_expt._examples

from gtd.utils import chunks
from tqdm import tqdm

new_vecs = []
for batch in tqdm(chunks(examples.train, 32), total=len(examples.train) / 32):
    encin = ret_model.encode(batch, train_mode=False).data.cpu().numpy()
    for vec in encin:
        new_vecs.append(vec)
    del encin

new_lsh = ret_model.make_lsh(new_vecs)

eval_num = 500
valid_eval = ret_model.ret_and_make_ex(examples.test[0:eval_num],
                                       new_lsh,
                                       examples.train,
                                       0,
                                       train_mode=False)
beam_list, edit_traces = edit_model.edit(valid_eval)
            voc_vec_rest[in_vocab_id[i]] = 0
            if in_vocab_id[i] == unk_idx:
                gold_rank = np.sum(voc_vec_rest >= voc_vec[copy_token_id[i]])
            else:
                gold_rank = np.sum(voc_vec_rest >= voc_vec[copy_token_id[i]] +
                                   voc_vec[in_vocab_id[i]])
            if target_mask[i] == 1.0:
                all_ranks_noret[i].append(gold_rank)
        position += 1
    del token_list
    del vocab_probs
    return all_ranks_noret


all_ranks_noret = []
for chunk in tqdm(chunks(examples.test[0:eval_num], 16), total=eval_num / 16):
    all_ranks_noret.extend(eval_batch_noret(chunk))

###
# base retriever.
import gtd.retrieval_func as rf
lsh, dict = rf.make_hash(examples.train)
output_index = rf.grab_nbs(examples.test[0:eval_num], lsh, dict)
ret_pred = rf.generate_predictions(examples.train, output_index)


def agree_vec(ref, targ):
    rank_vec = []
    for i in range(max(len(ref), len(targ))):
        if i < len(targ) and i < len(ref):
            agree_ind = ref[i] == targ[i]