def train_step(self, cases, weights, caching):
    if len(cases) != len(weights):
        raise ValueError('cases and weights must have the same length.')

    if len(cases) == 0:
        # logging.warn('Training on zero cases.')
        print >> sys.stderr, " WARNING: Zero cases \033[F"
        # still increment the step
        sess = tf.get_default_session()
        sess.run(self._increment_step)
    elif not self._max_batch_size or len(cases) <= self._max_batch_size:
        print >> sys.stderr, " Updating ({} cases) \033[F".format(len(cases))
        self.compute(self._take_step, cases, weights, caching)
    else:
        print >> sys.stderr, " Updating ({} cases) \033[F".format(len(cases))
        assert not caching
        # accumulate gradients over slices that each fit within the max batch size
        grads = None
        slices = range(0, len(cases), self._max_batch_size)
        for i in verboserate(slices, desc='Computing gradients ({} cases)'.format(len(cases))):
            cases_slice = cases[i:i + self._max_batch_size]
            weights_slice = weights[i:i + self._max_batch_size]
            grads_slice = self.compute(self._grad_tensors, cases_slice, weights_slice, False)
            if grads is None:
                grads = grads_slice
            else:
                # use a separate loop variable so we don't shadow the slice index i
                for j in xrange(len(self._grad_tensors)):
                    grads[j] += grads_slice[j]
        sess = tf.get_default_session()
        feed_dict = dict(zip(self._combined_grad_placeholders, grads))
        sess.run(self._apply_gradients, feed_dict)
        sess.run(self._increment_step)
def __init__(self, vocab_size=400000):
    """Load GloveEmbeddings.

    Args:
        vocab_size (int): max # of words in the vocab. If not specified, uses all
            available GloVe vectors (400,000).
    """
    embed_dim = 100
    if vocab_size < 5000:
        raise ValueError('Need to use at least 5000 words.')

    glove_path = join(DataDirectory.glove, 'glove.6B.100d.txt')
    download_path = 'http://nlp.stanford.edu/data/glove.6B.zip'
    if not os.path.exists(glove_path):
        raise RuntimeError('Missing file: {}. Download it here: {}'.format(glove_path, download_path))

    # embeddings for special words (zeros are just placeholders for now)
    words = list(UtteranceVocab.SPECIAL_TOKENS)
    num_special = len(words)
    embeds = [np.zeros(embed_dim, dtype=np.float32) for _ in words]

    with open(glove_path, 'r') as f:
        lines = verboserate(f, desc='Loading GloVe embeddings', total=vocab_size, initial=num_special)
        for i, line in enumerate(lines, start=num_special):
            if i == vocab_size:
                break
            tokens = line.split()
            word, embed = tokens[0], np.array([float(tok) for tok in tokens[1:]])
            words.append(word)
            embeds.append(embed)

    vocab = UtteranceVocab(words)
    embed_matrix = np.stack(embeds)

    # replace the placeholder special-token embeddings with vectors drawn to match
    # the empirical distribution of the first 5000 GloVe vectors
    special_embeds = emulate_distribution((num_special, embed_dim), embed_matrix[:5000, :], seed=2)
    embed_matrix[:num_special, :] = special_embeds
    assert embed_matrix.shape[1] == embed_dim

    super(GloveEmbeddings, self).__init__(embed_matrix, vocab)
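# Hedged usage sketch: load the 100k most frequent GloVe vectors plus the special-token
# embeddings. Assumes glove.6B.100d.txt is already present under DataDirectory.glove.
# Note that the special tokens count toward vocab_size, since enumeration starts at
# num_special, so only vocab_size - num_special GloVe rows are actually read.
glove = GloveEmbeddings(vocab_size=100000)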
def _get_all_hits(get_page):
    """Given a function that retrieves a single page of HITs, retrieve all HITs.

    WARNING:
    - this function can be quite slow.
    - results are returned in no particular order.

    Args:
        get_page (Callable[[int, int], list[HIT]]): a function which takes a page size
            and a page number (as the keyword arguments page_size and page_number) and
            returns a list of HITs.

    Returns:
        generator[HIT]
    """
    page_size = 100  # HITs per page

    # compute the pages that need to be fetched
    search_results = get_page(page_size=page_size, page_number=1)
    total_hits = int(search_results.TotalNumResults)
    total_pages = total_hits / page_size + bool(total_hits % page_size)
    page_nums = list(range(1, total_pages + 1))

    # fetch all the pages in parallel
    fetch_page = lambda i: get_page(page_size=page_size, page_number=i)
    with SimpleExecutor(fetch_page) as executor:
        for i in page_nums:
            executor.submit(i, i)
        for i, page in verboserate(executor.results(), desc='Fetching pages of HITs', total=total_pages):
            if isinstance(page, Failure):
                print page.traceback
                continue
            for hit in page:
                yield hit
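# Hedged usage sketch: a toy get_page backed by an in-memory list, standing in for a
# real MTurk search call. The only contract _get_all_hits relies on is that the first
# page's return value exposes a TotalNumResults attribute.
class _FakePage(list):
    TotalNumResults = '250'

def _toy_get_page(page_size, page_number):
    start = (page_number - 1) * page_size
    return _FakePage(range(250)[start:start + page_size])

all_hits = list(_get_all_hits(_toy_get_page))  # 250 toy "HITs", fetched 100 per page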
def train_edit(self, use_lsh, topk):
    # TODO(kelvin): do something to preserve random state upon reload?
    train_state = self.train_state
    examples = self._examples
    config = self.config
    workspace = self.workspace

    vae_editor = train_state.model.vae_model
    ret_model = train_state.model.ret_model
    edit_model = train_state.model.edit_model

    # set up static editor training
    step = train_state.train_steps
    while step < 3 * config.optim.max_iters:
        train_eval = ret_model.ret_and_make_ex(examples.train, use_lsh, examples.train, 1)
        valid_eval = ret_model.ret_and_make_ex(examples.valid, use_lsh, examples.train, 0)
        ret_batches = similar_size_batches(train_eval, config.optim.batch_size)
        random.shuffle(ret_batches)
        for batch in verboserate(ret_batches, desc='Streaming training for retrieval'):
            # set up pairs to edit on
            fict_batch = edit_model.ident_mapper(batch, config.model.ident_pr)
            edit_loss, _, _ = edit_model.loss(fict_batch)
            loss = edit_loss
            finite_grads, grad_norm = self._take_grad_step(train_state, loss)
            self.check_gradnan(finite_grads, train_state, workspace)
            step = train_state.train_steps
            self.eval_and_save(edit_model, step, train_state, config, grad_norm, train_eval, valid_eval)
            if step >= 3 * config.optim.max_iters:
                break
def _train(cls, config, train_state, examples):
    model = train_state.model
    optimizer = train_state.optimizer
    train_batches = similar_size_batches(
        examples.train, config.optim.batch_size, size=lambda ex: len(ex))

    while True:
        random.shuffle(train_batches)
        i = 0  # cannot enumerate(verboserate(...))
        for batch in verboserate(train_batches, desc='Streaming training examples'):
            loss = model.loss(batch, cls._train_state.train_steps)
            cls._take_grad_step(train_state, loss)
            if (i % 100) == 0:
                cls.evaluate()
            if (i % 1000) == 0:
                if config.model.type == 1:  # SVAE
                    # write interpolations to file
                    fname = "interps_batches_{}".format(i)
                    num_ex = 10
                    a_idx = np.random.randint(len(batch), size=num_ex)
                    b_idx = np.random.randint(len(batch), size=num_ex)
                    interps = []
                    for a, b in zip(a_idx, b_idx):
                        ex_a = batch[a]
                        ex_b = batch[b]
                        interpolation = model._interpolate_examples(ex_a, ex_b)
                        interpolation_repr = [" ".join(ex_a)]
                        interpolation_repr.extend([" ".join(ex) for ex in interpolation])
                        interpolation_repr.append(" ".join(ex_b))
                        interps.append(interpolation_repr)
                    with open(join(cls._interps_dir, fname), 'w') as fout:
                        data = "\n\n".join(["\n".join(ex) for ex in interps])
                        fout.write(data.encode('utf-8'))
            if (i % 5000) == 0:
                cls.checkpoints.save(train_state)
            i += 1
def examples_from_file(path):
    """Return list[EditExample] from file path."""
    examples = []

    # count total lines before loading
    total_lines = int(local('wc -l {}'.format(path), capture=True).split()[0])
    print("----> TOTAL LINES", total_lines)

    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in verboserate(f, desc='Reading data file.', total=total_lines):
            print("PROCESSING", line)
            src, trg = line.strip().lower().split('\t')
            src_words = src.split(' ')
            trg_words = trg.split(' ')
            assert len(src_words) > 0
            assert len(trg_words) > 0
            # use_diff and free_set are expected to be bound in the enclosing scope
            if use_diff:
                ex = EditExample.salient_diff(src_words, trg_words, free_set)
            else:
                ex = EditExample.whitelist_blacklist(src_words, trg_words)
            examples.append(ex)
    return examples
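# Hedged usage sketch: load EditExamples from a hypothetical tab-separated file with
# one "source<TAB>target" pair per line. Because use_diff and free_set are read as
# free variables, they must be defined in the enclosing scope before calling this.
train_examples = examples_from_file('data/train.tsv')
print("Loaded {} examples".format(len(train_examples)))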
def view(self, select=lambda path: True):
    """View runs.

    Args:
        select (Callable[[str], bool]): given a path to a run, returns True if we want
            to display the run, False otherwise.
    """
    field_names = list(self._renderers.keys())
    table = PrettyTable(field_names=field_names)
    types = OrderedDict((n, set()) for n in field_names)

    for i, path in verboserate(list(self._runs._int_dirs.items()), desc='Scanning runs.'):
        if not select(path):
            continue
        row = []
        for render in list(self._renderers.values()):
            try:
                s = render(path)
            except Exception:
                s = ''
            row.append(s)
        # record types for each attribute
        for name, elem in zip(field_names, row):
            types[name].add(type(elem))
        table.add_row(row)
    self._print_table(table)

    # display types for each attribute
    type_table = PrettyTable(['attribute', 'types'])
    for name, type_set in types.items():
        type_table.add_row([name, ', '.join(t.__name__ for t in type_set)])
    self._print_table(type_table)
def edit(self, examples, max_seq_length=150, beam_size=5, batch_size=64, constrain_vocab=False, verbose=False):
    """Perform edits on a batch of source sentences.

    Args:
        examples (list[EditExample])
        max_seq_length (int): max # timesteps to generate for
        beam_size (int): for beam decoding
        batch_size (int): max number of examples to pass into the RNN decoder at a time.
            The total # examples decoded in parallel = batch_size / beam_size.
        constrain_vocab (bool): default is False
        verbose (bool): default is False

    Returns:
        beam_list (list[list[list[unicode]]]): a batch of beams.
        edit_traces (list[EditTrace])
    """
    self.eval()  # set to evaluation mode, for dropout to work correctly

    beam_list = []
    edit_traces = []
    batches = chunks(examples, batch_size / beam_size)
    batches = verboserate(batches, desc='Decoding examples') if verbose else batches
    for batch in batches:
        beams, traces = self._edit_batch(batch, max_seq_length, beam_size, constrain_vocab)
        beam_list.extend(beams)
        edit_traces.extend(traces)

    self.train()  # set back to train mode
    return beam_list, edit_traces
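# Hedged usage sketch: decode a beam for each example and keep the top candidate.
# `editor` is assumed to be an instance of this class and `examples` a list[EditExample].
beam_list, edit_traces = editor.edit(examples, beam_size=5, batch_size=64, verbose=True)
top_predictions = [beam[0] for beam in beam_list]  # best decoded token sequence per example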
def _train(cls, config, train_state, examples, workspace, metadata, tb_logger):
    """Train a model.

    NOTE: modifies TrainState in place.
    - parameters of the Editor and Optimizer are updated
    - train_steps is updated
    - random number generator states are updated at every checkpoint

    Args:
        config (Config)
        train_state (TrainState): initial TrainState. Includes the Editor and Optimizer.
        examples (EditDataSplits)
        workspace (Workspace)
        metadata (Metadata)
        tb_logger (tensorboard_logger.Logger)
    """
    with random_state(train_state.random_state):
        editor = train_state.editor
        optimizer = train_state.optimizer
        noiser = EditNoiser(config.editor.ident_pr, config.editor.attend_pr)
        train_batches = similar_size_batches(examples.train, config.optim.batch_size)

        # test batching!
        # commented out for now, not certain why there is a batching error:
        # editor.test_batch(noiser(train_batches[0]))

        while True:
            # TODO(kelvin): this shuffle and the position within the shuffle is not properly restored upon reload
            random.shuffle(train_batches)
            for batch in verboserate(train_batches, desc='Streaming training examples'):
                # compute gradients
                optimizer.zero_grad()
                if config.editor.edit_dropout:
                    noised_batch = noiser(batch)
                else:
                    noised_batch = batch
                # loss = editor.loss(noised_batch, draw_samples=config.editor.enable_vae)
                var_loss, var_params, var_param_grads = editor.loss(noised_batch, draw_samples=config.editor.enable_vae)
                # reg_loss.backward()
                # loss.backward()

                # Previous clipping scheme (disabled): for the first 50 steps, only observe the
                # gradient norm (clip to infinity) and track the max; afterwards, clip to the
                # tracked max. clip_grad_norm returns the gradient norm BEFORE clipping.
                #   if train_state.train_steps < 50:
                #       grad_norm = clip_grad_norm(editor.parameters(), float('inf'), norm_type=2)
                #       train_state.track_grad_norms(grad_norm)
                #       metadata['max_grad_norm'] = train_state.max_grad_norm
                #   else:
                #       grad_norm = clip_grad_norm(editor.parameters(), train_state.max_grad_norm)

                # Always clip gradients.
                # TODO: make the max norm tunable, not hard-coded.
                grad_norm = clip_grad_norm(editor.parameters(), 5.)
                finite_grads = cls._finite_grads(editor.parameters())

                # take a step if the grads are finite
                if finite_grads:
                    optimizer.step()

                # increment step count
                train_state.increment_train_steps()

                # somehow we encountered NaN
                if not finite_grads:
                    # dump parameters
                    train_state.save(workspace.nan_checkpoints)

                    # dump offending example batch
                    examples_path = join(workspace.nan_checkpoints, '{}.examples'.format(train_state.train_steps))
                    with open(examples_path, 'w') as f:
                        pickle.dump(noised_batch, f)

                    print 'Gradient was NaN/inf on step {}.'.format(train_state.train_steps)

                    # if there were more than 5 NaNs in the last 10 steps, drop into the debugger
                    nan_steps = cls._checkpoint_numbers(workspace.nan_checkpoints)
                    recent_nans = [s for s in nan_steps if s > train_state.train_steps - 10]
                    if len(recent_nans) > 5:
                        print 'Too many NaNs encountered recently: {}. Entering debugger.'.format(recent_nans)
                        import pdb
                        pdb.set_trace()

                # run periodic evaluation and saving
                if train_state.train_steps % config.eval.eval_steps == 0:
                    cls._evaluate(config, editor, examples, metadata, tb_logger, train_state.train_steps, noiser, big_eval=False)
                    tb_logger.log_value('grad_norm', grad_norm, train_state.train_steps)

                if train_state.train_steps % config.eval.big_eval_steps == 0:
                    cls._evaluate(config, editor, examples, metadata, tb_logger, train_state.train_steps, noiser, big_eval=True)

                if train_state.train_steps % config.eval.save_steps == 0:
                    train_state.update_random_state()
                    train_state.save(workspace.checkpoints)

                if train_state.train_steps >= config.optim.max_iters:
                    return
from git import Repo
from os.path import join
import sys

print sys.path

from gtd.git_utils import commit_diff
from gtd.chrono import verboserate


repo_path = sys.argv[1]
max_count = int(sys.argv[2])
files = set(sys.argv[3:])


def format_commit(c):
    msg = c.message.split('\n')[0]
    return '{}\t{}'.format(c.hexsha, msg)


repo = Repo(repo_path)
commits = list(repo.iter_commits('master', max_count=max_count))
lines = []
for c in verboserate(commits, desc='Scanning commits', total=max_count):
    if len(files & commit_diff(c)) == 0:
        continue
    lines.append(format_commit(c))

log_path = join(repo_path, 'git-logs.tsv')
with open(log_path, 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')
def _compute_metrics(cls, editor, examples, num_evaluate_examples, batch_size):
    """
    Args:
        editor (Editor)
        examples (list[EditExample])
        num_evaluate_examples (int)
        batch_size (int)

    Returns:
        stats (dict[str, float])
        edit_traces (list[EditTrace]): of length num_evaluate_examples
        loss_traces (list[LossTrace]): of length num_evaluate_examples
    """
    sample = sample_if_large(examples, num_evaluate_examples, replace=False)

    # compute loss
    # need to break the sample into batches, in case the sample is too large to fit in GPU memory
    losses, loss_traces, weights, enc_losses = [], [], [], []
    for batch in verboserate(chunks(sample, batch_size), desc='Computing loss on examples'):
        weights.append(len(batch))
        loss_var, loss_trace_batch, enc_loss = editor.loss(batch)

        # convert loss Variable into float
        loss_val = loss_var.data[0]
        assert isinstance(loss_val, float)

        losses.append(loss_val)
        enc_losses.append(enc_loss)
        loss_traces.extend(loss_trace_batch)

    losses, weights = np.array(losses), np.array(weights)
    loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
    enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

    # table mapping every Unicode punctuation codepoint to None, used to strip punctuation
    punct_table = dict.fromkeys(
        i for i in xrange(sys.maxunicode)
        if unicodedata.category(unichr(i)).startswith('P'))

    def remove_punct(s):
        new_s = []
        for t in s:
            t = unicode(t).translate(punct_table)
            if t != '':
                new_s.append(t)
        return new_s

    # each metric is paired with the reduction used to pick the best of the top-5 candidates
    metrics = {
        'bleu': (bleu, max),
        'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s) if len(s) > 0 else len(t), min),
        'exact_match': (lambda s, t: 1.0 if remove_punct(s) == remove_punct(t) else 0.0, max),
    }
    top_results = defaultdict(list)
    top5_results = defaultdict(list)

    # compute predictions
    beams, edit_traces = editor.edit(sample, batch_size=batch_size, max_seq_length=150, verbose=True)
    for ex, beam in izip(sample, beams):
        top = beam[0]
        top5 = beam[:5]
        target = ex.target_words
        for name, (fxn, best) in metrics.items():
            top_results[name].append(fxn(top, target))
            top5_results[name].append(best(fxn(predict, target) for predict in top5))

    # compute averages
    stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
    stats_top5 = {'{}_top5'.format(name): np.mean(vals) for name, vals in top5_results.items()}

    # combine into a single stats object
    stats = {'loss': loss, 'enc_loss': enc_loss}
    stats.update(stats_top)
    stats.update(stats_top5)

    return stats, edit_traces, loss_traces
def examples_from_file(data_paths, seq_length_limit, fname):
    examples = {}
    MAX_LINE_LENGTH = 128
    name = '{}.pickle'.format(fname)
    file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name
    # if os.path.exists(str(file)):
    #     with open(str(file), 'rb') as f:
    #         examples = pickle.load(f)
    #     return list(examples.values())

    # count total files before loading
    num_direct = len(data_paths)
    for line in verboserate(data_paths, desc='Reading data file.', total=num_direct):
        df = pd.read_csv(line, skiprows=2, header=None, names=[0, 1], dtype=str).fillna(NO_CONTEXT_WORD)
        df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
        # df[0] = df[0].apply(lambda x: preprocess_tokens(x, MAX_LINE_LENGTH))
        df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
        try:
            ex = []
            for i, row in df.iterrows():
                try:
                    ex.append(EditExample(row[0], row[1]))
                except Exception:
                    # bad formatting in this row of the file; skip it
                    pass

            # skip sequences that are too long, because they use up memory
            ex = list(ifilterfalse(lambda x: max_seq_length(x) > seq_length_limit, ex))

            # pickle the examples for this file individually, keyed by (file name, # examples)
            file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
            k = str(line).split('/')[-1].split('.')[0]
            pick_obj = {(str(line).split('/')[-1], len(ex)): ex}
            obj_name = str(file / k) + '.pickle'
            with open(obj_name, 'wb') as f:
                pickle.dump(pick_obj, f)
        except Exception as e:
            print e
            print 'bad formatting in file ' + str(line).split('/')[-1]
            print line

    # NOTE: `examples` is never populated above (the earlier whole-dict caching path is
    # disabled), so this currently returns an empty list; the per-file pickles written
    # above hold the actual examples.
    return list(examples.values())
def examples_to_supervised_cases(examples):
    """Return a generator of supervised ParseCases."""
    for example in verboserate(examples, desc='Streaming supervised ParseCases'):
        for case in example_to_supervised_cases(example):
            yield case
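# Hedged usage sketch: the result is a generator, so cases can be consumed lazily,
# e.g. to count them without holding everything in memory at once (`examples` here
# is any iterable of examples accepted by example_to_supervised_cases).
num_cases = sum(1 for _ in examples_to_supervised_cases(examples))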
def batch_generator():
    while True:
        # WARNING: random state of train state does not exactly restore state anymore, due to this shuffle
        random.shuffle(train_batches)
        for batch in verboserate(train_batches, desc='Streaming example batches'):
            yield batch
from git import Repo
from os.path import join
import sys

print(sys.path)

from gtd.git_utils import commit_diff
from gtd.chrono import verboserate


repo_path = sys.argv[1]
max_count = int(sys.argv[2])
files = set(sys.argv[3:])


def format_commit(c):
    msg = c.message.split('\n')[0]
    return '{}\t{}'.format(c.hexsha, msg)


repo = Repo(repo_path)
commits = list(repo.iter_commits('master', max_count=max_count))
lines = []
for c in verboserate(commits, desc='Scanning commits', total=max_count):
    if len(files & commit_diff(c)) == 0:
        continue
    lines.append(format_commit(c))

log_path = join(repo_path, 'git-logs.tsv')
with open(log_path, 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')
def decode(self, examples, encoder_output, weighted_value_estimators, beam_size, prefix_hints,
           sibling_penalty, max_seq_length=50, top_k=5, verbose=False):
    """Beam decode.

    Args:
        examples (list[Example])
        encoder_output (EncoderOutput)
        weighted_value_estimators (list[(ValueEstimator, float)]): a list of (estimator, weight) pairs.
        beam_size (int)
        prefix_hints (list[list[unicode]]): a batch of prefixes. For each example, all returned
            results will start with the specified prefix.
        sibling_penalty (float)
        max_seq_length (int): maximum allowable length of outputted sequences
        top_k (int): number of beam candidates to show in trace
        verbose (bool): default is False

    Returns:
        beams (list[list[list[unicode]]]): a batch of beams of decoded sequences
        traces (list[BeamDecoderTrace])
    """
    rnn_state_orig, states_orig = self._initialize(self.decoder_cell, examples)

    # duplicate everything to beam_size
    duplicate = BeamDuplicator(beam_size)
    rnn_state = duplicate(rnn_state_orig)
    encoder_output = duplicate(encoder_output)

    states = []
    for state in states_orig:
        states.append(state)
        # Pad each example's beam with "doomed" states. These states are guaranteed to die
        # on the first round, because their sequence_prob = 0; they are just here as padding.
        # TODO(kelvin): WARNING! In the future, the ValueEstimators in BeamDecoder._advance might break
        # my assumption that any extension of a sequence with 0 prob will also have 0 prob.
        # If this assumption is broken, the BeamDecoder will return a beam of identical results.
        doomed = [DecoderState.initial_doomed(state.example)] * (beam_size - 1)
        states.extend(doomed)

    # perform iterations of beam search
    time_steps = range(max_seq_length)
    if verbose:
        time_steps = verboserate(time_steps, desc='Beam decoding sequences')

    states_over_time = []
    for _ in time_steps:
        # stop if all sequences have terminated
        if all(state.terminated for state in states):
            break
        rnn_state, states = self._advance(encoder_output, weighted_value_estimators, beam_size,
                                          rnn_state, states, sibling_penalty)
        states_over_time.append(states)

    return self._recover_sequences(states_over_time, beam_size, top_k)