def run_predict(dataset, inference, do_execute, inference_output_path,
                evaluate_on_all=False):
    """Runs inference of given model on eval set, and executes resulting code.

    Args:
        dataset: Dataset, iterable of CodeExample to evaluate on.
        inference: func, produces code for given CodeExamples.
        do_execute: func, runs given code with given arguments.
        inference_output_path: str, path to write the JSON predictions to.
        evaluate_on_all: bool, if True also evaluate on the examples' input tests.
    """
    assert inference_output_path is not None, "must provide path"
    assert not os.path.exists(
        inference_output_path), "must be a path that doesn't exist"
    assert os.path.isdir(
        os.path.dirname(inference_output_path)), "parent folder must exist"
    predictions = []
    success = total = 0
    pdataset = tqdm.tqdm(dataset)
    for batch in pdataset:
        results = inference(batch)
        for res, example in zip(results, batch.orig_examples):
            tests = []
            if evaluate_on_all:
                tests += list(example.input_tests)
            tests += list(example.tests)
            stats = executor.evaluate_code(res.code_sequence,
                                           example.schema.args, tests,
                                           do_execute)
            prediction = dict(
                output=res.info['candidates'][0],
                beams=res.info['candidates'],
                beams_correct=[
                    executor.evaluate_code(hypothesis, example.schema.args,
                                           tests, do_execute)
                    for hypothesis in res.info['candidates']
                ],
                is_correct=stats['correct'] == stats['total'],
                individual=stats['individual'],
                guid=example.guid,
            )
            if evaluate_on_all:
                prediction['passes_given_tests'] = all(
                    stats['individual'][:len(example.input_tests)])
            predictions.append(prediction)
            success += stats['correct'] == stats['total']
            total += 1
            pdataset.set_description(
                "Accuracy: {:.2f}%".format(success / total * 100))
    with open(inference_output_path, "w") as f:
        json.dump(predictions, f)
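# Illustrative only: a minimal sketch of consuming the predictions file that
# run_predict writes above. It relies only on keys visible in the code
# ('is_correct', 'guid'); the helper name and usage are hypothetical, not part
# of the original module.
def summarize_predictions(inference_output_path):
    import json
    with open(inference_output_path) as f:
        predictions = json.load(f)
    correct = sum(1 for p in predictions if p['is_correct'])
    print("Exact-match accuracy: {:.2f}% ({}/{})".format(
        100.0 * correct / len(predictions), correct, len(predictions)))
    # Return guids of failing examples for inspection.
    return [p['guid'] for p in predictions if not p['is_correct']]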
def _try_sequences(self, vocab, sequences, batch, beam_size):
    result = [[] for _ in range(len(batch))]
    counters = [0 for _ in range(len(batch))]
    candidates = [[] for _ in range(len(batch))]
    max_eval_trials = self.args.max_eval_trials or beam_size
    for batch_id, outputs in enumerate(sequences):
        example = batch[batch_id]
        # Convert token ids back to tokens for each beam hypothesis.
        candidates[batch_id] = [[vocab.itos(idx) for idx in ids]
                                for ids in outputs]
        for code in candidates[batch_id][:max_eval_trials]:
            counters[batch_id] += 1
            stats = executor.evaluate_code(code, example.schema.args,
                                           example.input_tests,
                                           self.executor.execute)
            ok = (stats['correct'] == stats['total'])
            if ok:
                # Keep the first hypothesis that passes all given tests.
                result[batch_id] = code
                break
    return [
        InferenceResult(
            code_sequence=seq,
            info={'trees_checked': c, 'candidates': cand})
        for seq, c, cand in zip(result, counters, candidates)
    ]
def _try_sequences(self, vocab, sequences, input_grids, output_grids,
                   beam_size):
    result = [[] for _ in range(len(sequences))]
    counters = [0 for _ in range(len(sequences))]
    candidates = [[] for _ in range(len(sequences))]
    max_eval_trials = self.args.max_eval_trials or beam_size
    for batch_id, outputs in enumerate(sequences):
        # Recover I/O test pairs from the grid tensors: each grid is flattened
        # and the indices of its active cells are recorded.
        input_tests = [
            {
                'input': np.where(inp.numpy().ravel())[0].tolist(),
                'output': np.where(out.numpy().ravel())[0].tolist(),
            } for inp, out in zip(
                torch.split(input_grids[batch_id].data.cpu(), 1),
                torch.split(output_grids[batch_id].data.cpu(), 1))
        ]
        candidates[batch_id] = [[vocab.itos(idx) for idx in ids]
                                for ids in outputs]
        for code in candidates[batch_id][:max_eval_trials]:
            counters[batch_id] += 1
            stats = executor.evaluate_code(code, None, input_tests,
                                           self.executor.execute)
            ok = (stats['correct'] == stats['total'])
            if ok:
                result[batch_id] = code
                break
    return [
        InferenceResult(
            code_sequence=seq,
            info={'trees_checked': c, 'candidates': cand})
        for seq, c, cand in zip(result, counters, candidates)
    ]
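# Illustrative only: a small, standalone sketch of the grid-to-test conversion
# used in _try_sequences above. A grid tensor is flattened and the indices of
# its non-zero cells become the test representation; the toy grid below is
# made up for demonstration.
import numpy as np
import torch

toy_grid = torch.zeros(1, 2, 3)
toy_grid[0, 0, 1] = 1
toy_grid[0, 1, 2] = 1
active_cells = np.where(toy_grid.numpy().ravel())[0].tolist()
# active_cells == [1, 5]: positions of the set cells in the flattened grid.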
def calculate_policy_gradient_loss(self, input_grids, io_embed, orig_examples,
                                   ref_code, ref_code_memory,
                                   ref_trace_memory):
    init_state = self.model.decoder.init_state(ref_code_memory,
                                               ref_trace_memory,
                                               io_embed.shape[0],
                                               io_embed.shape[1])
    memory = self.model.decoder.prepare_memory(io_embed, ref_code_memory,
                                               ref_trace_memory, ref_code)
    sequences = beam_search.beam_search(
        len(input_grids),
        init_state,
        memory,
        self.model.decode_token,
        self.args.max_beam_trees,
        cuda=self.args.cuda,
        max_decoder_length=self.args.max_decoder_length,
        return_beam_search_result=True,
        volatile=False,
        differentiable=True,
        use_length_penalty=self.args.use_length_penalty,
        factor=self.args.length_penalty_factor)
    output_code = self.model.decoder.postprocess_output(
        [[x.sequence for x in y] for y in sequences], memory)

    all_logits = []
    rewards = []
    for logit_beam, code_beam, example in zip(sequences, output_code,
                                              orig_examples):
        for i, (logits, code) in enumerate(zip(logit_beam, code_beam)):
            code = list(map(self.vocab.itos, code))
            # Log-probability of the whole sampled sequence.
            all_logits.append(
                torch.sum(
                    torch.cat([x.view(1) for x in logits.log_probs_torch])))
            run_cases = lambda tests: executor.evaluate_code(
                code, example.schema.args, tests, self.executor.execute)
            input_tests = run_cases(example.input_tests)
            reward = input_tests['correct'] / input_tests['total']
            if self.args.use_held_out_test_for_rl:
                held_out_test = run_cases(example.tests)
                # Held-out tests are worth as much as all the other ones combined.
                reward += held_out_test['correct']
            rewards.append(reward)
    all_logits = torch.cat([x.view(1) for x in all_logits])
    print(np.mean(rewards))
    rewards = torch.tensor(rewards)
    if not self.args.no_baseline:
        # Subtract the mean reward as a simple baseline to reduce variance.
        rewards = rewards - rewards.mean()
    if all_logits.is_cuda:
        rewards = rewards.cuda()
    return -(rewards * all_logits).mean()
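# Illustrative only: a toy, self-contained version of the REINFORCE-style
# estimator that calculate_policy_gradient_loss builds above. The
# log-probabilities and rewards here are made-up numbers; the point is the
# mean-reward baseline and the -(reward * log_prob) objective.
import torch

log_probs = torch.tensor([-2.3, -1.1, -4.0], requires_grad=True)
rewards = torch.tensor([1.0, 0.4, 0.0])
baseline = rewards.mean()
loss = -((rewards - baseline) * log_probs).mean()
loss.backward()  # gradients raise log-probs of samples with above-baseline reward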
def run_eval(tag, dataset, inference, do_execute, show_info=True,
             report_path=None, limit=None, evaluate_on_all=False):
    """Runs inference of given model on eval set, and executes resulting code.

    Args:
        tag: str, tag of the run to save report.
        dataset: Dataset, iterable of CodeExample to evaluate on.
        inference: func, produces code for given CodeExamples.
        do_execute: func, runs given code with given arguments.
        show_info: Show specific example additional information.
        report_path: str, optional path to save the report to.
        limit: int, optional cap on the number of batches to evaluate.
        evaluate_on_all: bool, if True also evaluate on the examples' input tests.
    """
    report = EvalReport(tag=tag, show_info=show_info, report_path=report_path)
    done = False
    try:
        for batch in limited(dataset, limit):
            start = time.time()
            results = inference(batch)
            for res, example in zip(results, batch.orig_examples):
                tests = []
                if evaluate_on_all:
                    tests += list(example.input_tests)
                tests += list(example.tests)
                stats = executor.evaluate_code(
                    res.code_tree if res.code_tree else res.code_sequence,
                    example.schema.args, tests, do_execute)
                report.add_example(example, res, stats)
            print("[Eval] Elapsed time for %d examples: %f" %
                  (len(batch.orig_examples), time.time() - start))
            report.display()
        done = True
    finally:
        print("Stopped.")
        report.save(done)
        report.display()
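# Illustrative only: run_eval iterates via a limited(dataset, limit) helper
# whose implementation is not shown here. One plausible sketch, assuming it
# simply caps the number of batches when a limit is given (the name
# limited_sketch marks it as hypothetical, not the repo's actual helper):
import itertools

def limited_sketch(iterable, limit=None):
    return iterable if limit is None else itertools.islice(iterable, limit)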
def test_results(self, code, example):
    return evaluate_code(code, example.schema.args, example.input_tests,
                         self.executor.execute)
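# Illustrative only: the helpers above consume the result of evaluate_code
# through its 'correct', 'total', and 'individual' keys. This stub shows that
# shape with made-up values; it is not the executor's actual output.
fake_stats = {
    'correct': 2,                       # number of tests the code passed
    'total': 3,                         # number of tests run
    'individual': [True, True, False],  # per-test pass/fail flags
}
is_correct = fake_stats['correct'] == fake_stats['total']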