def load_dataset(transition_system, dataset_file, reorder_predicates=True):
    """Load a lambda-calculus dataset from a tab-separated file.

    Each line of ``dataset_file`` is ``<utterance>\t<logical form>``. Every
    example is parsed, (optionally) canonicalized, converted to an ASDL AST,
    and its action sequence is replayed through a fresh ``Hypothesis`` as a
    sanity check before an ``Example`` is built.

    Args:
        transition_system: project transition system; supplies the grammar
            and the action oracle.
        dataset_file: path to the tab-separated dataset.
        reorder_predicates: if True, canonicalize predicate order
            alphabetically before conversion.

    Returns:
        list of ``Example`` objects.

    Raises:
        AssertionError: if any round-trip (parse -> AST -> surface form)
            does not reproduce the input exactly.
    """
    # Fix: `grammar` was previously an unresolved name inside this function;
    # derive it from the transition system as the sibling loaders do.
    grammar = transition_system.grammar
    examples = []
    for idx, line in enumerate(open(dataset_file)):
        src_query, tgt_code = line.strip().split('\t')
        src_query_tokens = src_query.split(' ')

        lf = parse_lambda_expr(tgt_code)
        assert lf.to_string() == tgt_code

        if reorder_predicates:
            ordered_lf = get_canonical_order_of_logical_form(
                lf, order_by='alphabet')
            # NOTE(review): this asserts the input is already in canonical
            # order rather than reordering it — kept as-is to preserve the
            # original behavior; confirm whether reordering was intended.
            assert ordered_lf == lf
            lf = ordered_lf

        gold_source = lf.to_string()
        tgt_ast = logical_form_to_ast(grammar, lf)
        reconstructed_lf = ast_to_logical_form(tgt_ast)
        assert lf == reconstructed_lf
        tgt_actions = transition_system.get_actions(tgt_ast)

        print(idx)
        print('Utterance: %s' % src_query)
        print('Reference: %s' % tgt_code)

        # Sanity check: replaying the oracle action sequence must rebuild
        # the gold surface form.
        hyp = Hypothesis()
        for action in tgt_actions:
            assert action.__class__ in transition_system.get_valid_continuation_types(
                hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(
                    hyp)
            hyp = hyp.clone_and_apply_action(action)
        assert hyp.frontier_node is None and hyp.frontier_field is None

        src_from_hyp = transition_system.ast_to_surface_code(hyp.tree)
        assert src_from_hyp == gold_source

        tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

        print('***')
        print(lf.to_string())
        print()

        example = Example(idx=idx,
                          src_sent=src_query_tokens,
                          tgt_actions=tgt_action_infos,
                          tgt_code=gold_source,
                          tgt_ast=tgt_ast,
                          meta=None)
        examples.append(example)
    return examples
def load_regex_dataset(transition_system, split):
    """Load one split of the regex dataset.

    Reads parallel source/spec files from ``data/regex/``, converts each
    spec into an ASDL AST, verifies the AST and its oracle action sequence
    round-trip back to the original spec string, and packages the result as
    ``Example`` objects.
    """
    data_dir = 'data/regex/'
    nl_path = join(data_dir, "src-{}.txt".format(split))
    spec_path = join(data_dir, "spec-{}.txt".format(split))

    loaded = []
    for ex_id, (nl_line, spec_raw) in enumerate(
            zip(open(nl_path), open(spec_path))):
        print(ex_id)
        nl_line = nl_line.rstrip()
        spec_raw = spec_raw.rstrip()
        nl_tokens = nl_line.split()
        spec_tokens = spec_raw.rstrip().split()

        ast_root = regex_expr_to_ast(transition_system.grammar, spec_tokens)

        # round-trip sanity check: AST must render back to the exact spec
        rebuilt = transition_system.ast_to_surface_code(ast_root)
        print(spec_raw, rebuilt)
        assert spec_raw == rebuilt

        action_seq = transition_system.get_actions(ast_root)

        # replay the oracle actions through a fresh hypothesis
        partial = Hypothesis()
        for act in action_seq:
            assert act.__class__ in transition_system.get_valid_continuation_types(
                partial)
            if isinstance(act, ApplyRuleAction):
                assert act.production in transition_system.get_valid_continuating_productions(
                    partial)
            partial = partial.clone_and_apply_action(act)
        assert partial.frontier_node is None and partial.frontier_field is None
        assert is_equal_ast(partial.tree, ast_root)
        assert transition_system.ast_to_surface_code(partial.tree) == spec_raw

        infos = get_action_infos(nl_tokens, action_seq)
        loaded.append(
            Example(idx=ex_id,
                    src_sent=nl_tokens,
                    tgt_actions=infos,
                    tgt_code=spec_raw,
                    tgt_ast=ast_root,
                    meta=None))
    return loaded
def load_dataset(transition_system, path, num, reorder_predicates=True):
    """Build Examples from the pages of every PDF under ``path``.

    Resumes from a pickled cache at ``data/pdf/train.bin`` when present:
    pages with a running index <= the cached length are skipped. Stops once
    ``num`` examples (counting cached ones) have been collected.

    Args:
        transition_system: project transition system providing the grammar.
        path: directory containing PDF files.
        num: total example count at which loading stops.
        reorder_predicates: unused here; kept for signature compatibility
            with the other loaders.

    Returns:
        list of ``Example`` objects (cached + newly parsed).
    """
    grammar = transition_system.grammar
    examples = []
    pre_len = 0
    if os.path.exists('data/pdf/train.bin'):
        # NOTE(review): pickle.load on a cache file — safe only if the cache
        # is produced locally by this pipeline, never from untrusted input.
        examples = pickle.load(open('data/pdf/train.bin', 'rb'))
        pre_len = len(examples)

    idx = 0
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        print(item)
        try:
            pdf = PdfReader(item_path)
        except Exception:
            # narrowed from a bare `except:` so Ctrl-C / SystemExit still work;
            # unreadable PDFs are skipped best-effort as before
            continue
        for page in pdf.pages:
            idx += 1
            if idx <= pre_len:
                continue  # already present in the cached examples
            print(idx)
            try:
                tgt_ast = pdf_to_ast(grammar, page, [])
            except Exception:
                # best-effort: skip pages that fail to convert
                continue
            tgt_actions = transition_system.get_actions(tgt_ast)
            tgt_action_infos = get_action_infos(tgt_actions)
            example = Example(idx=idx, tgt_actions=tgt_action_infos, meta=None)
            examples.append(example)
            if idx >= num:
                # Fix: a plain `break` here only left the page loop, so the
                # outer file loop kept appending one extra example per
                # remaining PDF beyond the requested limit.
                return examples
    return examples
def load_dataset(transition_system, dataset_file):
    """Load a dataset of ``<utterance>~<lisp expression>`` lines.

    Normalizes parenthesis spacing in the target expression, strips
    parentheses from the utterance, converts the expression to an ASDL AST,
    and verifies that both the AST and its oracle action sequence round-trip
    to the normalized expression before emitting an ``Example``.
    """
    all_examples = []
    for ex_idx, raw_line in enumerate(open(dataset_file)):
        print(raw_line)
        nl_part, code_part = raw_line.strip().split('~')

        # normalize the lisp expression: pad parens, collapse whitespace
        code_part = code_part.replace("("," ( ").replace(")"," ) ")
        code_part = " ".join(code_part.split())

        # the utterance side drops its parentheses entirely
        nl_part = nl_part.replace("(","").replace(")","")
        nl_tokens = nl_part.split(' ')

        ast_root = lisp_expr_to_ast(transition_system.grammar, code_part)
        assert code_part == ast_to_lisp_expr(ast_root)

        action_seq = transition_system.get_actions(ast_root)

        # sanity check: replay the actions and compare against the AST/code
        partial = Hypothesis()
        for act in action_seq:
            assert act.__class__ in transition_system.get_valid_continuation_types(partial)
            if isinstance(act, ApplyRuleAction):
                assert act.production in transition_system.get_valid_continuating_productions(partial)
            partial = partial.clone_and_apply_action(act)
        assert partial.frontier_node is None and partial.frontier_field is None
        assert is_equal_ast(partial.tree, ast_root)
        assert transition_system.ast_to_surface_code(partial.tree) == code_part

        infos = get_action_infos(nl_tokens, action_seq)
        print(ex_idx)
        all_examples.append(
            Example(idx=ex_idx,
                    src_sent=nl_tokens,
                    tgt_actions=infos,
                    tgt_code=code_part,
                    tgt_ast=ast_root,
                    meta=None))
    return all_examples
def parse_django_dataset(annot_file, code_file, asdl_file_path, max_query_len=70, vocab_freq_cutoff=10):
    """Parse the Django dataset into train/dev/test Examples plus a vocab.

    First pass: canonicalize each (annotation, code) pair, build its ASDL
    AST and oracle action sequence, and keep only examples whose action
    sequence replays back to the gold source. Second pass: truncate queries,
    attach action infos, split 0-16000 / 16000-17000 / rest, and build the
    source/primitive/code vocabularies from the training portion.

    Args:
        annot_file: path to the annotations (one utterance per line).
        code_file: path to the code snippets (parallel to annot_file).
        asdl_file_path: path to the ASDL grammar text.
        max_query_len: source queries are truncated to this many tokens.
        vocab_freq_cutoff: minimum frequency for vocabulary entries.

    Returns:
        ``((train_examples, dev_examples, test_examples), vocab)``.
    """
    asdl_text = open(asdl_file_path).read()
    grammar = ASDLGrammar.from_text(asdl_text)
    transition_system = PythonTransitionSystem(grammar)

    loaded_examples = []
    from components.vocab import Vocab, VocabEntry
    from components.dataset import Example

    for idx, (src_query, tgt_code) in enumerate(
            zip(open(annot_file), open(code_file))):
        src_query = src_query.strip()
        tgt_code = tgt_code.strip()

        src_query_tokens, tgt_canonical_code, str_map = Django.canonicalize_example(
            src_query, tgt_code)
        python_ast = ast.parse(tgt_canonical_code).body[0]
        gold_source = astor.to_source(python_ast).strip()
        tgt_ast = python_ast_to_asdl_ast(python_ast, grammar)
        tgt_actions = transition_system.get_actions(tgt_ast)

        # sanity check: replay the actions; examples that fail any check are
        # dropped (best-effort filtering, matching the original intent)
        try:
            hyp = Hypothesis()
            for t, action in enumerate(tgt_actions):
                # p_t / f_t feed an optional debug print; kept so frontier
                # access errors still cause the example to be skipped
                p_t = -1
                f_t = None
                if hyp.frontier_node:
                    p_t = hyp.frontier_node.created_time
                    f_t = hyp.frontier_field.field.__repr__(plain=True)
                hyp = hyp.clone_and_apply_action(action)
            assert hyp.frontier_node is None and hyp.frontier_field is None
            src_from_hyp = astor.to_source(
                asdl_ast_to_python_ast(hyp.tree, grammar)).strip()
            assert src_from_hyp == gold_source
        except Exception:
            # Fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            continue

        loaded_examples.append({
            'src_query_tokens': src_query_tokens,
            'tgt_canonical_code': gold_source,
            'tgt_ast': tgt_ast,
            'tgt_actions': tgt_actions,
            'raw_code': tgt_code,
            'str_map': str_map
        })

    train_examples = []
    dev_examples = []
    test_examples = []
    action_len = []

    for idx, e in enumerate(loaded_examples):
        src_query_tokens = e['src_query_tokens'][:max_query_len]
        tgt_actions = e['tgt_actions']
        tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

        example = Example(idx=idx,
                          src_sent=src_query_tokens,
                          tgt_actions=tgt_action_infos,
                          tgt_code=e['tgt_canonical_code'],
                          tgt_ast=e['tgt_ast'],
                          meta={
                              'raw_code': e['raw_code'],
                              'str_map': e['str_map']
                          })
        action_len.append(len(tgt_action_infos))

        # fixed train/dev/test split by example index
        if 0 <= idx < 16000:
            train_examples.append(example)
        elif 16000 <= idx < 17000:
            dev_examples.append(example)
        else:
            test_examples.append(example)

    print('Max action len: %d' % max(action_len), file=sys.stderr)
    print('Avg action len: %d' % np.average(action_len), file=sys.stderr)
    print('Actions larger than 100: %d' %
          len(list(filter(lambda x: x > 100, action_len))),
          file=sys.stderr)

    src_vocab = VocabEntry.from_corpus(
        [e.src_sent for e in train_examples],
        size=5000,
        freq_cutoff=vocab_freq_cutoff)

    primitive_tokens = [
        map(
            lambda a: a.action.token,
            filter(lambda a: isinstance(a.action, GenTokenAction),
                   e.tgt_actions)) for e in train_examples
    ]
    primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                             size=5000,
                                             freq_cutoff=vocab_freq_cutoff)
    assert '_STR:0_' in primitive_vocab

    # generate vocabulary for the code tokens!
    code_tokens = [
        tokenize_code(e.tgt_code, mode='decoder') for e in train_examples
    ]
    code_vocab = VocabEntry.from_corpus(code_tokens,
                                        size=5000,
                                        freq_cutoff=vocab_freq_cutoff)

    vocab = Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)
    print('generated vocabulary %s' % repr(vocab), file=sys.stderr)
    return (train_examples, dev_examples, test_examples), vocab
def preprocess_dataset(file_path, transition_system, name='train', firstk=None):
    """Preprocess a CoNaLa-style JSON (or JSON-lines) dataset into Examples.

    Each raw example is canonicalized, parsed to a Python AST, converted to
    an ASDL AST, and its oracle action sequence is replayed and
    de-canonicalized as a sanity check. Examples failing any check are
    skipped and counted. A human-readable log is written next to the input
    as ``<file_path>.debug``.

    Args:
        file_path: path to a JSON array or JSON-lines file of examples.
        transition_system: project transition system (grammar + oracle).
        name: unused label; kept for signature compatibility.
        firstk: if given, only the first k raw examples are processed.

    Returns:
        list of ``Example`` objects that passed all checks.
    """
    try:
        dataset = json.load(open(file_path))
    except json.JSONDecodeError:
        # Fix: was a bare `except:`; only a parse failure should trigger the
        # JSON-lines fallback — other errors (e.g. missing file) now surface.
        dataset = [json.loads(jline) for jline in open(file_path).readlines()]
    if firstk:
        dataset = dataset[:firstk]

    examples = []
    evaluator = ConalaEvaluator(transition_system)

    f = open(file_path + '.debug', 'w')
    skipped_list = []
    try:
        for i, example_json in enumerate(dataset):
            try:
                example_dict = preprocess_example(example_json)

                python_ast = ast.parse(example_dict['canonical_snippet'])
                canonical_code = astor.to_source(python_ast).strip()
                tgt_ast = python_ast_to_asdl_ast(python_ast,
                                                 transition_system.grammar)
                tgt_actions = transition_system.get_actions(tgt_ast)

                # sanity check: replay the oracle actions step by step
                hyp = Hypothesis()
                for t, action in enumerate(tgt_actions):
                    assert action.__class__ in transition_system.get_valid_continuation_types(
                        hyp)
                    if isinstance(action, ApplyRuleAction):
                        assert action.production in transition_system.get_valid_continuating_productions(
                            hyp)
                    hyp = hyp.clone_and_apply_action(action)

                assert hyp.frontier_node is None and hyp.frontier_field is None

                hyp.code = code_from_hyp = astor.to_source(
                    asdl_ast_to_python_ast(
                        hyp.tree, transition_system.grammar)).strip()
                assert code_from_hyp == canonical_code

                # the de-canonicalized replay must match the raw snippet too
                decanonicalized_code_from_hyp = decanonicalize_code(
                    code_from_hyp, example_dict['slot_map'])
                assert compare_ast(ast.parse(example_json['snippet']),
                                   ast.parse(decanonicalized_code_from_hyp))
                assert transition_system.compare_ast(
                    transition_system.surface_code_to_ast(
                        decanonicalized_code_from_hyp),
                    transition_system.surface_code_to_ast(
                        example_json['snippet']))

                tgt_action_infos = get_action_infos(
                    example_dict['intent_tokens'], tgt_actions)
            except (AssertionError, SyntaxError, ValueError, OverflowError):
                # best-effort: record the id and move on
                skipped_list.append(example_json['question_id'])
                continue

            example = Example(idx=f'{i}-{example_json["question_id"]}',
                              src_sent=example_dict['intent_tokens'],
                              tgt_actions=tgt_action_infos,
                              tgt_code=canonical_code,
                              tgt_ast=tgt_ast,
                              meta=dict(example_dict=example_json,
                                        slot_map=example_dict['slot_map']))
            assert evaluator.is_hyp_correct(example, hyp)
            examples.append(example)

            # log!
            f.write(f'Example: {example.idx}\n')
            if 'rewritten_intent' in example.meta['example_dict']:
                f.write(
                    f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n"
                )
            else:
                f.write(
                    f"Original Utterance: {example.meta['example_dict']['intent']}\n"
                )
            f.write(
                f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
            f.write(f"\n")
            f.write(f"Utterance: {' '.join(example.src_sent)}\n")
            f.write(f"Snippet: {example.tgt_code}\n")
            f.write(
                f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
            )
    finally:
        # Fix: the debug file leaked if anything escaped the loop
        f.close()

    print('Skipped due to exceptions: %d' % len(skipped_list), file=sys.stderr)
    return examples
def load_regex_dataset(transition_system, split):
    """Load one split of the StReg dataset.

    Alongside the source/target pairs this split carries per-example string
    examples, constant maps, and worker records, which are attached to each
    ``Example`` via ``meta``. Every target is round-tripped through the ASDL
    AST and the oracle action sequence as a sanity check.
    """
    data_dir = 'data/streg/'
    nl_path = join(data_dir, "src-{}.txt".format(split))
    spec_path = join(data_dir, "targ-{}.txt".format(split))
    cmap_path = join(data_dir, "map-{}.txt".format(split))
    str_exs_path = join(data_dir, "exs-{}.txt".format(split))
    worker_path = join(data_dir, "rec-{}.pkl".format(split))

    exs_info = StReg.load_examples(str_exs_path)
    map_info = StReg.load_map_file(cmap_path)
    rec_info = StReg.load_rec(worker_path)

    loaded = []
    zipped = zip(open(nl_path), open(spec_path), exs_info, map_info, rec_info)
    for ex_id, (nl_line, spec_raw, str_exs, cmap, rec) in enumerate(zipped):
        print(ex_id)
        nl_line = nl_line.rstrip()
        spec_raw = spec_raw.rstrip()
        nl_tokens = nl_line.split()
        spec_tokens = spec_raw.rstrip().split()

        ast_root = streg_expr_to_ast(transition_system.grammar, spec_tokens)

        # round-trip sanity check: AST must render back to the spec string
        rebuilt = transition_system.ast_to_surface_code(ast_root)
        assert spec_raw == rebuilt

        action_seq = transition_system.get_actions(ast_root)

        # replay the oracle actions through a fresh hypothesis
        partial = Hypothesis()
        for act in action_seq:
            assert act.__class__ in transition_system.get_valid_continuation_types(
                partial)
            if isinstance(act, ApplyRuleAction):
                assert act.production in transition_system.get_valid_continuating_productions(
                    partial)
            partial = partial.clone_and_apply_action(act)
        assert partial.frontier_node is None and partial.frontier_field is None
        assert is_equal_ast(partial.tree, ast_root)
        assert transition_system.ast_to_surface_code(partial.tree) == spec_raw

        infos = get_action_infos(nl_tokens, action_seq)
        loaded.append(
            Example(idx=ex_id,
                    src_sent=nl_tokens,
                    tgt_actions=infos,
                    tgt_code=spec_raw,
                    tgt_ast=ast_root,
                    meta={
                        "str_exs": str_exs,
                        "const_map": cmap,
                        "worker_info": rec
                    }))
    return loaded
def preprocess_dataset(file_path, transition_system, name='train'):
    """Preprocess a CoNaLa JSON dataset into Examples with full sanity checks.

    For every raw example: canonicalize, parse to a Python AST, convert to an
    ASDL AST, replay the oracle action sequence, and verify that both the
    canonical and de-canonicalized code round-trip exactly. A human-readable
    log is written to ``<file_path>.debug``.

    Args:
        file_path: path to a JSON array of raw examples.
        transition_system: project transition system (grammar + oracle).
        name: unused label; kept for interface compatibility.

    Returns:
        list of ``Example`` objects.

    Raises:
        AssertionError: on the first example that fails any sanity check
            (unlike the sibling variants, there is no skip-and-continue here).
    """
    dataset = json.load(open(file_path))
    examples = []
    evaluator = ConalaEvaluator(transition_system)
    # debug log written next to the input file
    f = open(file_path + '.debug', 'w')
    for i, example_json in enumerate(dataset):
        example_dict = preprocess_example(example_json)
        # These four question ids are excluded; presumably known-bad examples
        # — TODO confirm why they are dumped with pprint and skipped.
        if example_json['question_id'] in (18351951, 9497290, 19641579,
                                           32283692):
            pprint(preprocess_example(example_json))
            continue
        python_ast = ast.parse(example_dict['canonical_snippet'])
        canonical_code = astor.to_source(python_ast).strip()
        tgt_ast = python_ast_to_asdl_ast(python_ast, transition_system.grammar)
        tgt_actions = transition_system.get_actions(tgt_ast)

        # sanity check
        # Replay the oracle action sequence one step at a time, asserting
        # each action is a valid continuation of the partial hypothesis.
        hyp = Hypothesis()
        for t, action in enumerate(tgt_actions):
            assert action.__class__ in transition_system.get_valid_continuation_types(
                hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(
                    hyp)
            # p_t / f_t only feed the commented-out debug print below
            p_t = -1
            f_t = None
            if hyp.frontier_node:
                p_t = hyp.frontier_node.created_time
                f_t = hyp.frontier_field.field.__repr__(plain=True)
            # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
            hyp = hyp.clone_and_apply_action(action)

        # the hypothesis must be complete after the full action sequence
        assert hyp.frontier_node is None and hyp.frontier_field is None

        # the replayed AST must regenerate the canonical code exactly
        hyp.code = code_from_hyp = astor.to_source(
            asdl_ast_to_python_ast(hyp.tree,
                                   transition_system.grammar)).strip()
        assert code_from_hyp == canonical_code

        # de-canonicalization must reproduce the original raw snippet
        decanonicalized_code_from_hyp = decanonicalize_code(
            code_from_hyp, example_dict['slot_map'])
        assert compare_ast(ast.parse(example_json['snippet']),
                           ast.parse(decanonicalized_code_from_hyp))
        assert transition_system.compare_ast(
            transition_system.surface_code_to_ast(
                decanonicalized_code_from_hyp),
            transition_system.surface_code_to_ast(example_json['snippet']))

        tgt_action_infos = get_action_infos(example_dict['intent_tokens'],
                                            tgt_actions)

        example = Example(idx=f'{i}-{example_json["question_id"]}',
                          src_sent=example_dict['intent_tokens'],
                          tgt_actions=tgt_action_infos,
                          tgt_code=canonical_code,
                          tgt_ast=tgt_ast,
                          meta=dict(example_dict=example_json,
                                    slot_map=example_dict['slot_map']))
        assert evaluator.is_hyp_correct(example, hyp)

        examples.append(example)

        # log!
        # NOTE(review): 'rewritten_intent' is accessed unguarded here; the
        # sibling variant falls back to 'intent' when the key is absent —
        # confirm this dataset always provides it.
        f.write(f'Example: {example.idx}\n')
        f.write(
            f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n"
        )
        f.write(
            f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
        f.write(f"\n")
        f.write(f"Utterance: {' '.join(example.src_sent)}\n")
        f.write(f"Snippet: {example.tgt_code}\n")
        f.write(
            f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
        )
    f.close()
    return examples
def preprocess_dataset(file_path, transition_system, name='train', firstk=None):
    """Preprocess a CoNaLa-style JSON or JSON-lines dataset into Examples.

    Robust variant: examples that fail canonicalization or any sanity check
    are skipped (and their question ids collected) instead of aborting.
    Progress is shown with tqdm; per-example logging was removed because
    large log files caused hangs (see inline notes).

    Args:
        file_path: path to a JSON array or JSON-lines file; '/' or '\\'
            separated, re-rooted under the current working directory.
        transition_system: project transition system (grammar + oracle).
        name: unused label; kept for interface compatibility.
        firstk: if given, only the first k raw examples are processed.

    Returns:
        list of ``Example`` objects that passed all checks.
    """
    # normalize the path separator and re-root under the current directory
    file_path = os.path.join(
        os.getcwd(), *file_path.split('/' if '/' in file_path else "\\"))
    # NOTE(review): bare `except:` also catches KeyboardInterrupt/SystemExit;
    # the intent here is only to fall back to JSON-lines parsing when
    # json.load fails on the whole file.
    try:
        dataset = json.load(open(file_path))
    except:
        dataset = [json.loads(jline) for jline in open(file_path).readlines()]
    if firstk:
        dataset = dataset[:firstk]
    examples = []
    evaluator = ConalaEvaluator(transition_system)

    # Author: Gabe
    # Added in encoding to try and deal with UnicodeEncodeErrors
    f = open(file_path + '.debug', 'w', encoding='utf-8')
    skipped_list = []
    for i, example_json in tqdm(enumerate(dataset),
                                file=sys.stdout,
                                total=len(dataset),
                                desc='Preproc'):
        # Author: Gabe
        # Have to skip this one question because it causes the program to hang and never recover.
        if example_json['question_id'] in [39525993]:
            skipped_list.append(example_json['question_id'])
            tqdm.write(
                f"Skipping {example_json['question_id']} because it causes errors"
            )
            continue
        try:
            example_dict = preprocess_example(example_json)

            python_ast = ast.parse(example_dict['canonical_snippet'])
            canonical_code = astor.to_source(python_ast).strip()
            tgt_ast = python_ast_to_asdl_ast(python_ast,
                                             transition_system.grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # sanity check: replay the oracle actions one step at a time
            hyp = Hypothesis()
            for t, action in enumerate(tgt_actions):
                assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in \
                        transition_system.get_valid_continuating_productions(
                            hyp)
                # p_t = -1
                # f_t = None
                # if hyp.frontier_node:
                #     p_t = hyp.frontier_node.created_time
                #     f_t = hyp.frontier_field.field.__repr__(plain=True)
                #
                # # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
                hyp = hyp.clone_and_apply_action(action)

            # hypothesis must be complete after the full action sequence
            assert hyp.frontier_node is None and hyp.frontier_field is None

            # the replayed AST must regenerate the canonical code exactly
            hyp.code = code_from_hyp = astor.to_source(
                asdl_ast_to_python_ast(hyp.tree,
                                       transition_system.grammar)).strip()
            # print(code_from_hyp)
            # print(canonical_code)
            assert code_from_hyp == canonical_code

            # de-canonicalization must reproduce the original raw snippet
            decanonicalized_code_from_hyp = decanonicalize_code(
                code_from_hyp, example_dict['slot_map'])
            assert compare_ast(ast.parse(example_json['snippet']),
                               ast.parse(decanonicalized_code_from_hyp))
            assert transition_system.compare_ast(
                transition_system.surface_code_to_ast(
                    decanonicalized_code_from_hyp),
                transition_system.surface_code_to_ast(
                    example_json['snippet']))

            tgt_action_infos = get_action_infos(example_dict['intent_tokens'],
                                                tgt_actions)
        except (AssertionError, SyntaxError, ValueError, OverflowError) as e:
            # best-effort: record the id and reason, then move on
            skipped_list.append(example_json['question_id'])
            tqdm.write(
                f"Skipping example {example_json['question_id']} because of {type(e).__name__}:{e}"
            )
            continue
        example = Example(idx=f'{i}-{example_json["question_id"]}',
                          src_sent=example_dict['intent_tokens'],
                          tgt_actions=tgt_action_infos,
                          tgt_code=canonical_code,
                          tgt_ast=tgt_ast,
                          meta=dict(example_dict=example_json,
                                    slot_map=example_dict['slot_map']))
        assert evaluator.is_hyp_correct(example, hyp)

        examples.append(example)

        # Author: Gabe
        # Had to remove logging, when the log file would get too large, it would cause the
        # program to hang.
        # log!
        # f.write(f'Example: {example.idx}\n')
        # if 'rewritten_intent' in example.meta['example_dict']:
        #     f.write(f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n")
        # else:
        #     f.write(f"Original Utterance: {example.meta['example_dict']['intent']}\n")
        # f.write(f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
        # f.write(f"\n")
        # f.write(f"Utterance: {' '.join(example.src_sent)}\n")
        # f.write(f"Snippet: {example.tgt_code}\n")
        # f.write(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    f.close()
    print('Skipped due to exceptions: %d' % len(skipped_list), file=sys.stderr)
    return examples
def parse_natural_dataset(asdl_file_path, max_query_len=70, vocab_freq_cutoff=10):
    """Parse the Natural datagen corpora into train/dev/test Examples + vocab.

    Gathers (input, output) pairs from every ``datagen/<folder>`` directory,
    shuffles them with a fixed seed, canonicalizes and converts each to an
    ASDL AST with its oracle action sequence, then splits 90/5/5 and builds
    the source/primitive/code vocabularies from the training portion.

    Args:
        asdl_file_path: path to the ASDL grammar text.
        max_query_len: source queries are truncated to this many tokens.
        vocab_freq_cutoff: minimum frequency for vocabulary entries.

    Returns:
        ``((train_examples, dev_examples, test_examples), vocab)``.
    """
    asdl_text = open(asdl_file_path).read()
    print('building grammar')
    grammar = ASDLGrammar.from_text(asdl_text)
    transition_system = Python3TransitionSystem(grammar)

    loaded_examples = []
    annotations = []
    codes = []

    # collect parallel inputs/outputs from each datagen folder
    path = os.path.join(os.path.dirname(__file__), "datagen")
    datagens = os.listdir(path)
    for folder in datagens:
        if "__" in folder or not os.path.isdir(os.path.join(path, folder)):
            continue
        with open(os.path.join(path, folder, "inputs.txt"), 'r') as file:
            annotations += file.read().split('\n')
        with open(os.path.join(path, folder, "outputs.txt"), 'r') as file:
            codes += file.read().split('\n')

    annotation_codes = list(zip(annotations, codes))
    # fixed seed so the shuffle (and hence the split) is reproducible
    np.random.seed(42)
    np.random.shuffle(annotation_codes)

    from components.vocab import Vocab, VocabEntry
    from components.dataset import Example

    print('processing examples')
    for idx, (src_query, tgt_code) in enumerate(annotation_codes):
        if (idx % 100 == 0):
            sys.stdout.write("\r%s / %s" % (idx, len(annotation_codes)))
            sys.stdout.flush()
        src_query = src_query.strip()
        tgt_code = tgt_code.strip()

        src_query_tokens, tgt_canonical_code, str_map = Natural.canonicalize_example(
            src_query, tgt_code)
        python_ast = ast.parse(tgt_canonical_code)  # .body[0]
        gold_source = astor.to_source(python_ast).strip()
        tgt_ast = python_ast_to_asdl_ast(python_ast, transition_system.grammar)
        tgt_actions = transition_system.get_actions(tgt_ast)

        # sanity check: replay the oracle actions step by step
        hyp = Hypothesis()
        for t, action in enumerate(tgt_actions):
            assert action.__class__ in transition_system.get_valid_continuation_types(
                hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(
                    hyp)
            # p_t / f_t only feed an optional debug print; kept for parity
            p_t = -1
            f_t = None
            if hyp.frontier_node:
                p_t = hyp.frontier_node.created_time
                f_t = hyp.frontier_field.field.__repr__(plain=True)
            hyp = hyp.clone_and_apply_action(action)

        src_from_hyp = astor.to_source(
            asdl_ast_to_python_ast(hyp.tree, grammar)).strip()
        # bytes literals are known not to round-trip; skip the check for them
        if "b'" not in str(gold_source) and 'b"' not in str(gold_source):
            assert src_from_hyp == gold_source

        loaded_examples.append({
            'src_query_tokens': src_query_tokens,
            'tgt_canonical_code': gold_source,
            'tgt_ast': tgt_ast,
            'tgt_actions': tgt_actions,
            'raw_code': tgt_code,
            'str_map': str_map
        })

    train_examples = []
    dev_examples = []
    test_examples = []
    action_len = []

    # Fix: the split boundaries are loop-invariant; they were recomputed on
    # every iteration of the loop below. Hoisted here (values unchanged).
    total_examples = len(loaded_examples)
    split_size = np.ceil(total_examples * 0.05)
    (dev_split, test_split) = (total_examples - split_size * 2,
                               total_examples - split_size)

    print("\nsplitting train/dev/test")
    for idx, e in enumerate(loaded_examples):
        src_query_tokens = e['src_query_tokens'][:max_query_len]
        tgt_actions = e['tgt_actions']
        tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

        example = Example(idx=idx,
                          src_sent=src_query_tokens,
                          tgt_actions=tgt_action_infos,
                          tgt_code=e['tgt_canonical_code'],
                          tgt_ast=e['tgt_ast'],
                          meta={
                              'raw_code': e['raw_code'],
                              'str_map': e['str_map']
                          })
        action_len.append(len(tgt_action_infos))

        # 90/5/5 split by shuffled index
        if 0 <= idx < dev_split:
            train_examples.append(example)
        elif dev_split <= idx < test_split:
            dev_examples.append(example)
        else:
            test_examples.append(example)

    print('Max action len: %d' % max(action_len), file=sys.stderr)
    print('Avg action len: %d' % np.average(action_len), file=sys.stderr)
    print('Actions larger than 100: %d' %
          len(list(filter(lambda x: x > 100, action_len))),
          file=sys.stderr)

    src_vocab = VocabEntry.from_corpus(
        [e.src_sent for e in train_examples],
        size=5000,
        freq_cutoff=vocab_freq_cutoff)

    primitive_tokens = [
        map(
            lambda a: a.action.token,
            filter(lambda a: isinstance(a.action, GenTokenAction),
                   e.tgt_actions)) for e in train_examples
    ]
    primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                             size=5000,
                                             freq_cutoff=vocab_freq_cutoff)
    # assert '_STR:0_' in primitive_vocab

    # generate vocabulary for the code tokens!
    code_tokens = [
        tokenize_code(e.tgt_code, mode='decoder') for e in train_examples
    ]
    code_vocab = VocabEntry.from_corpus(code_tokens,
                                        size=5000,
                                        freq_cutoff=vocab_freq_cutoff)

    vocab = Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)
    print('generated vocabulary %s' % repr(vocab), file=sys.stderr)
    return (train_examples, dev_examples, test_examples), vocab