import os
import pickle as pkl
from collections import OrderedDict
from typing import Any, Callable, Dict, List, Tuple

# NOTE: module-internal helpers used below (round_trip, tokenize, filter_space,
# parse_chunk, search_structured_groups, Judge, Multipq, etc.) are assumed to
# be imported or defined elsewhere in this module.


def parse_gen(code, grammar):
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    cur_idx = 0
    # drop a trailing semicolon so the grammar atoms only see the statement body
    if tokens[-1].value == ';':
        tokens = tokens[:-1]
    for atom in grammar:
        if cur_idx >= len(tokens):
            next_idx, content = None, None
        else:
            next_idx, content = atom.f(cur_idx, tokens)
        if next_idx is None and not atom.optional:
            raise parse_single_line_exception(
                'Error parsing atom %s, which is required' % atom.name)
        result[atom.name] = content
        if next_idx is not None:
            cur_idx = next_idx
    # a line opens a new scope if it has a statement slot left unfilled
    # (e.g. "if (x)" with the body on subsequent lines)
    if 'stmt' in result:
        result['new_scope'] = result['stmt'] is None
    else:
        result['new_scope'] = False
    return result
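# parse_gen drives a small atom-based grammar: each atom exposes a name, an
# `optional` flag, and a matcher f(cur_idx, tokens) that returns
# (next_idx, content) on success or (None, None) on failure. The real atom
# class lives elsewhere in this codebase; the sketch below (hypothetical
# name) only illustrates the interface parse_gen relies on.
class AtomSketch:
    def __init__(self, name, f, optional=False):
        self.name = name          # key under which the matched content is stored
        self.f = f                # matcher: (cur_idx, tokens) -> (next_idx, content)
        self.optional = optional  # if False, a failed match raises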
def pseudo_compile_check(code, indent, search_opt):
    code_by_line = code.strip().split('\n')
    program_length = len(code_by_line)
    gold_sents_l = [[round_trip(c)] for c in code_by_line]
    gold_scores_l = [[0] for _ in range(program_length)]
    gold_groups = search_structured_groups(gold_sents_l, gold_scores_l,
                                           search_opt, indent)['groups']
    return gold_groups is not None
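# pseudo_compile_check mirrors the gold-program check inside search() below:
# a program satisfies the constraint iff search_structured_groups can build
# groups for it. A hypothetical call, assuming `program_str`, `indent`, and
# `search_opt` are already in scope:
#
#     gold_ok = pseudo_compile_check(program_str, indent, search_opt)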
def return_result_for_line_marker(code):
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = result['end_w_open_curly']
    result['line_type'] = 'marker'
    return result
def get_cores(line_code):
    line_code = round_trip(line_code)
    try:
        # a function header or prototype contributes no statement cores
        tokens = tokenize(line_code)
        result = parse_func_header(tokens)
        return []
    except ParseChunkError:
        # not a function header: decompose it as an ordinary statement line
        result = decompose_line(line_code)
        cores = []
        for key in result:
            if 'stmt' in key:
                stmt, depth = result[key]
                stmt = tokenize(stmt)
                segment_info = parse_chunk(stmt)
                cores += segment_info['nodes']
        return cores
def return_result_for_trivial_code(code):
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = result['end_w_open_curly']
    if code == '{':
        result['line_type'] = 'open_curly_only'
    elif code == '}':
        result['line_type'] = 'close_curly_only'
    elif code == '':
        result['line_type'] = 'empty'
    elif code == ';':
        result['line_type'] = 'line'
    return result
def parse_dowhile(code):
    # split "do ... while (...)" at the 'while' keyword and parse each half
    while_idx = code.index('while')
    do_part, while_part = code[:while_idx], code[while_idx:]
    do_result = parse_doline(do_part)
    while_result = parse_whileline(while_part)
    result = OrderedDict()
    for key in do_result:
        result[key] = do_result[key]
    for key in while_result:
        result[key] = while_result[key]
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = False
    return result
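# The do/while split above is purely textual. For a hypothetical input
# "do { cnt++; } while (cnt < n);", code.index('while') points at the
# 'while' keyword, so do_part == 'do { cnt++; } ' and
# while_part == 'while (cnt < n);'; the two partial parses are then merged
# into one OrderedDict, with while_result overwriting any shared keys.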
def decompose_line(code):
    if not line_well_formed_brace(code):
        return None
    code = round_trip(code)
    if code in trivial_code:
        return return_result_for_trivial_code(code)
    if code[-1] == ':':
        return return_result_for_line_marker(code)
    result = None
    has_do, has_while, has_if, has_for, has_else = [
        has_key_word(code, kword)
        for kword in ['do', 'while', 'if', 'for', 'else']
    ]
    if has_if:
        result = parse_ifline(code)
        result['line_type'] = 'if' if not has_else else 'else if'
    elif has_else:
        result = parse_elseline(code)
        result['line_type'] = 'else'
    elif has_while and not has_do:
        result = parse_whileline(code)
        result['line_type'] = 'while'
    elif has_for:
        result = parse_forline(code)
        result['line_type'] = 'for'
    elif has_do and not has_while:
        result = parse_doline(code)
        result['line_type'] = 'do'
    elif has_do and has_while:
        result = parse_dowhile(code)
        result['line_type'] = 'dowhile'
    if result is not None:
        result = post_process_decomposition(result, 1)
    else:
        # no control-flow keyword: treat it as a simple statement line
        result = parse_simpleline(code)
        result = post_process_decomposition(result, 0)
        result['line_type'] = 'line'
    if result is not None and debug:
        # debug hook: print the decomposed statements and wait for a keypress
        for key in result:
            if 'stmt' in key:
                print(key, result[key])
        input()
    return result
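# Illustration of the dispatch above (line_type values are the ones set in
# decompose_line; full result dicts come from the parse_* helpers, and '}'
# is assumed to be listed in trivial_code, as return_result_for_trivial_code
# suggests):
#
#     decompose_line('}')            -> line_type 'close_curly_only'
#     decompose_line('while (x) {')  -> line_type 'while'
#     decompose_line('x = y + 1;')   -> line_type 'line' (simple statement)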
def type_line(line_code):
    line_code = round_trip(line_code)
    atoms_declared, atoms_used, prototype = {}, {}, None
    forest = []
    try:
        try:
            # first try to parse the line as a function header or prototype
            tokens = tokenize(line_code, new_line=False)
            result = parse_func_header(tokens)
            line_type = 'prototype' if result['is_prototype'] else 'function'
            prototype = adddepth2func(result, atoms_declared)
            sw_close_curly, ew_open_curly = (start_w_close_curly(tokens),
                                             end_w_open_curly(tokens))
        except ParseChunkError:
            # otherwise decompose it as an ordinary statement line
            result = decompose_line(line_code)
            if result is None:
                return None
            line_type = result['line_type']
            sw_close_curly = result['start_w_close_curly']
            ew_open_curly = result['end_w_open_curly']
        for key in result:
            if 'stmt' in key:
                stmt, depth = result[key]
                stmt = tokenize(stmt, new_line=False)
                segment_info = parse_chunk(stmt)
                addvar_decl2line(segment_info, atoms_declared, depth)
                parse_var_used(segment_info['nodes'], atoms_used, depth)
                forest += segment_info['nodes']
        return {
            # used for considering scope
            'line_type': line_type,
            'start_w_close_curly': sw_close_curly,
            'end_w_open_curly': ew_open_curly,
            'line_complete': len(line_code) > 0 and line_code[-1] in ('}', ';'),
            'atoms_declared': atoms_declared,
            'atoms_used': atoms_used,
            'prototype': prototype,
            'forest': forest,
            'code': line_code
        }
    except ParseChunkError:
        return None
def obtain_gold_program(f_name):
    src_file_path = program_dir + f_name + '.cc'
    with open(src_file_path, 'r') as in_file:
        program_str = in_file.read()
    program_by_line = [round_trip(c) for c in program_str.split('\n')]
    return program_by_line
def search(translation_map: Callable[[str], Tuple[List[List[str]], List[List[float]]]],
           # see documentation above; generates code pieces and scores
           program_dict: Dict[str, Any],
           # a dictionary that contains the information needed for a program,
           # including pseudocode, indent, etc.
           result_dir: str,  # the directory to dump the results
           budget: int,  # budget B
           search_opt: str,  # the constraint we use for searching
           structure_beam_size: int = 50,  # beam width W for the search
           structure_topk: int = 20,  # the top K scaffolds we use for the search
           regular: bool = False  # whether to use hierarchical or regular beam search
           ):
    # load program information
    f_name, indent = program_dict['f_name'], program_dict['indent']
    program_length = len(indent)

    # evaluation requires running a lot of testcases and is time-consuming,
    # so we memoize all evaluation results and save them to disk
    memo_dir = '../spoc/eval_memo/' + f_name
    memo = {}
    if os.path.exists(memo_dir):
        memo = pkl.load(open(memo_dir, 'rb'))

    # the id of the problem (for testcases) is the substring after the 1st '-'
    pid = f_name.split('-')[1]

    # the paths where we dump the search results and statistics
    search_result_dir = result_dir + f_name + '.pkl'
    search_stats_result_dir = result_dir + f_name + '.stats'

    # if the result path already exists, return;
    # otherwise dump a lock to indicate that this process is working on it
    if os.path.exists(search_result_dir):
        return
    pkl.dump('working', open(search_result_dir, 'wb'))

    if verbose:
        print('searching for file %s.' % f_name)

    # check whether the gold program can pass the constraint
    gold_sents_l = [[round_trip(c)] for c in program_dict['program_by_line']]
    gold_scores_l = [[0] for _ in range(program_length)]
    gold_groups = search_structured_groups(gold_sents_l, gold_scores_l,
                                           search_opt, indent)['groups']
    gold_passed = gold_groups is not None

    # load the translation
    translations = translation_map(f_name)
    sents_l, scores_l = translations

    # search the scaffold
    if not regular:
        search_info = search_structured_groups(sents_l, scores_l, search_opt,
                                               indent,
                                               beam_size=structure_beam_size,
                                               top_k=structure_topk)
    else:
        search_info = search_structured_groups(sents_l, scores_l, search_opt,
                                               indent, beam_size=budget * 2,
                                               top_k=budget, use_code=True)

    # if regular beam search, groups is a list of full candidate programs;
    # if hierarchical beam search, groups is a list of scaffolds, where each
    # scaffold is a Tuple[List[List[str]], List[List[float]], float]: the
    # first two elements have the same type as `translations`, and the third
    # is the score of the scaffold. Every translation within the same
    # scaffold has the same configuration for each line.
    groups = search_info['groups']
    if groups is None:
        pkl.dump([], open(search_result_dir, 'wb'))
        return []

    # next_code returns an iterator that generates the next candidate
    if regular:
        def next_code():
            idx = 0
            while True:
                if idx < len(groups):
                    yield groups[idx]
                    idx += 1
                else:
                    yield None
    else:
        def next_code():
            mpq = Multipq(groups)
            while True:
                yield mpq.pop()

    cur_idx = 0
    return_val = []
    code_iter = next_code()
    while cur_idx < budget:
        code = next(code_iter)
        if code is None:
            break
        # check whether this piece of code has been evaluated before;
        # if yes, load the result directly and avoid recomputation
        if memo.get(code) is None:
            # if the braces do not match (e.g. more '{' than '}' in the
            # program), then reject directly
            if braces_acceptable(code):
                j = Judge(problem_id=pid, judge_type='all', eager=True,
                          judge_id=f_name + str(cur_idx))
                result = j.judge_program_str(code)
                return_val.append({'rank': cur_idx, 'code': code,
                                   'status': result['Status'],
                                   'gold_pass': gold_passed})
                memo[code] = result['Status']
            else:
                return_val.append({'rank': cur_idx, 'code': code,
                                   'status': 'braces rejected',
                                   'gold_pass': gold_passed})
        else:
            return_val.append({'rank': cur_idx, 'code': code,
                               'status': memo[code],
                               'gold_pass': gold_passed})
        if code in memo and memo[code] == 'Passed':
            break
        cur_idx += 1

    # dump the search results and the memo
    pkl.dump(return_val, open(search_result_dir, 'wb'))
    pkl.dump(search_info, open(search_stats_result_dir, 'wb'))
    pkl.dump(memo, open(memo_dir, 'wb'))
    return return_val
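# A minimal driver sketch for search(). All argument values here are
# hypothetical placeholders (in particular `load_translations`, `out_dir`,
# and the search_opt string); the real entry point of this codebase may wire
# these differently.
def run_one_program(program_dict, load_translations, out_dir):
    return search(translation_map=load_translations,
                  program_dict=program_dict,
                  result_dir=out_dir,
                  budget=100,              # budget B: candidates to judge
                  search_opt='default',    # hypothetical constraint name
                  structure_beam_size=50,  # beam width W
                  structure_topk=20,       # top K scaffolds
                  regular=False)           # hierarchical beam search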