def which_fix_goes_first(program, fix1, fix2):
    """Choose which of two fixes should be applied to `program` first.

    The line number of each fix is parsed after dropping the fix's first
    token. The decision is then made in this order:

    1. If only one fix has its ids present in the part of the program that
       starts at its own line (as reported by `fix_ids_are_in_program`),
       that fix is returned.
    2. If neither has, `CouldNotFindUsesForEitherException` is raised.
    3. Otherwise the fix with the smaller line number is returned.
    4. If both target the same line, the program is scanned from that line
       onwards and the fix whose identifier token appears first wins.

    Args:
        program: tokenized program string, split into lines via `get_lines`.
        fix1: first candidate fix (token string).
        fix2: second candidate fix (token string).

    Returns:
        Either `fix1` or `fix2`.

    Raises:
        CouldNotFindUsesForEitherException: neither fix has uses in the
            program (or, when asserts are stripped, the tie-break scan
            finds neither identifier).
        AssertionError: a fix carries more than one `_<id>_` token before
            its first `_<op>_[` token, both fixes carry the same id, or the
            tie-break scan finds neither identifier.
        Exception: whatever `extract_line_number` raises is propagated
            unchanged.
    """
    fix1_location = extract_line_number(' '.join(fix1.split()[1:]))
    fix2_location = extract_line_number(' '.join(fix2.split()[1:]))

    prog_lines = get_lines(program)

    # Each check is computed exactly once. The fix2 check runs first to keep
    # the evaluation order of the original chained conditions.
    fix2_has_uses = fix_ids_are_in_program(
        recompose_program(prog_lines[fix2_location:]), fix2)
    fix1_has_uses = fix_ids_are_in_program(
        recompose_program(prog_lines[fix1_location:]), fix1)

    if fix1_has_uses and not fix2_has_uses:
        return fix1
    if fix2_has_uses and not fix1_has_uses:
        return fix2
    if not fix1_has_uses and not fix2_has_uses:
        raise CouldNotFindUsesForEitherException

    if fix1_location < fix2_location:
        return fix1
    if fix2_location < fix1_location:
        return fix2

    def _single_id_token(fix):
        # Return the only `_<id>_` token seen before the first `_<op>_[`
        # token (or None when there is none).
        found = None
        for token in fix.split():
            if '_<id>_' in token:
                assert found is None, fix
                found = token
            elif token == '_<op>_[':
                break
        return found

    id_in_fix1 = _single_id_token(fix1)
    id_in_fix2 = _single_id_token(fix2)

    assert id_in_fix1 != id_in_fix2, fix1 + ' & ' + fix2
    assert fix1_location == fix2_location

    # Same target line: whichever identifier is used first decides.
    for i in range(fix1_location, len(prog_lines)):
        for token in prog_lines[i].split():
            if token == id_in_fix1:
                return fix1
            elif token == id_in_fix2:
                return fix2

    # Both fixes were reported as having uses, so the scan above is expected
    # to return. The raise is what remains when asserts are stripped (-O).
    assert False, 'unreachable code'
    raise CouldNotFindUsesForEitherException
def find_and_replace(self, org_prog, corrupted_prog, regex, replacement='', name='', include_kind=False):
    """Replace one randomly chosen match of `regex` in `corrupted_prog`.

    Args:
        org_prog: program string from which the fix line is isolated.
        corrupted_prog: program string that is searched and mutated.
        regex: pattern locating mutation candidates.
        replacement: text substituted for the chosen match.
        name: label returned as the fourth element when `include_kind` is
            set and a match was found.
        include_kind: when true, a 4-tuple is returned instead of a 3-tuple.

    Returns:
        `(mutated_program, fix, line)` or, with `include_kind`,
        `(mutated_program, fix, line, name)`. When nothing matches, the
        program is returned unchanged and every other element is `None`.
    """
    # For the pointer mutation only capture group 1 is replaced; every other
    # pattern replaces the whole match.
    pointer_mutation = regex == '[^)@,#\]] (_<op>_\*)(?! _<number>_)'

    spans = []
    for match in re.finditer(regex, corrupted_prog):
        if pointer_mutation:
            spans.append(match.span(1))
        else:
            spans.append(match.span())

    if not spans:
        if include_kind:
            return corrupted_prog, None, None, None
        return corrupted_prog, None, None

    # The random generator is consulted only when there is a real choice.
    if len(spans) > 1:
        chosen = np.random.randint(len(spans))
    else:
        chosen = 0

    start = spans[chosen][0]
    end = spans[chosen][1]

    mutated = corrupted_prog[:start] + replacement + corrupted_prog[end:]
    fix = isolate_line(org_prog, start)
    line = extract_line_number(fix)

    if include_kind:
        return mutated, fix, line, name
    return mutated, fix, line
def meets_criterion(incorrect_program_tokens, fix, name_dict, type_, name_seq=None, silent=True):
    """Check whether `fix` is an acceptable edit for the line it targets.

    A fix is rejected when it is a stop signal, when no line number can be
    extracted from it, when the line number is past the end of the program,
    when the number of `_<id>_` tokens differs between the targeted line and
    the fix, or (for `type_ == 'replace'` only) when the sequence of
    keyword/type/APIcall/include tokens differs.

    Args:
        incorrect_program_tokens: tokenized program string.
        fix: candidate fix; truncated with `_truncate_fix` before use.
        name_dict: accepted but not used here.
        type_: fix kind; the keyword comparison runs only for 'replace'.
        name_seq: accepted but not used here.
        silent: when false, the reason for a token-based rejection is
            printed.

    Returns:
        True when the fix passes every check, False otherwise.
    """
    program_lines = get_lines(incorrect_program_tokens)
    fix = _truncate_fix(fix)

    if _is_stop_signal(fix):
        return False

    try:
        target_index = extract_line_number(fix)
    except Exception:
        return False

    if target_index >= len(program_lines):
        return False

    target_line = program_lines[target_index]

    # The fix must mention exactly as many identifiers as the line it edits.
    ids_in_line = re.findall('_<id>_\w*', target_line)
    ids_in_fix = re.findall('_<id>_\w*', fix)
    if len(ids_in_line) != len(ids_in_fix):
        if not silent:
            print('number of ids is not the same')
        return False

    keywords_regex = '_<keyword>_\w+|_<type>_\w+|_<APIcall>_\w+|_<include>_\w+'

    if type_ == 'replace':
        if re.findall(keywords_regex, target_line) != re.findall(keywords_regex, fix):
            if not silent:
                print('important words (keywords, etc.) change drastically')
            return False

    return True
def meets_criterion(incorrect_program_tokens, fix, type_, silent=True):
    """Decide whether `fix` may be applied to the line it points at.

    Args:
        incorrect_program_tokens: tokenized program string.
        fix: candidate fix; truncated with `_truncate_fix` before use.
        type_: fix kind; keyword-like tokens are compared only for
            'replace'.
        silent: when false, the reason for a token-based rejection is
            printed.

    Returns:
        False if the fix is a stop signal, has no extractable line number,
        points past the last line, changes the number of `_<id>_` tokens,
        or (for 'replace') changes the keyword/type/APIcall/include tokens;
        True otherwise.
    """
    all_lines = get_lines(incorrect_program_tokens)
    fix = _truncate_fix(fix)

    if _is_stop_signal(fix):
        return False

    try:
        line_index = extract_line_number(fix)
    except FailedToGetLineNumberException:
        return False

    if not line_index < len(all_lines):
        return False

    line_under_repair = all_lines[line_index]

    # Identifier counts must agree between the original line and the fix.
    id_count_before = len(re.findall('_<id>_\w*', line_under_repair))
    id_count_after = len(re.findall('_<id>_\w*', fix))
    if id_count_before != id_count_after:
        if not silent:
            print('number of ids is not the same')
        return False

    keywords_regex = '_<keyword>_\w+|_<type>_\w+|_<APIcall>_\w+|_<include>_\w+'

    # Replacements must keep the same keyword-like tokens in the same order.
    keywords_changed = (
        type_ == 'replace'
        and re.findall(keywords_regex, line_under_repair) != re.findall(keywords_regex, fix)
    )
    if keywords_changed:
        if not silent:
            print('important words (keywords, etc.) change drastically')
        return False

    return True
def apply_fix(program, fix, kind='replace', check_literals=False):
    """Apply a single fix to a tokenized program and return the new program.

    Args:
        program: tokenized program string; split into lines by `get_lines`.
        fix: fix token string; it is truncated with `_truncate_fix` and must
            then contain exactly one '~' separator.
        kind: 'replace' substitutes the targeted line, 'insert' adds the fix
            as a new line directly after the targeted line.
        check_literals: for 'replace', additionally require that the number
            of string/char/number literal markers is unchanged.

    Returns:
        The program rebuilt by `recompose_program` from the edited lines.

    Raises:
        InvalidFixLocationException: the fix does not split into two parts
            around '~', its line number cannot be extracted, or (for
            'replace') the line number is outside the program.
        SubstitutionFailedException: the replacement changes the number of
            `_<id>_` markers (or literal markers, with `check_literals`).
    """
    lines = get_lines(program)
    fix = _truncate_fix(fix)

    # Exactly one '~' must be present, giving two parts.
    if len(fix.split('~')) != 2:
        raise InvalidFixLocationException

    try:
        if kind == 'replace':
            fix_location = extract_line_number(fix)
        else:
            assert kind == 'insert'

            fix_tokens = fix.split()
            if fix_tokens[0] != '_<insertion>_':
                print("Warning: First token did not suggest insertion (should not happen)")

            # Drop only the leading marker token and keep the rest of the
            # fix. Indexing with [1] instead of [1:] would pass a single
            # token, joined character by character, to the parser.
            fix_location = extract_line_number(' '.join(fix_tokens[1:]))
    except FailedToGetLineNumberException:
        raise InvalidFixLocationException

    fix = _remove_line_number(fix)

    if kind == 'replace':
        try:
            original_line = lines[fix_location]

            if original_line.count('_<id>_') != fix.count('_<id>_'):
                raise SubstitutionFailedException

            if check_literals:
                for lit in ('string', 'char', 'number'):
                    # Count the same marker on both sides of the comparison.
                    marker = '_<%s>_' % lit
                    if original_line.count(marker) != fix.count(marker):
                        raise SubstitutionFailedException

            lines[fix_location] = replace_ids(fix, original_line)
        except IndexError:
            raise InvalidFixLocationException
    else:
        assert kind == 'insert'
        lines.insert(fix_location + 1, fix)

    return recompose_program(lines)
def find_and_replace(org_prog, corrupted_prog, regex, replacement, extra_ids, last_id):
    """Replace one randomly chosen match of `regex` in `corrupted_prog`.

    Args:
        org_prog: program string used to derive the fix and its line.
        corrupted_prog: program string that is searched and mutated.
        regex: pattern locating mutation candidates.
        replacement: text substituted for the chosen match.
        extra_ids: passed through to the result unchanged.
        last_id: passed through to the result unchanged.

    Returns:
        `(mutated_program, fix, line, extra_ids, last_id)`. When nothing
        matches, the program is unchanged and `fix` and `line` are `None`.
    """
    spans = [match.span() for match in re.finditer(regex, corrupted_prog)]

    if not spans:
        return corrupted_prog, None, None, extra_ids, last_id

    # Only draw a random index when there is more than one candidate.
    pick = 0 if len(spans) == 1 else np.random.randint(len(spans))
    start, end = spans[pick][0], spans[pick][1]

    mutated = corrupted_prog[:start] + replacement + corrupted_prog[end:]
    fix = isolate_line(org_prog, start)
    line = extract_line_number(org_prog, start)

    return mutated, fix, line, extra_ids, last_id
def find_and_replace(self, corrupted_prog, regex, replacement, mutation_name):
    """Replace one match of `regex` in `corrupted_prog`, chosen via `self.rng`.

    Args:
        corrupted_prog: program string that is searched and mutated.
        regex: pattern locating mutation candidates.
        replacement: text substituted for the chosen match.
        mutation_name: label passed through to the result.

    Returns:
        `(program, line_number, mutation_name)`. `line_number` is taken from
        the line containing the match before the replacement is made; it is
        `None` (and the program unchanged) when nothing matches.
    """
    spans = [match.span() for match in re.finditer(regex, corrupted_prog)]

    if not spans:
        return corrupted_prog, None, mutation_name

    # `self.rng` is consulted only when there is more than one candidate.
    if len(spans) == 1:
        pick = 0
    else:
        pick = self.rng.randint(len(spans))

    start = spans[pick][0]
    end = spans[pick][1]

    # The line is looked up in the program as it is *before* this mutation.
    affected_line = isolate_line(corrupted_prog, start)
    line_number = extract_line_number(affected_line)

    mutated = corrupted_prog[:start] + replacement + corrupted_prog[end:]
    return mutated, line_number, mutation_name
def find_and_replace(self, org_prog, corrupted_prog, regex, replacement, mutation_name):
    """Replace one match of `regex` in `corrupted_prog`, chosen via `self.rng`.

    Args:
        org_prog: accepted but not read by this function.
        corrupted_prog: program string that is searched and mutated.
        regex: pattern locating mutation candidates.
        replacement: text substituted for the chosen match.
        mutation_name: label passed through to the result.

    Returns:
        `(program, line_number, mutation_name)`. `line_number` is taken from
        the line containing the match before the replacement is made; it is
        `None` (and the program unchanged) when nothing matches.
    """
    # For the pointer mutation only capture group 1 is replaced; every other
    # pattern replaces the whole match.
    group = 1 if regex == '[^)@,#\]] (_<op>_\*)(?! _<number>_)' else 0
    spans = [match.span(group) for match in re.finditer(regex, corrupted_prog)]

    if not spans:
        return corrupted_prog, None, mutation_name

    # `self.rng` is consulted only when there is more than one candidate.
    pick = self.rng.randint(len(spans)) if len(spans) > 1 else 0
    start, end = spans[pick][0], spans[pick][1]

    # The line is looked up in the program as it is *before* this mutation.
    line_number = extract_line_number(isolate_line(corrupted_prog, start))

    mutated = corrupted_prog[:start] + replacement + corrupted_prog[end:]
    return mutated, line_number, mutation_name
def process(self, source_code_array, max_attempts=6):
    """Iteratively repair every program in `source_code_array`.

    Each program is tokenized once. Then, for at most `max_attempts`
    rounds, the latest token sequence of every still-active program is
    vectorized, passed through `self.network`, and the predicted fix is
    applied. A program leaves the active set when vectorization fails, when
    applying the fix fails, or when the fix increases the number of
    compilation errors.

    Args:
        source_code_array: sequence of source-code strings.
        max_attempts: maximum number of repair rounds.

    Returns:
        A tuple `(fixes_to_return, repaired_programs, error_messages)` of
        dicts keyed by the position of the program in `source_code_array`:
        * fixes_to_return: list of '<kind> at line <n>: <source>' strings.
        * repaired_programs: stripped source regenerated from the latest
          token sequence of that program.
        * error_messages: list of status strings recorded for the program
          ('VectorizationFailed', 'ErrorsIncreased', 'IndexError',
          'InvalidFixLocation', 'SubstitutionFailed', or 'OK' when the
          network emitted `_eos_`).
    """
    sequences_of_programs = {}
    # Per-program tokenizer output needed to turn tokens back into source.
    # Keeping it per index ensures each program is detokenized with its own
    # name dictionary and literal sequence, not those of another program.
    tokenizer_state = {}
    entries = []
    errors = {}
    fixes_to_return = {}
    error_messages = {}

    for idx, source_code in enumerate(source_code_array):
        program, name_dict, name_sequence, literal_sequence = C_Tokenizer().tokenize(source_code)
        entries.append((idx, program, name_dict, name_sequence, literal_sequence))
        tokenizer_state[idx] = (name_dict, literal_sequence)
        sequences_of_programs[idx] = [program]
        errors[idx], _ = compilation_errors(source_code)
        error_messages[idx] = []
        fixes_to_return[idx] = []

    network = self.network

    if self.task == 'ids':
        normalize_names = False
        fix_kind = 'insert'
    else:
        assert self.task == 'typo'
        normalize_names = True
        fix_kind = 'replace'

    try:
        for round_ in range(max_attempts):
            to_delete = []
            input_ = []

            # Vectorize the latest version of every active program.
            for i, entry in enumerate(entries):
                idx = entry[0]

                try:
                    program_vector = vectorize(sequences_of_programs[idx][-1], network['in_seq_length'], network['dictionary'],
                                               normalize_names=normalize_names, reverse=True, append_eos=False)
                except VectorizationFailedException:
                    program_vector = None

                if program_vector is not None:
                    input_.append(program_vector)
                else:
                    to_delete.append(i)
                    error_messages[idx].append('VectorizationFailed')

            # Delete from the back so earlier indices stay valid.
            for i in sorted(to_delete, reverse=True):
                del entries[i]

            assert len(input_) == len(entries)

            if len(input_) == 0:
                break

            # Pass the batch through the network and devectorize the output.
            fix_vectors = get_fixes(network['session'], input_, network)
            fixes = [devectorize(fix_vector, network['dictionary']) for fix_vector in fix_vectors]

            to_delete = []

            # Apply fixes
            for i, (entry, fix) in enumerate(zip(entries, fixes)):
                idx, program, name_dict, name_sequence, literal_sequence = entry

                try:
                    program = sequences_of_programs[idx][-1]
                    program = apply_fix(program, fix, kind=fix_kind, check_literals=True)
                    sequences_of_programs[idx].append(program)

                    regen_source_code = tokens_to_source(program, name_dict, clang_format=True, literal_seq=literal_sequence)
                    this_errors, _ = compilation_errors(regen_source_code)

                    if len(fix.strip().split()) > 0 and len(this_errors) > len(errors[idx]):
                        to_delete.append(i)
                        error_messages[idx].append('ErrorsIncreased')
                    else:
                        errors[idx] = this_errors
                except IndexError:
                    to_delete.append(i)
                    error_messages[idx].append('IndexError')
                except VectorizationFailedException:
                    to_delete.append(i)
                    error_messages[idx].append('VectorizationFailed')
                except InvalidFixLocationException:
                    to_delete.append(i)

                    # Guard against an empty fix: indexing [0] on an empty
                    # token list would raise IndexError out of this handler.
                    fix_tokens = fix.strip().split()
                    if fix_tokens and fix_tokens[0] == '_eos_':
                        error_messages[idx].append('OK')
                    else:
                        error_messages[idx].append('InvalidFixLocation')
                except SubstitutionFailedException:
                    to_delete.append(i)
                    error_messages[idx].append('SubstitutionFailed')
                else:
                    assert len(fix.strip().split()) == 0 or fix.strip().split()[0] != '_eos_'

                    if fix_kind == 'insert':
                        # Drop the first token before reading the line number.
                        fix_ = ' '.join(fix.split()[1:])
                        fix_line = extract_line_number(fix_) + 1
                        fixes_to_return[idx].append('%s at line %d: %s' % (fix_kind, fix_line, ''.join(
                            fix_to_source(fix_, program, name_dict, clang_format=True).split('\n'))))
                    else:
                        fix_line = extract_line_number(fix) + 1
                        fixes_to_return[idx].append('%s at line %d: %s' % (fix_kind, fix_line, ''.join(
                            fix_to_source(fix, program, name_dict, name_seq=name_sequence,
                                          literal_seq=literal_sequence, clang_format=True).split('\n'))))

            # Delete from the back so earlier indices stay valid.
            for i in sorted(to_delete, reverse=True):
                del entries[i]
    except (KeyError, InvalidFixLocationException, SubstitutionFailedException):
        # Best effort: these errors end the repair loop early, and whatever
        # was repaired so far is still returned below.
        pass

    repaired_programs = {}

    for idx in sequences_of_programs:
        name_dict, literal_sequence = tokenizer_state[idx]
        repaired_programs[idx] = tokens_to_source(sequences_of_programs[idx][-1], name_dict, clang_format=True,
                                                  literal_seq=literal_sequence).strip()

    return fixes_to_return, repaired_programs, error_messages
def apply_fix(program, fix, kind='replace', check_literals=False):
    """Apply a single fix to a tokenized program and return the new program.

    Compared with the earlier debugging version of this function, the
    tracing output has been removed and `check_literals` is honoured again
    instead of being forced to False.

    Args:
        program: tokenized program string; split into lines by `get_lines`.
        fix: fix token string; it is truncated with `_truncate_fix` and must
            then contain exactly one '~' separator.
        kind: 'replace' substitutes the targeted line, 'insert' adds the fix
            as a new line directly after the targeted line.
        check_literals: for 'replace', additionally require that the number
            of string/char/number literal markers is unchanged.

    Returns:
        The program rebuilt by `recompose_program` from the edited lines.

    Raises:
        InvalidFixLocationException: the fix does not split into two parts
            around '~', its line number cannot be extracted, or (for
            'replace') the line number is outside the program.
        SubstitutionFailedException: the replacement changes the number of
            `_<id>_` markers (or literal markers, with `check_literals`).
    """
    lines = get_lines(program)
    fix = _truncate_fix(fix)

    # Exactly one '~' must be present, giving two parts.
    if len(fix.split('~')) != 2:
        raise InvalidFixLocationException

    # Retrieve the target line number.
    try:
        if kind == 'replace':
            fix_location = extract_line_number(fix)
        else:
            assert kind == 'insert'

            tokens = fix.split()
            if tokens[0] != '_<insertion>_':
                print("Warning: First token did not suggest insertion (should not happen)")

            # Drop only the leading marker token and keep the rest of the
            # fix. Indexing with [1] instead of [1:] would pass a single
            # token, joined character by character, to the parser.
            fix_location = extract_line_number(' '.join(tokens[1:]))
    except FailedToGetLineNumberException:
        raise InvalidFixLocationException

    fix = _remove_line_number(fix)

    if kind == 'replace':
        try:
            current_line = lines[fix_location]

            if current_line.count('_<id>_') != fix.count('_<id>_'):
                raise SubstitutionFailedException

            if check_literals:
                for lit in ('string', 'char', 'number'):
                    # Count the same marker on both sides of the comparison.
                    marker = '_<%s>_' % lit
                    if current_line.count(marker) != fix.count(marker):
                        raise SubstitutionFailedException

            lines[fix_location] = replace_ids(fix, current_line)
        except IndexError:
            raise InvalidFixLocationException
    else:
        assert kind == 'insert'
        lines.insert(fix_location + 1, fix)

    return recompose_program(lines)