def do_stuff(data, input_text):
    """Build a per-character context record from pairwise similarity data.

    data: nested mapping where data[a][b]['similarity'] gives the similarity
        between the constraint sets of variables a and b (keys are variables
        whose input position is recovered via file_byte_index).
    input_text: the concrete input string; character i annotates record i.

    Returns (contexts, text_compartments) where contexts is a tuple of dicts
    (one per interior character, endpoints excluded) and text_compartments is
    currently always empty.

    Changes vs. original: removed dead commented-out experiments and unused
    locals (should_be_explored, more_interesting_than_pred,
    more_interesting_than_succ -- the latter was a copy-paste duplicate of the
    pred expression -- poi, compartment_end, cur_start); fixed the
    'seperator' local-name typo. Output is unchanged.
    """
    names = tuple(sorted([(file_byte_index(key), key) for key in data.keys()]))
    if len(names) == 0:
        # NOTE(review): this returns a single empty tuple while the normal
        # path returns a 2-tuple -- callers must handle both shapes; confirm.
        return tuple()
    # Recovered byte indices must be dense 0..n-1 (py2: range() is a list).
    assert [name[0] for name in names] == range(len(names))
    contexts = []
    # NOTE(review): never populated; returned empty -- TODO confirm intent.
    text_compartments = []
    # Endpoints are skipped: each record needs both a predecessor and a
    # successor variable.
    for i in range(1, len(names) - 1):
        before = names[i - 1][1]
        current = names[i][1]
        after = names[i + 1][1]
        pred_sim = data[before][current]['similarity']
        self_sim = data[current][current]['similarity']
        succ_sim = data[current][after]['similarity']
        # Strictly more self-similar than both neighbors.
        unique_checks = self_sim > max(pred_sim, succ_sim)
        # A separator is a unique peak with equal similarity on both sides.
        separator = succ_sim == pred_sim and unique_checks
        # A compartment starts where similarity rises toward the successor.
        compartment_start = pred_sim < succ_sim
        context = {'val': input_text[i:i+1],
                   'pred': pred_sim,
                   'self': self_sim,
                   'succ': succ_sim,
                   'compart_start': 1 if compartment_start else 0,
                   'sep': 1 if separator else 0,
                   }
        contexts.append(context)
    return tuple(contexts), tuple(text_compartments)
def dump_table(data, input_text): def print_entries(entries, fmt = '{:>4}'): print ', '.join([fmt.format(e) for e in entries]) print_entries([''] * 3 + ['{:02x}'.format(ord(c)) for c in input_text]) print_entries([''] * 3 + [repr(c) for c in input_text]) print for one in sorted(data.keys(), key=file_byte_index): entries = [] input_char = input_text[file_byte_index(one)] entries.append('{}'.format(repr(input_char))) entries.append('{:02x}'.format(ord(input_char))) entries.append('') for two in sorted(data[one].keys(), key=file_byte_index): val = data[one][two][0] entries.append(val) print_entries(entries)
def make_unique_compartment_constraint_sequence(var_constraints, start_inc, end_exc):
    """Collect constraint tuples for variables in [start_inc, end_exc),
    collapsing consecutive entries that look identical.

    Variables are visited in sorted key order. Two adjacent entries are
    treated as duplicates when they have equal length and every pairwise
    hash matches (hash comparison is used because the constraint objects
    are presumably symbolic ASTs without plain boolean equality -- confirm).
    Returns a tuple of constraint tuples.
    """
    result = []
    for var in sorted(var_constraints.keys()):
        idx = file_byte_index(var)
        # Outside the requested compartment window.
        if idx < start_inc or idx >= end_exc:
            continue
        cur = var_constraints[var]
        prev = result[-1] if result else None
        # With no previous entry there is nothing to dedupe against.
        if prev is not None and len(prev) == len(cur):
            differs = any(hash(a) != hash(b) for a, b in zip(prev, cur))
            if not differs:
                # Same run of constraints as the previous variable; skip.
                continue
        result.append(tuple(cur))
    return tuple(result)
def dump_compartments(data, input_text):
    """Derive compartment groupings from the pairwise similarity data.

    BUG FIX: `compartments` was initialized as a dict ({}) but used via
    .append(), which raised AttributeError on the first iteration; it is
    now a list, matching the .append()/set()/list() usage below.

    NOTE(review): `zipped` is never populated before being split, so every
    partial_compartments is empty -- this function looks unfinished (an
    append into `zipped` inside the inner loop appears to be missing).
    TODO confirm intended behavior before relying on the output.
    """
    names = tuple(sorted([(file_byte_index(key), key) for key in data.keys()]))
    if len(names) == 0:
        return tuple()
    # Recovered byte indices must be dense 0..n-1 (py2: range() is a list).
    assert [name[0] for name in names] == range(len(names))
    compartments = []
    for i_one, one in names:
        zipped = []
        for i_two, two in names:
            # Only consider pairs at or after the current variable.
            if i_two < i_one:
                continue
        # Split runs wherever similarity drops between neighbors.
        partial_compartments = tuple(split(lambda a, b: a[2] < b[2], zipped))
        compartments.append(tuple(partial_compartments))
    # Deduplicate the collected groupings.
    compartments = list(set(compartments))
    return compartments
def extract_input_constraint_similarities(path, input_text): constraints = defaultdict(set) vars = set() for c in path.guards: if c.is_true(): continue if not is_reasonable_constraint(c): print "IGNORING unreasonable constraint of depth {}".format( c.depth) continue print "HANDLING reasonable constraint of depth {}: {}".format( c.depth, c) vars.update(c.variables) for var in c.variables: mapping, dummied_constraints = dummy_out_vars((c, )) constraints[var].update(dummied_constraints) if len(vars) == 0: return constraints, [] var_names = [None] * len(input_text) for v in vars: var_names[file_byte_index(v)] = v #assert all(file_byte_index(vars[i]) == i for i in range(len(vars))) similarity = tuple( tuple( len(constraints[one].intersection(constraints[two])) for two in var_names) for one in var_names) if None in constraints: del constraints[None] return constraints, similarity
def extract_input_characteristics(proj, final_path, input_text,
                                  var_before_touch_state_map,
                                  string_classification_data):
    """For each input compartment, compute the alternative string options
    reachable when its constraints are relaxed.

    var_before_touch_state_map presumably maps a variable to a list of
    (run, guard, state) records ordered by run -- TODO confirm against the
    caller. Results are memoized per constraint sequence in other_opts and
    stamped back onto every compartment descriptor.

    Returns (var_constraints, similarities, comp_descriptors, other_opts).
    """
    var_constraints, similarities, comp_descriptors = extract_input_compartment_descriptors(
        final_path, input_text)
    comp_set = set([c['const_seq'] for c in comp_descriptors])
    # Show how many distinct constraint sequences the compartments share.
    print len(comp_set), '/', len(comp_descriptors)
    other_opts = {}
    for comp in comp_descriptors:
        # Memoized: an identical constraint sequence was already resolved
        # (None entries are retried -- they mark earlier failures).
        if comp['const_seq'] in other_opts and other_opts[
                comp['const_seq']] is not None:
            continue
        # Re-key the touch-state map by input byte index.
        # NOTE(review): loop-invariant; rebuilt on every iteration.
        var_idx_to_state_mapping = {
            file_byte_index(k): v
            for k, v in var_before_touch_state_map.iteritems()
        }
        #import ipdb; ipdb.set_trace()
        # Group (guard, state) pairs by run for all bytes in this compartment.
        by_runs = defaultdict(list)
        for var in range(comp['start'], comp['end']):
            if var in var_idx_to_state_mapping:
                for r, g, s in var_idx_to_state_mapping[var]:
                    by_runs[r].append((g, s))
        # First byte of the compartment that has any touch-state records.
        first_idx = -1
        for idx in range(comp['start'], comp['end']):
            if idx in var_idx_to_state_mapping:
                first_idx = idx
                break
        if first_idx == -1:
            other_opts[comp['const_seq']] = None
            continue  # We can't do anything with this compartment!
        interesting_guard_creators = var_idx_to_state_mapping[first_idx]
        # Start from the latest record and walk runs backwards below.
        run, guard, state = interesting_guard_creators[-1]
        removed_guards = []
        for r in sorted(by_runs.keys(), reverse=True):
            if r > run:
                continue
            still_matching = False
            for g, s in by_runs[r]:
                #if any(comp['start'] <= file_byte_index(v) < comp['end'] for v in g.variables):
                if any(comp['start'] == file_byte_index(v)
                       for v in g.variables):
                    # still constraining our input, we're good!
                    run, guard, state = r, g, s
                    still_matching = True
                    break
            # Stop at the first run that no longer constrains this input.
            if not still_matching:
                break
            removed_guards.append(guard)
        #import ipdb; ipdb.set_trace()
        #print "Finding other options when ignoring {}".format(removed_guards)
        other_string_options = extract_other_options(
            proj, state, comp, input_text, string_classification_data)
        other_opts[comp['const_seq']] = other_string_options
    # Stamp the memoized result onto every descriptor.
    for comp in comp_descriptors:
        comp['other_options'] = other_opts[comp['const_seq']]
    return var_constraints, similarities, comp_descriptors, other_opts
def is_interesting_guard(g):
    """Return True when guard g touches a byte inside the compartment.

    NOTE: reads `comp_descriptor` as a free (enclosing/global) name --
    this predicate only makes sense where that descriptor is in scope.
    """
    # Missing or trivially-true guards are never interesting.
    if g is None:
        return False
    if g.is_true():
        return False
    # Lookups stay inside the loop so nothing is evaluated when the guard
    # has no variables, matching the original lazy generator.
    for v in g.variables:
        if comp_descriptor['start'] <= file_byte_index(v) < comp_descriptor['end']:
            return True
    return False
def get_chr_idx_to_constraints_mapping(guards):
    """Group guards by the input-byte index of each variable they mention.

    A guard referencing several variables appears once per variable.
    Returns a defaultdict(list) mapping byte index -> list of guards.
    """
    mapping = defaultdict(list)
    for guard in guards:
        for v in guard.variables:
            idx = file_byte_index(v)
            mapping[idx].append(guard)
    return mapping