def test_vector_hash():
    """An extended fragment built from two mapped node pairs hashes to a known value."""
    cg = ChangeGraph()
    call_before = ChangeNode(None, None, 'M1', ChangeNode.Kind.OPERATION_NODE, 0,
                             sub_kind=ChangeNode.SubKind.OP_FUNC_CALL)
    assign_before = ChangeNode(None, None, '=', ChangeNode.Kind.OPERATION_NODE, 0,
                               sub_kind=ChangeNode.SubKind.OP_ASSIGNMENT)
    call_after = ChangeNode(None, None, 'M2', ChangeNode.Kind.OPERATION_NODE, 1,
                            sub_kind=ChangeNode.SubKind.OP_FUNC_CALL)
    assign_after = ChangeNode(None, None, '=', ChangeNode.Kind.OPERATION_NODE, 1,
                              sub_kind=ChangeNode.SubKind.OP_ASSIGNMENT)

    # Map the before/after versions of each node, then link call -> assignment.
    ChangeEdge.create(LinkType.MAP, call_before, call_after)
    ChangeEdge.create(LinkType.MAP, assign_before, assign_after)
    ChangeEdge.create(LinkType.PARAMETER, call_before, assign_before)
    ChangeEdge.create(LinkType.PARAMETER, call_after, assign_after)
    cg.nodes.update([call_before, assign_before, call_after, assign_after])

    fr = Fragment.create_from_node_pair([call_before, call_after])
    ext_fr = Fragment.create_extended(fr, ext_nodes=(assign_before, assign_after))
    vector_hash = ext_fr.vector.get_hash()
    assert vector_hash == normalize(27320942899360)
def _mine(self, graphs):
    """Mine change patterns from the given change graphs.

    First groups mapped before-version function-call nodes by the
    ``before~after`` label pair, then turns each sufficiently frequent
    group into fragments and tries to extend them into patterns.
    Patterns are filtered afterwards, and (optionally) fragments that
    overlap within the same graph are dropped from each pattern.
    """
    # TODO: delete assign nodes?
    # TODO: collapse literals?
    label_to_node_pairs = {}
    for graph in graphs:
        # Optionally skip graphs from commits older than the configured cutoff.
        if self.MIN_DATE and graph.repo_info.commit_dtm < self.MIN_DATE:
            continue
        for node in graph.nodes:
            if node.version != ChangeNode.Version.BEFORE_CHANGES or not node.mapped:
                continue
            # Only mapped function-call operation nodes seed patterns.
            if not (node.kind == ChangeNode.Kind.OPERATION_NODE
                    and node.sub_kind == ChangeNode.SubKind.OP_FUNC_CALL):
                continue
            label = f'{node.label}~{node.mapped.label}'
            label_to_node_pairs.setdefault(label, []).append((node, node.mapped))
    logger.warning(f'Total pairs after the first step = {len(label_to_node_pairs)}')

    for num, pairs in enumerate(label_to_node_pairs.values()):
        logger.warning(f'Looking at node pair #{num + 1}')
        if len(pairs) < Pattern.MIN_FREQUENCY:
            logger.warning('Skipping...')
            continue

        fragments = {Fragment.create_from_node_pair(pair) for pair in pairs}
        pattern = Pattern(fragments, len(fragments))
        pattern = pattern.extend()

        if pattern.is_change() and pattern.size >= self.MIN_PATTERN_SIZE:
            self.add_pattern(pattern)
            logger.warning(f'Pattern #{pattern.id} with size {pattern.size} was added')
        logger.warning(f'Done looking at node pair #{num + 1}')
    logger.warning(f'Done patterns\' mining, total count = {self._patterns_cnt}')

    self._filter_patterns()
    logger.warning(f'Done filtering, total count = {self._patterns_cnt}')

    if self.HIDE_OVERLAPPED_FRAGMENTS:
        logger.info('Removing overlapped fragments from patterns')
        for patterns in self._size_to_patterns.values():
            for pattern in patterns:
                overlapped_fragments = Pattern.get_graph_overlapped_fragments(pattern.fragments)
                for fragment in overlapped_fragments:
                    pattern.fragments.remove(fragment)
        logger.info('Done removing overlapped fragments from patterns')
def _get_freq_group(fr):
    """Return the largest group of fragments obtained by extending `fr`.

    Every possible extension is applied per label, the resulting extended
    fragments are clustered with ``Fragment.create_groups``, and the most
    populous group across all labels wins (``None`` if nothing extends).
    """
    best_group = None
    best_freq = 0
    for ext_list in fr.get_label_to_ext_list().values():
        extended = {Fragment.create_extended(fr, ext) for ext in ext_list}
        for num, group in enumerate(Fragment.create_groups(extended)):
            logger.log(logger.DEBUG, f'Elements in group #{num + 1} -> {len(group)}')
            if len(group) > best_freq:
                best_freq = len(group)
                best_group = group
    return best_group
def test_fragment_label_to_ext_list():
    """Build two mapped change subgraphs and group the extensions of a seed fragment."""
    kind = ChangeNode.Kind
    sub = ChangeNode.SubKind

    # Local factories keep the node zoo readable; behavior is identical to
    # spelling every ChangeNode(...) call out by hand.
    def call(label, version):
        return ChangeNode(None, None, label, kind.OPERATION_NODE, version,
                          sub_kind=sub.OP_FUNC_CALL)

    def assign(version):
        return ChangeNode(None, None, '=', kind.OPERATION_NODE, version,
                          sub_kind=sub.OP_ASSIGNMENT)

    def var(version, sub_kind=None):
        return ChangeNode(None, None, 'var', kind.DATA_NODE, version,
                          sub_kind=sub_kind or sub.DATA_VARIABLE_DECL)

    cg = ChangeGraph()

    # Before-version graph: getZoneByName/getSettings feed assignments and `update`.
    cn1 = call('getZoneByName', 0)
    cn2 = assign(0)
    cn3 = var(0)
    cn4 = call('getSettings', 0)
    cn5 = assign(0)
    cn6 = var(0)
    cn7 = call('update', 0)

    ChangeEdge.create(LinkType.PARAMETER, cn1, cn2)   # getZoneByName -para> =
    ChangeEdge.create(LinkType.DEFINITION, cn2, cn3)  # = -def> var
    ChangeEdge.create(LinkType.DEFINITION, cn1, cn3)  # getZoneByName -def> var
    ChangeEdge.create(LinkType.PARAMETER, cn4, cn5)   # getSettings -para> =
    ChangeEdge.create(LinkType.DEFINITION, cn4, cn6)  # getSettings -def> var
    ChangeEdge.create(LinkType.DEFINITION, cn5, cn6)  # = -def> var
    ChangeEdge.create(LinkType.PARAMETER, cn4, cn7)   # getSettings -para> update
    ChangeEdge.create(LinkType.PARAMETER, cn5, cn7)   # = -para> update
    ChangeEdge.create(LinkType.PARAMETER, cn6, cn7)   # var -para> update

    # After-version graph: get_fw_zone_settings feeds update_fw_settings.
    c2n1 = call('get_fw_zone_settings', 1)
    c2n2 = assign(1)
    c2n3 = var(1)
    c2n4 = var(1, sub_kind=sub.DATA_VARIABLE_USAGE)
    c2n5 = var(1)
    c2n6 = call('update_fw_settings', 1)

    ChangeEdge.create(LinkType.PARAMETER, c2n1, c2n2)   # get_fw_zone_settings -para> =
    ChangeEdge.create(LinkType.PARAMETER, c2n2, c2n6)   # = -para> update_fw_settings
    ChangeEdge.create(LinkType.DEFINITION, c2n2, c2n3)  # = -def> var
    ChangeEdge.create(LinkType.DEFINITION, c2n2, c2n5)  # = -def> var
    ChangeEdge.create(LinkType.REFERENCE, c2n3, c2n4)   # var -ref> var
    ChangeEdge.create(LinkType.PARAMETER, c2n3, c2n6)   # var -para> update_fw_settings
    ChangeEdge.create(LinkType.PARAMETER, c2n4, c2n6)   # var -para> update_fw_settings
    ChangeEdge.create(LinkType.PARAMETER, c2n5, c2n6)   # var -para> update_fw_settings

    # Cross-version mapping: call -> call, = -> =, update -> update_fw_settings.
    ChangeEdge.create(LinkType.MAP, cn1, c2n1)
    ChangeEdge.create(LinkType.MAP, cn2, c2n2)
    ChangeEdge.create(LinkType.MAP, cn7, c2n6)
    for before, after in ((cn1, c2n1), (cn2, c2n2), (cn7, c2n6)):
        before.mapped = after
        after.mapped = before

    cg.nodes.update([cn1, cn2, cn3, cn4, cn5, cn6, cn7])
    cg.nodes.update([c2n1, c2n2, c2n3, c2n4, c2n5, c2n6])

    fr = Fragment.create_from_node_pair([cn1, c2n1])
    group = _get_freq_group(fr)
    p = Pattern(group, freq=None)
    print(p.fragments)
def main():
    """CLI entry point: parse the run mode and dispatch to the matching pipeline.

    Modes (see ``RunModes.ALL``): build a flow graph for one file, build a
    change graph for a before/after file pair, collect change graphs from git
    history, or mine patterns (from explicit file pairs or from stored graphs).

    Raises:
        ValueError: if the given mode is unknown.
    """
    logger.info('------------------------------ Starting ------------------------------')

    # Optional profiling agent, enabled via settings.
    if settings.get('use_stackimpact', required=False):
        _ = stackimpact.start(
            agent_key=settings.get('stackimpact_agent_key'),
            app_name='CodeChangesMiner',
            debug=True,
            app_version=str(datetime.datetime.now())
        )

    # Graph traversals recurse deeply; 'spawn' keeps workers free of inherited state.
    sys.setrecursionlimit(2**31-1)
    multiprocessing.set_start_method('spawn', force=True)

    parser = argparse.ArgumentParser()
    parser.add_argument('mode', help=f'One of {RunModes.ALL}', type=str)
    args, _ = parser.parse_known_args()
    current_mode = args.mode

    if current_mode == RunModes.BUILD_PY_FLOW_GRAPH:
        _run_build_py_flow_graph(parser)
    elif current_mode == RunModes.BUILD_CHANGE_GRAPH:
        _run_build_change_graph(parser)
    elif current_mode == RunModes.COLLECT_CHANGE_GRAPHS:
        GitAnalyzer().build_change_graphs()
    elif current_mode == RunModes.MINE_PATTERNS:
        _run_mine_patterns(parser)
    else:
        raise ValueError(f'Unknown mode: {current_mode}, expected one of {RunModes.ALL}')


def _run_build_py_flow_graph(parser):
    # Build a flow graph for a single source file and export it as an image.
    parser.add_argument('-i', '--input', help='Path to source code file', type=str, required=True)
    parser.add_argument('-o', '--output', help='Path to output file', type=str, default='pyflowgraph.dot')
    parser.add_argument('--no-closure', action='store_true')
    parser.add_argument('--show-deps', action='store_true')
    parser.add_argument('--hide-op-kinds', action='store_true')
    parser.add_argument('--show-data-keys', action='store_true')
    args = parser.parse_args()

    fg = pyflowgraph.build_from_file(
        args.input, show_dependencies=args.show_deps, build_closure=not args.no_closure)
    pyflowgraph.export_graph_image(
        fg, args.output, show_op_kinds=not args.hide_op_kinds, show_data_keys=args.show_data_keys)


def _run_build_change_graph(parser):
    # Build a change graph from a before/after file pair and export it as an image.
    parser.add_argument('-s', '--src', help='Path to source code before changes', type=str, required=True)
    parser.add_argument('-d', '--dest', help='Path to source code after changes', type=str, required=True)
    parser.add_argument('-o', '--output', help='Path to output file', type=str, default='changegraph.dot')
    args = parser.parse_args()

    fg = changegraph.build_from_files(args.src, args.dest)
    changegraph.export_graph_image(fg, args.output)


def _run_mine_patterns(parser):
    # Mine patterns either from explicit file pairs or from the graph storage dir.
    parser.add_argument('-s', '--src', help='Path to source code before changes', type=str, nargs='+')
    parser.add_argument('-d', '--dest', help='Path to source code after changes', type=str, nargs='+')
    parser.add_argument('--fake-mining', action='store_true')
    args = parser.parse_args()

    if args.src or args.dest or args.fake_mining:
        # Explicit check on args.dest: the old code called len(None) here (TypeError).
        if not args.src or not args.dest or len(args.src) != len(args.dest):
            raise ValueError('src and dest have different size or unset')
        change_graphs = _build_change_graphs_from_files(args.src, args.dest)

        miner = Miner()
        if args.fake_mining:
            # Wrap every graph in a single-fragment pattern without real mining.
            for cg in change_graphs:
                fragment = Fragment()
                fragment.graph = cg
                fragment.nodes = cg.nodes
                miner.add_pattern(Pattern([fragment]))
        else:
            miner.mine_patterns(change_graphs)
        miner.print_patterns()
    else:
        change_graphs = _load_change_graphs_from_storage()

        logger.warning('Pattern mining has started')
        miner = Miner()
        try:
            miner.mine_patterns(change_graphs)
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt: mined patterns will be stored before exit')
        miner.print_patterns()


def _build_change_graphs_from_files(src_paths, dest_paths):
    # Build one change graph per (before, after) file pair using mock repo metadata.
    change_graphs = []
    for old_path, new_path in zip(src_paths, dest_paths):
        methods = []
        for path in (old_path, new_path):
            # 'r' instead of 'r+': the file is only read, and 'r+' fails on
            # read-only inputs.
            with open(path, 'r') as f:
                src = f.read()
            methods.append(Method(path, 'test_name', ast.parse(src, mode='exec').body[0], src))

        mock_commit_dtm = datetime.datetime.now(tz=datetime.timezone.utc)
        repo_info = RepoInfo(
            'mock repo path', 'mock repo name', 'mock repo url', 'mock hash', mock_commit_dtm,
            'mock old file path', 'mock new file path', methods[0], methods[1])
        change_graphs.append(changegraph.build_from_files(old_path, new_path, repo_info=repo_info))
    return change_graphs


def _load_change_graphs_from_storage():
    # Load pickled change graphs from the storage directory, skipping broken files.
    storage_dir = settings.get('change_graphs_storage_dir')
    file_names = os.listdir(storage_dir)
    logger.warning(f'Found {len(file_names)} files in storage directory')

    change_graphs = []
    for file_num, file_name in enumerate(file_names):
        file_path = os.path.join(storage_dir, file_name)
        try:
            with open(file_path, 'rb') as f:
                graphs = pickle.load(f)
            for graph in graphs:
                change_graphs.append(pickle.loads(graph))
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            logger.warning(f'Incorrect file {file_path}')
        if file_num % 1000 == 0:
            logger.warning(f'Loaded [{1+file_num}/{len(file_names)}] files')
    return change_graphs