def train_process(target_metric_code="M4"):
    """ Define needed columns for dataset and run model training """
    config = Config.get_patterns_config()
    only_patterns = [
        x['code'] for x in list(config['patterns'])
        if x['code'] not in config['patterns_exclude']
    ]
    only_metrics = \
        [x['code'] for x in list(config['metrics'])
         if x['code'] not in config['metrics_exclude']] \
        + ['halstead volume']
    columns_features = only_metrics + only_patterns
    features_number = len(columns_features)
    print("General number of features in config: ", features_number)
    train_dataset = pd.read_csv(Config.train_csv(), index_col=None)
    model = PatternRankingModel()
    # At the moment we use patterns as features,
    # but in the future features can also include metrics.
    # We should distinguish them for any purpose (scaling, etc.)
    features_conf = {
        "features_order": only_patterns,
        "patterns_only": only_patterns
    }
    model.features_conf = features_conf
    print('Scaling features...')
    scaled_dataset = scale_dataset(train_dataset, model.features_conf, target_metric_code)
    dataset = scaled_dataset[only_patterns]
    print('Training model...')
    model.fit_regressor(dataset, scaled_dataset[target_metric_code])
    save_model_file = Path(Config.folder_to_save_model_data(), 'model.pkl')
    print('Saving model to file {}:'.format(save_model_file))
    with open(save_model_file, 'wb') as fid:
        pickle.dump(model, fid)
    load_model_file = Path(Config.folder_to_save_model_data(), 'model.pkl')
    print('Test loaded model from file {}:'.format(load_model_file))
    test_dataset = pd.read_csv(Config.test_csv(), index_col=None)
    with open(load_model_file, 'rb') as fid:
        model_new = pickle.load(fid)
    scaled_test_dataset = scale_dataset(
        test_dataset, model_new.features_conf, target_metric_code).sample(n=10, random_state=17)
    print('Model has been loaded successfully')
    # add ncss: M2 (ncss) is needed as the last column of the informative output
    X_test = scaled_test_dataset[only_patterns + ['M2']]
    for _, row in X_test.iterrows():
        preds, importances = model_new.rank(row.values)
        print(preds)
    path_with_logs = Path(os.getcwd(), 'catboost_info')
    print('Removing path with catboost logs {}'.format(path_with_logs))
    if path_with_logs.exists():
        shutil.rmtree(path_with_logs)
def calculate_patterns_and_metrics(file, args):
    # keep counts and code lines in separate dicts: aliasing them to one dict
    # would mix 'X' count keys with 'lines_X' line lists in the model input
    code_lines_dict = {}  # type: ignore
    input_params = {}  # type: ignore
    error_exc = None
    patterns_to_suppress = args.suppress
    try:
        config = Config.get_patterns_config()
        for pattern in config['patterns']:
            if pattern['code'] in config['patterns_exclude']:
                continue
            if pattern['code'] in patterns_to_suppress:
                input_params[pattern['code']] = 0
            else:
                __count_value(pattern, input_params, code_lines_dict, file)
        for metric in config['metrics']:
            if metric['code'] in config['metrics_exclude']:
                continue
            __count_value(metric, input_params, code_lines_dict, file, is_metric=True)
    except Exception as ex:
        error_exc = ex
        input_params = []  # type: ignore
    return input_params, code_lines_dict, error_exc
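
# A minimal invocation sketch for calculate_patterns_and_metrics (illustrative
# only; 'Foo.java' and the suppressed code 'P23' are hypothetical, and `args`
# mimics the object produced by the CLI parser's `--suppress` option):
from argparse import Namespace

def _calculate_patterns_demo():
    args = Namespace(suppress=['P23'])
    input_params, code_lines_dict, error = calculate_patterns_and_metrics('Foo.java', args)
    if error is None:
        print(sorted(input_params))
    else:
        print('Calculation failed:', error)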
def collect_dataset(args):
    """
    Run bash scripts to collect metrics and patterns for java files
    """

    def make_patterns(args, cur_work_dir):
        print('Compute patterns...')
        result = subprocess.run(['make', 'patterns'], stdout=subprocess.PIPE,
                                encoding='utf-8', cwd=cur_work_dir)
        print(result.returncode)
        if result.returncode != 0:
            print(result.stderr)
            exit(3)
        else:
            print(result.stdout)
        if args.dataset_file:
            dataset_file_path = Path(cur_work_dir, args.dataset_file)
            if not dataset_file_path.parent.exists():
                dataset_file_path.parent.mkdir(parents=True)
            shutil.copy(Path(Config.dataset_file()), dataset_file_path)
        else:
            dataset_file_path = Path(Config.dataset_file())
        print('dataset was saved to {}'.format(str(dataset_file_path.absolute())))

    def run_cmd(metrics_cmd, cur_work_dir):
        result = subprocess.run(metrics_cmd, stdout=subprocess.PIPE,
                                encoding='utf-8', cwd=cur_work_dir)
        if result.returncode != 0:
            print(result.stderr)
            exit(1)
        else:
            print(result.stdout)

    # path to java files which will be analyzed
    java_folder = args.java_folder
    max_classes = args.max_classes
    os.chdir(Path(Config.home_aibolit_folder(), 'scripts'))
    if not java_folder:
        java_folder = Path('target/01').absolute()
        print('Analyzing {} dir:'.format(java_folder))
    cur_work_dir = Path(os.getcwd())
    print('Current working directory: ', cur_work_dir)
    print('Directory with JAVA classes: ', java_folder)
    print('Filtering java files...')
    filter_cmd = ['make', 'filter']
    merge_cmd = ['make', 'merge']
    split_cmd = ['make', 'split']
    if java_folder is not None:
        filter_cmd.append(f'dir={java_folder}')
    if max_classes is not None:
        filter_cmd.append(f'max_classes={max_classes}')
    run_cmd(filter_cmd, cur_work_dir)
    make_patterns(args, cur_work_dir)
    print('Merge results...')
    run_cmd(merge_cmd, cur_work_dir)
    print('Preprocess dataset, create train and test...')
    run_cmd(split_cmd, cur_work_dir)
def scale_dataset(
        df: pd.DataFrame,
        features_conf: Dict[Any, Any],
        scale_ncss=True) -> pd.DataFrame:
    config = Config.get_patterns_config()
    patterns_codes_set = {x['code'] for x in config['patterns']}
    metrics_codes_set = {x['code'] for x in config['metrics']}
    exclude_features = set(config['patterns_exclude']).union(set(config['metrics_exclude']))
    used_codes = set(features_conf['features_order'])
    used_codes.add('M4')
    not_scaled_codes = patterns_codes_set.union(metrics_codes_set) \
        .difference(used_codes).difference(exclude_features)
    features_not_in_config = set(df.columns).difference(not_scaled_codes).difference(used_codes)
    not_scaled_codes = sorted(not_scaled_codes.union(features_not_in_config))
    codes_to_scale = sorted(used_codes)
    if scale_ncss:
        # divide each used feature by ncss (M2) to normalize it per line of code
        scaled_df = pd.DataFrame(
            df[codes_to_scale].values / df['M2'].values.reshape((-1, 1)),
            columns=codes_to_scale)
        not_scaled_df = df[not_scaled_codes]
        result = pd.concat([scaled_df, not_scaled_df], axis=1)
    else:
        result = df
    return result
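
# A toy illustration of the ncss scaling performed above (a sketch with made-up
# columns P1 and M4; M2 is the ncss metric): each feature value is divided by
# the snippet's ncss, so raw counts become per-line-of-code rates.
def _scale_demo():
    import pandas as pd
    df = pd.DataFrame({'P1': [4, 10], 'M4': [8, 20], 'M2': [2, 5]})
    scaled = df[['M4', 'P1']].values / df['M2'].values.reshape((-1, 1))
    print(pd.DataFrame(scaled, columns=['M4', 'P1']))
    # both rows scale to M4=4.0, P1=2.0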
def inference(
        input_params: Dict[str, int],
        code_lines_dict,
        args):
    """
    Find a pattern which has the largest impact on target

    :param input_params: dict of calculated patterns/metrics
    :param code_lines_dict: dict with found code lines of patterns/metrics
    :return: list of results with code_lines for each pattern and its name
    """
    model_path = args.model
    do_full_report = args.full
    results = []
    if input_params:
        if not model_path:
            model_path = Config.folder_model_data()
        with open(model_path, 'rb') as fid:
            model = pickle.load(fid)
        sorted_result, importances = model.predict(input_params)
        patterns_list = model.features_conf['features_order']
        for i, (key, val) in enumerate(sorted_result.items()):
            if key in patterns_list:
                pattern_code = key
                code_lines = code_lines_dict.get('lines_' + key)
                importance = importances[i] * input_params[pattern_code]
                # We show only patterns with positive importance
                if code_lines and importance > 0:
                    pattern_name = [
                        x['name'] for x in Config.get_patterns_config()['patterns']
                        if x['code'] == pattern_code
                    ][0]
                    results.append({
                        'code_lines': code_lines,
                        'pattern_code': pattern_code,
                        'pattern_name': pattern_name,
                        'importance': importance,
                    })
                    if not do_full_report:
                        break
    return results
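
# A hedged call sketch for the inference function above (assumptions: the dicts
# come from calculate_patterns_and_metrics, and `args` mirrors the CLI options
# `--model` and `--full`; model=None falls back to the bundled model file):
from argparse import Namespace

def _inference_demo(input_params, code_lines_dict):
    args = Namespace(model=None, full=True)
    for found in inference(input_params, code_lines_dict, args):
        print(found['pattern_code'], found['pattern_name'], found['importance'])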
def preprocess_file(self, scale_ncss=True, scale=False, **kwargs):
    df = pd.read_csv(Config.dataset_file())
    df = df[~df["filename"].str.lower().str.contains("test")]
    config = Config.get_patterns_config()
    if self.do_rename_columns:
        p_codes = \
            [x['code'] for x in config['patterns']] \
            + ['lines_' + x['code'] for x in config['patterns']]
        m_codes = [x['code'] for x in config['metrics']]
        keys = p_codes + m_codes
        vals = \
            [x['name'] for x in config['patterns']] \
            + ['lines_' + x['name'] for x in config['patterns']] \
            + [x['name'] for x in config['metrics']]
        replace_dict = dict(zip(keys, vals))
        # rename columns explicitly; assigning df.columns = vals would depend
        # on column order and could silently mislabel features
        df = df.rename(columns=replace_dict)
        print('Columns renamed:', df.head())
    df = df.dropna().drop_duplicates(subset=df.columns.difference(['filename']))
    df = df[(df.ncss > 20) & (df.ncss < 100) & (df.npath_method_avg < 100000.00)] \
        .copy().reset_index()
    df.drop('filename', axis=1, inplace=True)
    df.drop('index', axis=1, inplace=True)
    self.target = df[['M4']].values[:, 0]
    if scale_ncss:
        new = pd.DataFrame(
            df[self.only_patterns].values / df['M2'].values.reshape((-1, 1)),
            columns=self.only_patterns)
        self.target /= df['M2'].values.reshape(-1)
    else:
        new = df[self.only_patterns].copy()
    if scale:
        self.input = pd.DataFrame(
            StandardScaler().fit_transform(new.values),
            columns=new.columns,
            index=new.index).values
    else:
        self.input = new.values
    self.feature_order = list(new.columns)
def _calculate_patterns_and_metrics(
        file_path: str, is_decomposition_requested: bool) -> List[Dict[str, Any]]:
    results: List[Dict[str, Any]] = []
    config = Config.get_patterns_config()
    patterns_info = [
        pattern_info
        for pattern_info in config["patterns"]
        if pattern_info["code"] not in config["patterns_exclude"]
    ]
    metrics_info = [
        metric_info
        for metric_info in config["metrics"]
        if metric_info["code"] not in config["metrics_exclude"]
    ]
    ast = AST.build_from_javalang(build_ast(file_path))
    classes_ast = [
        ast.get_subtree(node)
        for node in ast.get_root().types
        if node.node_type == ASTNodeType.CLASS_DECLARATION
    ]
    for class_ast in classes_ast:
        components = (
            decompose_java_class(class_ast, "strong", ignore_getters=True, ignore_setters=True)
            if is_decomposition_requested
            else [class_ast])
        for index, component_ast in enumerate(components):
            calculation_result = {
                "filepath": file_path,
                "class_name": class_ast.get_root().name,
                "component_index": index,
            }
            for pattern_info in patterns_info:
                try:
                    pattern = pattern_info["make"]()
                    pattern_result = pattern.value(component_ast)
                    calculation_result[pattern_info["code"]] = len(pattern_result)
                    calculation_result["lines_" + pattern_info["code"]] = pattern_result
                except Exception as cause:
                    raise FileProcessingError(file_path, pattern_info["name"], cause)
            for metric_info in metrics_info:
                try:
                    metric = metric_info["make"]()
                    metric_result = metric.value(component_ast)
                    calculation_result[metric_info["code"]] = metric_result
                except Exception as cause:
                    raise FileProcessingError(file_path, metric_info["name"], cause)
            results.append(calculation_result)
    return results
def calculate_patterns_and_metrics_with_decomposition(file_path: str, args):
    error_exc = None
    patterns_to_suppress = args.suppress
    results_for_components = []
    try:
        config = Config.get_patterns_config()
        patterns_info = [
            pattern_info
            for pattern_info in config["patterns"]
            if pattern_info["code"] not in config["patterns_exclude"]
        ]
        metrics_info = [
            metric_info
            for metric_info in config["metrics"]
            if metric_info["code"] not in config["metrics_exclude"]
        ]
        ast = AST.build_from_javalang(build_ast(file_path))
        classes_ast = [
            ast.get_subtree(node)
            for node in ast.get_root().types
            if node.node_type == ASTNodeType.CLASS_DECLARATION
        ]
        for class_ast in classes_ast:
            for index, component_ast in enumerate(decompose_java_class(class_ast, "strong")):
                result_for_component: Dict[Any, Any] = {}
                code_lines_dict: Dict[Any, Any] = OrderedDict()
                input_params = OrderedDict()  # type: ignore
                # excluded patterns are already filtered out of patterns_info
                for pattern_info in patterns_info:
                    if pattern_info['code'] in patterns_to_suppress:
                        input_params[pattern_info["code"]] = 0
                        code_lines_dict["lines_" + pattern_info["code"]] = []
                    else:
                        pattern = pattern_info["make"]()
                        pattern_result = pattern.value(component_ast)
                        input_params[pattern_info["code"]] = len(pattern_result)
                        code_lines_dict["lines_" + pattern_info["code"]] = pattern_result
                for metric_info in metrics_info:
                    metric = metric_info["make"]()
                    metric_result = metric.value(component_ast)
                    input_params[metric_info["code"]] = metric_result
                result_for_component['code_lines_dict'] = code_lines_dict
                result_for_component['input_params'] = input_params
                result_for_component['index'] = index
                results_for_components.append(result_for_component)
    except Exception as ex:
        error_exc = ex
    return results_for_components, error_exc
def inference(input_params: List[int], code_lines_dict, args):
    """
    Find a pattern which has the largest impact on target

    :param input_params: list of calculated patterns/metrics
    :param code_lines_dict: dict with found code lines of patterns/metrics
    :return: code lines, pattern code and pattern name of the chosen pattern
    """
    model_path = args.model_file
    if input_params:
        if not model_path:
            model_path = Config.folder_model_data()
        with open(model_path, 'rb') as fid:
            model = pickle.load(fid)
        sorted_result = predict(input_params, model, args)
        code_lines = None
        patterns_list = model.features_conf['patterns_only']
        pattern_code = None  # type: ignore
        for key, val in sorted_result.items():
            if key in patterns_list:
                pattern_code = key
                code_lines = code_lines_dict.get('lines_' + key)
                # We show only patterns with a positive gradient;
                # near-zero or negative values are skipped
                if code_lines and val > 1.00000e-20:
                    break
        if code_lines:
            pattern_name = [
                x['name'] for x in Config.get_patterns_config()['patterns']
                if x['code'] == pattern_code
            ][0]
        else:
            pattern_name = None
            pattern_code = None
            code_lines = []
    else:
        code_lines = []
        pattern_code = None  # type: ignore
        pattern_name = None
    return code_lines, pattern_code, pattern_name
def test(self, files: List[str]) -> List[List[Any]]:
    """Make predict for list of java files using current model."""
    config = Config.get_patterns_config()
    patterns_config = config['patterns']
    metrics_config = config['metrics']
    patterns_codes = [x['code'] for x in patterns_config]
    metrics_codes = [x['code'] for x in metrics_config]
    features = self.features_conf['features_order']
    results = []
    for file in files:
        row = {'filename': file}
        # we need M2 (ncss) to scale the snippet, so calculate it once per file
        # even when it is not listed in features_conf
        ncss_metric = [x for x in metrics_config if x['code'] == 'M2'][0]
        row['M2'] = ncss_metric['make']().value(file)
        for feature in features:
            if feature in patterns_codes:
                found_feature = [x for x in patterns_config if x['code'] == feature][0]
                lines = found_feature['make']().value(file)
                row[feature] = len(lines)
            elif feature in metrics_codes:
                found_feature = [x for x in metrics_config if x['code'] == feature][0]
                val = found_feature['make']().value(file)
                if val:
                    row[feature] = val
        # append the row once per file; appending inside the feature loop
        # would duplicate the same dict in results
        results.append(row)
    result_array = []
    for row in results:
        sorted_result, importances = self.predict(row)
        result_array.append([
            row['filename'],
            list(sorted_result.keys()),
            importances
        ])
    return result_array
def get_patterns_name() -> Dict[Any, Any]:
    only_patterns = []
    patterns_code = []
    config = Config.get_patterns_config()
    for x in config['patterns']:
        if x['code'] not in config['patterns_exclude']:
            only_patterns.append(x['name'])
            patterns_code.append(x['code'])
    features_number = len(only_patterns)
    print("Number of features: ", features_number)
    patterns = {x['code']: x['name'] for x in config['patterns']}
    metrics = {x['code']: x['name'] for x in config['metrics']}
    replace_dict = dict(patterns, **metrics)
    return replace_dict
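
# A short usage sketch: the code->name mapping returned by get_patterns_name()
# is handy for making dataset columns human-readable (assumes a pandas
# DataFrame whose columns are pattern/metric codes such as 'P1' or 'M2'):
def _rename_columns_demo(df):
    replace_dict = get_patterns_name()
    return df.rename(columns=replace_dict)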
def generate_fake_dataset() -> pd.DataFrame:
    config = Config.get_patterns_config()
    patterns = [x['code'] for x in config['patterns']]
    metrics = [x['code'] for x in config['metrics']]
    min_rows_for_train = 10
    rows = []
    for x in range(min_rows_for_train):
        p = {code: (x + i) for i, code in enumerate(patterns)}
        m = {code: (x + i) for i, code in enumerate(metrics)}
        rows.append({**p, **m})
    # DataFrame.append was deprecated and removed in pandas 2.0,
    # so collect the rows first and build the frame in one call
    train_df = pd.DataFrame(rows, columns=patterns + metrics).astype(float)
    return train_df
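
# A tiny shape check for the fake dataset (a sketch): ten rows, one column per
# pattern and metric code, all values floats.
def _fake_dataset_demo():
    df = generate_fake_dataset()
    print(df.shape, df.dtypes.unique())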
def aibolit_stat(test_csv: pd.DataFrame, model=None) -> pd.DataFrame:
    if not model:
        load_model_file = Config.folder_model_data()
        print('Loading model from file {}:'.format(load_model_file))
        with open(load_model_file, 'rb') as fid:
            model = pickle.load(fid)
            print('Model has been loaded successfully')
    scaled_dataset = scale_dataset(test_csv, model.features_conf)
    cleaned_dataset = scaled_dataset[model.features_conf['features_order'] + ['M2']]
    ranked, _, acts_complexity, acts = Stats.check_impact(cleaned_dataset.values, model)
    m, p = Stats.count_acts(acts, ranked)
    return Stats.get_table(model.features_conf['features_order'], m, p, acts_complexity)
def test_model_training():
    cur_file_dir = Path(os.path.realpath(__file__)).parent
    config = Config.get_patterns_config()
    model = PatternRankingModel()
    patterns = [x['code'] for x in config['patterns']]
    train_df = generate_fake_dataset()
    model.features_conf = {'features_order': patterns}
    scaled_df = scale_dataset(train_df, model.features_conf)
    start = time()
    print('Start training...')
    model.fit_regressor(scaled_df[patterns], scaled_df['M4'])
    end = time()
    print('End training. Elapsed time: {:.2f} secs'.format(end - start))
    # this folder is always created by the catboost library and cannot be
    # disabled, so remove it after training
    catboost_folder = Path(cur_file_dir, 'catboost_info')
    if catboost_folder.exists():
        shutil.rmtree(catboost_folder)
def execute_python_code_in_parallel_thread(exceptions, file_absolute_path):
    """ This runs in a separate thread. """
    file_absolute_path = file_absolute_path.strip()
    file_path = Path(file_absolute_path)
    row = {'filename': file_path.absolute().as_posix()}
    config = Config.get_patterns_config()
    for pattern in config['patterns']:
        val = None
        acronym = pattern['code']
        if acronym not in config['patterns_exclude']:
            try:
                val = pattern['make']().value(str(file_path))
                row[acronym] = len(val)
                row['lines_' + acronym] = val
            except Exception:
                exc_type, exc_value, exc_tb = sys.exc_info()
                row['lines_' + acronym] = row[acronym] = val
                traceback_str = traceback.format_exc()
                exceptions[file_absolute_path] = {
                    'traceback': traceback_str,
                    'exc_type': str(exc_value),
                    'pattern_name': pattern['name'],
                }
    for metric in config['metrics']:
        val = None
        acronym = metric['code']
        if acronym not in config['metrics_exclude']:
            try:
                val = metric['make']().value(str(file_path))
                row[acronym] = val
            except Exception:
                exc_type, exc_value, exc_tb = sys.exc_info()
                row[acronym] = val
                traceback_str = traceback.format_exc()
                exceptions[file_absolute_path] = {
                    'traceback': traceback_str,
                    'exc_type': str(exc_value),
                    'pattern_name': metric['name'],
                }
    return row
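
# A minimal driver sketch for the worker above (an assumption, not part of the
# original module): the `exceptions` dict is shared across threads, which is
# safe here because each worker only writes its own file's key.
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def _run_workers_demo(java_files):
    exceptions = {}
    with ThreadPoolExecutor(max_workers=4) as pool:
        rows = list(pool.map(
            partial(execute_python_code_in_parallel_thread, exceptions),
            java_files))
    return rows, exceptions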
def calculate_patterns_and_metrics(file):
    code_lines_dict = {}  # type: ignore
    input_params = {}  # type: ignore
    error_string = None
    try:
        config = Config.get_patterns_config()
        for pattern in config['patterns']:
            # compare by code: each config entry is a dict, not a plain string
            if pattern['code'] in config['patterns_exclude']:
                continue
            __count_value(pattern, input_params, code_lines_dict, file)
        for metric in config['metrics']:
            if metric['code'] in config['metrics_exclude']:
                continue
            __count_value(metric, input_params, code_lines_dict, file, is_metric=True)
    except Exception as ex:
        error_string = str(ex)
        input_params = []  # type: ignore
    return input_params, code_lines_dict, error_string
def _create_dataset_writer(file):
    config = Config.get_patterns_config()
    patterns_codes = [
        pattern["code"]
        for pattern in config["patterns"]
        if pattern["code"] not in config["patterns_exclude"]
    ]
    metrics_codes = [
        metric["code"]
        for metric in config["metrics"]
        if metric["code"] not in config["metrics_exclude"]
    ]
    fields = \
        patterns_codes + \
        metrics_codes + \
        ["lines_" + code for code in patterns_codes] + \
        ["filepath", "class_name", "component_index"]
    return DictWriter(file, delimiter=";", quotechar='"',
                      quoting=QUOTE_MINIMAL, fieldnames=fields)
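
# A hedged usage sketch for _create_dataset_writer: one CSV row per analyzed
# component. Only the bookkeeping fields are filled below; pattern and metric
# columns fall back to DictWriter's empty default ('Foo.java' is hypothetical).
def _dataset_writer_demo():
    with open('dataset_demo.csv', 'w', newline='') as csv_file:
        writer = _create_dataset_writer(csv_file)
        writer.writeheader()
        writer.writerow({
            'filepath': 'Foo.java',
            'class_name': 'Foo',
            'component_index': 0,
        })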
def __load_mock_model(self):
    config = Config.get_patterns_config()
    patterns = [x['code'] for x in config['patterns']]

    class MockModel:

        def predict(self, input: np.ndarray) -> np.ndarray:
            results = []
            for row in input:
                s = sum(row)
                radian = math.radians(s)
                results.append(math.sin(radian))
            return np.array(results)

    class PatternRankingModel:

        def __init__(self):
            self.features_conf = {
                'features_order': patterns,
                'patterns_only': patterns
            }
            self.model = MockModel()

    return PatternRankingModel()
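
# A quick sanity check of the mock above (a sketch; `mock_ranking_model` is
# assumed to be the object returned by __load_mock_model): the mock scores each
# row as sin(radians(sum(row))), so rows summing to 90 and 0 should score
# roughly 1.0 and 0.0.
import numpy as np

def _mock_model_demo(mock_ranking_model):
    scores = mock_ranking_model.model.predict(np.array([[30, 60], [0, 0]]))
    print(scores)  # approximately [1.0, 0.0]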
def __init__(self, *args, **kwargs):
    super(TestRecommendPipeline, self).__init__(*args, **kwargs)
    self.cur_file_dir = Path(os.path.realpath(__file__)).parent
    self.config = Config.get_patterns_config()
import os
import sys

from aibolit.config import Config

# TODO: fix all errors in the patterns/metrics and make these lists empty
EXCLUDE_PATTERNS = ['P31', 'P32']
EXCLUDE_METRICS = []

current_path: str = os.path.dirname(os.path.realpath(__file__))
for filename in os.listdir(current_path + '/samples'):
    for pattern in Config.get_patterns_config()['patterns']:
        if pattern['code'] in EXCLUDE_PATTERNS:
            continue
        try:
            path_to_file = os.path.join(current_path, 'samples', filename)
            pattern['make']().value(path_to_file)
        except Exception:
            print(
                "Error applying the pattern:",
                pattern['name'],
                pattern['code'],
                "to file",
                filename
            )
            sys.exit(1)
    for metric in Config.get_patterns_config()['metrics']:
        if metric['code'] in EXCLUDE_METRICS:
            continue
        try:
            path_to_file = os.path.join(current_path, 'samples', filename)
            metric['make']().value(path_to_file)
        except Exception:
            print(
                "Error applying the metric:",
                metric['name'],
                metric['code'],
                "to file",
                filename
            )
            sys.exit(1)
def _check_metric(metric_info, filepath):
    try:
        metric = metric_info["make"]()
        if metric_info["code"] in METRICS_ACCEPT_FILE_PATH:
            metric_result = metric.value(filepath)
        else:
            ast = AST.build_from_javalang(build_ast(filepath))
            metric_result = metric.value(ast)
        assert isinstance(metric_result, (int, float, np.float64)), (
            f"Metric returned {metric_result} of type {type(metric_result)}, "
            "but int, float or numpy float was expected")
    except Exception as exception:
        raise RuntimeError(
            f"Error in application of the metric {metric_info['name']} "
            f"with code {metric_info['code']} for file {filepath}"
        ) from exception


if __name__ == "__main__":
    config = Config.get_patterns_config()
    print(f"Processed files in {samples_path}:")
    for filename in tqdm(listdir(samples_path)):
        for pattern_info in config["patterns"]:
            if pattern_info["code"] not in EXCLUDE_PATTERNS:
                _check_pattern(pattern_info, samples_path / filename)
        for metric_info in config["metrics"]:
            if metric_info["code"] not in EXCLUDE_METRICS:
                _check_metric(metric_info, samples_path / filename)