import os
from functools import reduce
from pathlib import Path

import pandas as pd

# Project-local dependencies used below; their module paths are not shown in
# this file, so these imports are indicative only:
# from classification_instance import ClassificationInstance
# from config import Config
# from data_name_enum import DataNameEnum


def create_sub_data_set_by_columns(self, columns, dataset_cols, dir_name, label,
                                   names, sub_dir, testing_df, training_df):
    """Train and score one classifier per metric group in `columns`."""
    scores = []
    for d in columns:
        # Keep every dataset column that contains any of the group's
        # metric-name fragments as a substring.
        cols = set(filter(lambda dc: any(map(lambda c: c in dc, columns[d])),
                          dataset_cols))
        if len(cols) == 0:
            continue
        cols.add(label)
        cols = list(cols)
        train = training_df[cols]
        test = testing_df[cols]
        ci = ClassificationInstance(
            train, test, names,
            self.get_dataset_path(os.path.join(dir_name, sub_dir, d)),
            label=label)
        try:
            ci.predict()
            ci_scores = dict(ci.scores)
            ci_scores.update({"type": dir_name, "data_type": d})
            scores.append(ci_scores)
        except Exception as e:
            print(e)
    return scores
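
# Illustration only; not called by the pipeline. A toy version of the
# substring filter above, with invented group and column names: a dataset
# column is kept when any of the group's metric-name fragments occurs in it.
def _demo_column_filter():
    groups = {"size": {"LOC", "NOM"}, "complexity": {"WMC"}}
    dataset_cols = {"sum_LOC", "avg_NOM", "max_WMC", "Bugged"}
    for group, fragments in groups.items():
        kept = {dc for dc in dataset_cols if any(f in dc for f in fragments)}
        print(group, sorted(kept))
    # size ['avg_NOM', 'sum_LOC']
    # complexity ['max_WMC']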
def extract_methods_datasets(self, training_datasets, testing_dataset):
    training = pd.concat(training_datasets, ignore_index=True).drop(
        "Method_ids", axis=1, errors='ignore')
    training = self.fillna(training)
    testing = self.fillna(testing_dataset)
    # pop() removes the id column from the test features and keeps the ids
    # as the row names handed to the classifier.
    methods_testing_names = testing.pop("Method_ids").values.tolist()
    return ClassificationInstance(training, testing, methods_testing_names,
                                  self.get_dataset_path("methods"),
                                  label="BuggedMethods")
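
# Illustration only, with invented toy frames: per-version frames are
# concatenated into the training set, and pop() splits the id column out of
# the test features, as extract_methods_datasets does above.
def _demo_concat_and_pop():
    v1 = pd.DataFrame({"Method_ids": ["a.m1"], "LOC": [10], "BuggedMethods": [True]})
    v2 = pd.DataFrame({"Method_ids": ["a.m2"], "LOC": [20], "BuggedMethods": [False]})
    training = pd.concat([v1, v2], ignore_index=True).drop(
        "Method_ids", axis=1, errors='ignore')
    testing = pd.DataFrame({"Method_ids": ["b.m1"], "LOC": [5], "BuggedMethods": [True]})
    names = testing.pop("Method_ids").values.tolist()
    print(training.columns.tolist())  # ['LOC', 'BuggedMethods']
    print(names, testing.columns.tolist())  # ['b.m1'] ['LOC', 'BuggedMethods']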
def extract_classes_datasets(self, training_datasets, testing_dataset,
                             sub_dir="classes"):
    training = pd.concat(training_datasets, ignore_index=True).drop(
        ["File", "Class", "Method_ids"], axis=1, errors='ignore')
    training = self.fillna(training)
    testing = testing_dataset.drop(["Method_ids", "Class"], axis=1,
                                   errors='ignore')
    testing = self.fillna(testing, default='')
    file_names = testing.pop("File").values.tolist()
    # classes_names = testing.pop("Class").values.tolist()
    # classes_testing_names = list(map("@".join, zip(
    #     file_names,
    #     ['' if x in (False, True) else x for x in classes_names])))
    return ClassificationInstance(training, testing, file_names,
                                  self.get_dataset_path(sub_dir))
def create_all_but_one_dataset(self, data_types):
    alls = {}
    ones = {}
    detailed = {}
    # Group metric names by data type, keeping only the requested types.
    for d in DataNameEnum:
        if d.value.data_type.value in data_types:
            detailed.setdefault(d.value.data_type.value, set()).add(d.value.name)
    for d in detailed:
        # 'one' keeps only d's metrics; 'all' is the union of everyone else's.
        ones[d] = detailed[d]
        all_but_d = list(detailed.keys())
        all_but_d.remove(d)
        alls[d] = reduce(set.__or__, list(map(detailed.get, all_but_d)), set())
    for sub_dir, label in [("methods", "BuggedMethods"), ("classes", "Bugged")]:
        scores = []
        training_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                               "training.csv"), sep=';')
        testing_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                              "testing.csv"), sep=';')
        dataset_cols = set(training_df.columns.to_list()).intersection(
            set(testing_df.columns.to_list()))
        names = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                         "prediction.csv"),
                            sep=';')['name'].to_list()
        for dir_name, columns in (('one', ones), ('all', alls)):
            for d in columns:
                cols = set(filter(
                    lambda dc: any(map(lambda c: c in dc, columns[d])),
                    dataset_cols))
                if len(cols) == 0:
                    continue
                cols.add(label)
                cols = list(cols)
                train = training_df[cols]
                test = testing_df[cols]
                ci = ClassificationInstance(
                    train, test, names,
                    self.get_dataset_path(os.path.join(dir_name, sub_dir, d)),
                    label=label)
                try:
                    ci.predict()
                    ci_scores = dict(ci.scores)
                    ci_scores.update({"type": dir_name, "data_type": d})
                    scores.append(ci_scores)
                except Exception as e:
                    print(e)
        pd.DataFrame(scores).to_csv(
            self.get_dataset_path(sub_dir + "_metrics.csv", False),
            index=False, sep=';')
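
# Illustration only, with invented data types and metric names: for each data
# type d, 'one' keeps d's own metrics and 'all' is the union of every other
# type's metrics, mirroring the ones/alls construction above.
def _demo_all_but_one():
    detailed = {"size": {"LOC"}, "ck": {"WMC", "CBO"}, "process": {"churn"}}
    for d in detailed:
        alls_d = reduce(set.__or__,
                        [detailed[x] for x in detailed if x != d],
                        set())
        print(d, "one:", detailed[d], "all:", alls_d)
    # e.g. size one: {'LOC'} all: {'WMC', 'CBO', 'churn'}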
def extract_methods_datasets(self, methods_datasets):
    dataset_dir = Config.get_work_dir_path(
        os.path.join(Config().config['CACHING']['RepositoryData'],
                     Config().config['VERSION_METRICS']['Dataset'],
                     self.project.github()))
    methods_dataset_dir = os.path.join(dataset_dir, "methods")
    Path(methods_dataset_dir).mkdir(parents=True, exist_ok=True)
    # Every version but the last is training data; the last version is the
    # test set.
    methods_training = pd.concat(methods_datasets[:-1], ignore_index=True).drop(
        "Method_ids", axis=1, errors='ignore')
    methods_testing = methods_datasets[-1]
    methods_testing_names = methods_testing.pop("Method_ids").values.tolist()
    return ClassificationInstance(methods_training, methods_testing,
                                  methods_testing_names, methods_dataset_dir,
                                  label="BuggedMethods")
def extract_classes_datasets(self, classes_datasets):
    dataset_dir = Config.get_work_dir_path(
        os.path.join(Config().config['CACHING']['RepositoryData'],
                     Config().config['VERSION_METRICS']['Dataset'],
                     self.project.github()))
    classes_dataset_dir = os.path.join(dataset_dir, "classes")
    Path(classes_dataset_dir).mkdir(parents=True, exist_ok=True)
    classes_training = pd.concat(classes_datasets[:-1], ignore_index=True).drop(
        ["File", "Class", "Method_ids"], axis=1, errors='ignore')
    classes_testing = classes_datasets[-1].drop("Method_ids", axis=1,
                                                errors='ignore')
    file_names = classes_testing.pop("File").values.tolist()
    classes_names = classes_testing.pop("Class").values.tolist()
    classes_testing_names = list(map("@".join, zip(file_names, classes_names)))
    return ClassificationInstance(classes_training, classes_testing,
                                  classes_testing_names, classes_dataset_dir)
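
# Illustration only: test rows are keyed "<file>@<class>" so a prediction can
# be traced back to a concrete class. File and class names are invented.
def _demo_class_ids():
    file_names = ["src/A.java", "src/B.java"]
    classes_names = ["A", "B"]
    print(list(map("@".join, zip(file_names, classes_names))))
    # ['src/A.java@A', 'src/B.java@B']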
def create_all_but_one_dataset(self, data_types):
    alls = {}
    ones = {}
    detailed = {d: [] for d in data_types}
    for d in DataNameEnum:
        data_type = d.value.data_type.value
        if data_type in data_types:
            detailed[data_type].append(d.value.name)
    for d in detailed:
        # 'one' keeps only d's metrics; 'all' is the union of everyone else's.
        ones[d] = set(detailed[d])
        alls[d] = reduce(set.__or__,
                         [set(detailed[x]) for x in detailed if x != d],
                         set())
    if self.quick_mode:
        dir_labels = [("classes", "bugged_Bugged")]
    else:
        dir_labels = [("methods", "bugged_methods_BuggedMethods"),
                      ("classes", "bugged_Bugged")]
    for sub_dir, label in dir_labels:
        scores = []
        training_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                               "training.csv"), sep=';')
        testing_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                              "testing.csv"), sep=';')
        names = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir),
                                         "prediction.csv"),
                            sep=';')['name'].to_list()
        # Baseline run on the full feature set.
        ci = ClassificationInstance(training_df, testing_df, names,
                                    self.get_dataset_path(sub_dir),
                                    label=label, save_all=False)
        try:
            ci.predict()
            ci_scores = dict(ci.scores)
            ci_scores.update({"type": "all_feature",
                              "data_type": "all_feature"})
            scores.append(ci_scores)
        except Exception as e:
            print(e)
        for dir_name, columns in (('one', ones), ('all', alls)):
            # Re-read the frames for each run; the previous
            # ClassificationInstance may have mutated them in place.
            training_df = pd.read_csv(os.path.join(
                self.get_dataset_path(sub_dir), "training.csv"), sep=';')
            testing_df = pd.read_csv(os.path.join(
                self.get_dataset_path(sub_dir), "testing.csv"), sep=';')
            dataset_cols = set(training_df.columns.to_list()).intersection(
                set(testing_df.columns.to_list()))
            names = pd.read_csv(os.path.join(
                self.get_dataset_path(sub_dir), "prediction.csv"),
                sep=';')['name'].to_list()
            ans = self.create_sub_data_set_by_columns(
                columns, dataset_cols, dir_name, label, names, sub_dir,
                testing_df, training_df)
            if ans:
                scores.extend(ans)
        pd.DataFrame(scores).to_csv(
            self.get_dataset_path(sub_dir + "_metrics.csv", False),
            index=False, sep=';')
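
# Illustration only: the shape of the rows written to <sub_dir>_metrics.csv
# above. The score keys here are invented; the real keys come from ci.scores.
def _demo_metrics_rows():
    import io
    scores = [
        {"precision": 0.70, "recall": 0.50, "type": "all_feature", "data_type": "all_feature"},
        {"precision": 0.60, "recall": 0.40, "type": "one", "data_type": "ck"},
        {"precision": 0.68, "recall": 0.48, "type": "all", "data_type": "ck"},
    ]
    buf = io.StringIO()
    pd.DataFrame(scores).to_csv(buf, index=False, sep=';')
    print(buf.getvalue())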