def experiment(self):
    DirStructure.mkdir(self.dir_id.experiment_matrices)
    if not os.path.exists(self.dir_id.matrices):
        return
    if not self.bugs:
        return
    if not os.path.exists(self.dir_id.experiments):
        results = dict()
        with open(self.dir_id.matrices) as f:
            json_matrix = json.loads(f.read())
        for alpha in ExperimentMatrix.ALPHA_RANGE:
            for matrix_name, influence_data in self.generate_influence_data(alpha):
                matrix = copy.deepcopy(json_matrix)
                matrix.update(influence_data)
                with open(os.path.join(self.dir_id.experiment_matrices,
                                       matrix_name + str(alpha)), "w") as f:
                    json.dump(matrix, f)
                ei = read_json_planning_instance(matrix)
                ei.diagnose()
                results.setdefault(matrix_name, dict())[alpha] = Diagnosis_Results(
                    ei.diagnoses, ei.initial_tests, ei.error).metrics
        with open(self.dir_id.experiments, "w") as f:
            json.dump(results, f)
    with open(self.dir_id.experiments) as f:
        return json.loads(f.read())
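# Hedged sketch (not part of the original module): reading back the results file
# written by experiment() above. The nesting mirrors the code above,
# {matrix_name: {alpha: metrics_dict}}; the path argument and the "precision"
# metric key are assumptions made purely for illustration.
import json

def summarize_experiment(experiments_path, metric="precision"):
    with open(experiments_path) as f:
        results = json.load(f)
    rows = []
    # flatten to (matrix_name, alpha, value) triples for quick inspection
    for matrix_name, by_alpha in results.items():
        for alpha, metric_values in by_alpha.items():
            rows.append((matrix_name, alpha, metric_values.get(metric)))
    return rows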
def call_graph(self):
    DirStructure.mkdir(self.get_dir_id().call_graphs)
    for trace in self.optimized_traces:
        g = networkx.DiGraph()
        g.add_edges_from(self.optimized_traces[trace].get_call_graph_edges())
        networkx.write_gexf(
            g, os.path.join(self.get_dir_id().call_graphs, trace + ".gexf"))
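# Hedged sketch (illustration only): reloading one of the call graphs written by
# call_graph() above with networkx.read_gexf and inspecting its size. The path and
# trace name are hypothetical and assume call_graph() has already been run.
import os
import networkx

def inspect_call_graph(call_graphs_dir, trace_name):
    # read back the GEXF file produced for a single trace (caller -> callee edges)
    g = networkx.read_gexf(os.path.join(call_graphs_dir, trace_name + ".gexf"))
    return g.number_of_nodes(), g.number_of_edges()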
def read_bugs_json(project, dir_path=None, index='3'):
    if dir_path is None:
        dir_path = os.path.join(r"C:\amirelm\component_importnace\data",
                                project + "_" + index)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    assert dir_path
    bugs = list(filter(lambda b: b['project'].lower() == project.lower(),
                       json.load(open(BugDotJar.BugDotJar_JSON))))
    projects = []
    for bug in bugs:
        commit = bug['commit']
        failing_tests = list(map(
            lambda f: ".".join(list(reversed(
                f.split()[0].split(':')[0].replace(')', '').replace('#', '.').split('(')))).lower(),
            bug['failing_tests']))
        projects.append(
            BugDotJar(bug['jira_id'],  # assumption: the jira id is the bug identifier
                      failing_tests,
                      DirId(DirStructure(dir_path), bug['jira_id'] + "_" + commit),
                      commit, project))
    return projects
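# Hedged sketch: the failing-test normalization used in read_bugs_json above, pulled
# out as a standalone helper. The sample input format ("method(fully.qualified.Class)")
# is an assumption about the Bug.jar failing_tests entries, not taken from real data.
def normalize_failing_test(entry):
    name = entry.split()[0].split(':')[0].replace(')', '').replace('#', '.')
    # reverse "method(package.Class" into "package.class.method", lower-cased
    return ".".join(reversed(name.split('('))).lower()

# e.g. normalize_failing_test("testFoo(org.example.BarTest)")
#      -> "org.example.bartest.testfoo"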
def trace(self, trace_failed=False):
    repo = Repo(self.get_dir_id().clones)
    DirStructure.mkdir(self.get_dir_id().traces)
    if self.test_traces:
        return
    tests_to_run = list(map(lambda t: ".".join(t.split('.')[:5]) + '*',
                            self.failing_tests))
    tests = tests_to_run if trace_failed else None
    self.clear()
    traces = list(repo.run_under_jcov(self.get_dir_id().traces, False,
                                      instrument_only_methods=True,
                                      short_type=True,
                                      tests_to_run=tests,
                                      check_comp_error=False))
    self.test_traces = dict(list(map(lambda t: (t.test_name, t), traces)))
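# Hedged sketch: the wildcard pattern built in trace() above, shown on an invented
# test name. Keeping the first five dot-separated parts and appending '*' widens a
# single failing test into the tests_to_run filter handed to run_under_jcov.
def to_trace_pattern(test_name):
    return ".".join(test_name.split('.')[:5]) + '*'

# e.g. to_trace_pattern("org.apache.commons.lang3.StringUtilsTest.testJoin")
#      -> "org.apache.commons.lang3.StringUtilsTest*"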
def full_experiment(bug_miner_project_name, dir_base_path=DIR_BASE_PATH,
                    num_processes=NUM_PROCCESSES):
    dir_path = os.path.join(dir_base_path, bug_miner_project_name)
    projects = BugMinerReproducer.read_bug_miner_csv(dir_path, bug_miner_project_name)
    git_path, jira_path = list(
        filter(lambda x: os.path.basename(x[1]) == bug_miner_project_name,
               projects_names.values()))[0]
    os.system("git clone {0} repos\\{1}".format(git_path, bug_miner_project_name))
    execute(partial(exec_do_all, dir_path=dir_path,
                    project_name=bug_miner_project_name),
            projects.keys(), num_processes)
    execute(partial(exec_training_set, dir_path=dir_path,
                    project_name=bug_miner_project_name),
            projects.keys(), num_processes)
    execute(partial(exec_experiment, dir_path=dir_path),
            projects.keys(), num_processes)
    Experiment(DirStructure(dir_path)).experiment()
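# Hedged sketch: one possible shape of the execute() helper used above, assuming it
# fans a one-argument callable out over the project keys with a process pool. This
# illustrates the pattern only; it is not the project's actual implementation.
from multiprocessing import Pool

def execute_sketch(func, keys, num_processes):
    with Pool(num_processes) as pool:
        # func is typically a functools.partial with dir_path / project_name bound,
        # so each worker receives only the project key
        return pool.map(func, list(keys))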
class ExperimentResults(object):
    def __init__(self, dir_name):
        self.dir_name = dir_name
        self.dir_structure = DirStructure(dir_name)
        self.ids = self.dir_structure.get_marked_ids()

    def results(self, num_procesess=10):
        # map(lambda id: FeatureExtraction(DirId(self.dir_structure, id)).get_training_set(), self.ids)
        main_cmds(num_procesess,
                  list(map(lambda x: [sys.executable, "experiment.py", self.dir_name, x],
                           self.ids)))
        Experiment(self.dir_structure).experiment()
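# Hedged sketch: a sequential stand-in for main_cmds(), showing the command line that
# results() builds for every marked id ([python, experiment.py, <dir_name>, <id>]).
# Assumes experiment.py accepts those two positional arguments, as implied above.
import subprocess
import sys

def run_experiments_sequentially(dir_name, ids):
    for marked_id in ids:
        subprocess.check_call([sys.executable, "experiment.py", dir_name, marked_id])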
def read_bug_miner_csv(dir_path, project_name):
    csv_path = os.path.join(BugMinerReproducer.BUG_MINER_DIR, project_name + ".csv")
    df = pd.read_csv(csv_path)
    ans = dict()
    commits = dict()
    list(map(lambda x: commits.setdefault(x['parent'], []).append(x),
             list(map(lambda y: y[1].to_dict(), df.iterrows()))))
    for bug_data in commits:
        ans[bug_data] = BugMinerReproducer(
            bug_data,
            list(set(map(lambda x: x['testcase'], commits[bug_data]))),
            DirId(DirStructure(dir_path), bug_data),
            os.path.join(BugMinerReproducer.BUG_MINER_REPOS_DIR, project_name),
            list(set(map(lambda x: x['diff'], commits[bug_data]))),
            list(set(map(lambda x: x['blamed_components'], commits[bug_data]))),
            list(set(map(lambda x: x['commit'], commits[bug_data]))))
    return ans
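# Hedged sketch: the same "group rows by parent commit" step done with a pandas
# groupby instead of map/setdefault, for readability. Column names come from the
# code above; equivalence assumes every row has a non-null 'parent' value.
import pandas as pd

def group_rows_by_parent(csv_path):
    df = pd.read_csv(csv_path)
    # {parent_commit: [row_dict, ...]}, the same shape as the commits dict above
    return {parent: group.to_dict('records') for parent, group in df.groupby('parent')}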
def __init__(self, dir_name):
    self.dir_name = dir_name
    self.dir_structure = DirStructure(dir_name)
    self.ids = self.dir_structure.get_marked_ids()
        list(
            filter(lambda x: x[0] == x[1], zip(test_commits, function_commits))))
        REMOVED = '- '
        ADDED = '+ '
        UNCHANGED = ' '
        NOT_IN_INPUT = '? '

        def count_type(diff, diff_type):
            return len(list(filter(lambda d: d.startswith(diff_type), diff)))

        test_str = "".join(list(map(str, test_commits)))
        function_str = "".join(list(map(str, function_commits)))
        diff = list(difflib.ndiff(test_str, function_str))
        seq_match = difflib.SequenceMatcher(None, test_str, function_str)
        match = seq_match.find_longest_match(0, len(test_commits),
                                             0, len(function_commits))
        self.features["commits_removed"] = count_type(diff, REMOVED)
        self.features["commits_added"] = count_type(diff, ADDED)
        self.features["commits_unchanged"] = count_type(diff, UNCHANGED)
        self.features["commits_substring"] = match.size
        self.features["commits_ratio"] = seq_match.ratio()


if __name__ == "__main__":
    # FeatureExtraction(DirId(DirStructure(r"C:\amirelm\component_importnace\data\d4j_lang12"), sys.argv[1])).extract()
    FeatureExtraction(
        DirId(DirStructure(r"C:\amirelm\component_importnace\data\d4j_lang12"),
              sys.argv[1])).get_training_set(
        DirStructure(r"C:\amirelm\component_importnace\data\d4j_lang12"))
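# Hedged sketch: the diff-based commit features above on a toy example, to make the
# '- ' / '+ ' / ' ' prefix counting concrete. The inputs are invented strings, not
# real commit sequences.
import difflib

def diff_features(test_str, function_str):
    diff = list(difflib.ndiff(test_str, function_str))
    seq_match = difflib.SequenceMatcher(None, test_str, function_str)
    match = seq_match.find_longest_match(0, len(test_str), 0, len(function_str))
    return {
        "removed": len([d for d in diff if d.startswith('- ')]),
        "added": len([d for d in diff if d.startswith('+ ')]),
        "unchanged": len([d for d in diff if d.startswith(' ')]),
        "longest_match": match.size,
        "ratio": seq_match.ratio(),
    }

# e.g. diff_features("abcd", "abed") counts one removed ('c'), one added ('e'),
# three unchanged characters, a longest common block of size 2, and ratio 0.75.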
def cross_validation(self):
    import sklearn.metrics as metrics
    from sklearn.metrics import get_scorer
    scores_names = [
        'accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
        'average_precision', 'completeness_score', 'f1', 'f1_macro', 'f1_micro',
        'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score',
        'mutual_info_score', 'neg_log_loss', 'normalized_mutual_info_score',
        'precision', 'precision_macro', 'precision_micro', 'precision_weighted',
        'recall', 'recall_macro', 'recall_micro', 'recall_weighted', 'roc_auc',
        'v_measure_score'
    ]
    metrics_functions = [
        metrics.cohen_kappa_score, metrics.hinge_loss, metrics.matthews_corrcoef,
        metrics.accuracy_score, metrics.f1_score, metrics.hamming_loss,
        metrics.log_loss, metrics.precision_score, metrics.recall_score,
        metrics.zero_one_loss, metrics.average_precision_score,
        metrics.roc_auc_score
    ]

    def pr_auc_score(y_true, y_score):
        """
        Generates the Area Under the Curve for precision and recall.
        """
        precision, recall, thresholds = \
            metrics.precision_recall_curve(y_true, y_score[:, 1])
        return metrics.auc(recall, precision, reorder=True)

    pr_auc_scorer = metrics.make_scorer(pr_auc_score,
                                        greater_is_better=True,
                                        needs_proba=True)
    scoring = {x: get_scorer(x) for x in scores_names}
    scoring.update(
        {x.__name__: metrics.make_scorer(x) for x in metrics_functions})
    # scoring["pr_auc"] = pr_auc_scorer

    def tn(y_true, y_pred):
        return metrics.confusion_matrix(y_true, y_pred)[0, 0]

    def fp(y_true, y_pred):
        return metrics.confusion_matrix(y_true, y_pred)[0, 1]

    def fn(y_true, y_pred):
        return metrics.confusion_matrix(y_true, y_pred)[1, 0]

    def tp(y_true, y_pred):
        return metrics.confusion_matrix(y_true, y_pred)[1, 1]

    def cost(y_true, y_pred, fp_cost=1, fn_cost=1):
        return fp(y_true, y_pred) * fp_cost + fn(y_true, y_pred) * fn_cost

    def mean_squared_error_cost(true_value, pred_value, fp_cost=1, fn_cost=1):
        # fp is true_value=true and pred_value>0.5
        # fn is true_value=false and pred_value<0.5
        from numpy import mean
        squares = []
        for t, p in zip(true_value, pred_value):
            diff = (t - p) ** 2
            if t:
                diff *= fp_cost
            else:
                diff *= fn_cost
            squares.append(diff)
        return mean(squares)

    def mse(y_true, y_pred):
        return min(
            metrics.mean_squared_error(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][1 if x[1] else 0], zip(y_pred, y_true)))),
            metrics.mean_squared_error(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][0 if x[1] else 1], zip(y_pred, y_true)))))

    def mse_cost(y_true, y_pred, fp_cost=1, fn_cost=1):
        return min(
            mean_squared_error_cost(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][1 if x[1] else 0], zip(y_pred, y_true))),
                fp_cost=fp_cost, fn_cost=fn_cost),
            mean_squared_error_cost(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][0 if x[1] else 1], zip(y_pred, y_true))),
                fp_cost=fp_cost, fn_cost=fn_cost))

    def mse1(y_true, y_pred):
        return max(
            metrics.mean_squared_error(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][1 if x[1] else 0], zip(y_pred, y_true)))),
            metrics.mean_squared_error(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][0 if x[1] else 1], zip(y_pred, y_true)))))

    def mse_cost1(y_true, y_pred, fp_cost=1, fn_cost=1):
        return max(
            mean_squared_error_cost(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][1 if x[1] else 0], zip(y_pred, y_true))),
                fp_cost=fp_cost, fn_cost=fn_cost),
            mean_squared_error_cost(
                [1 if x else 0 for x in y_true],
                list(map(lambda x: x[0][0 if x[1] else 1], zip(y_pred, y_true))),
                fp_cost=fp_cost, fn_cost=fn_cost))

    scoring.update({
        'tp': metrics.make_scorer(tp),
        'tn': metrics.make_scorer(tn),
        'fp': metrics.make_scorer(fp),
        'fn': metrics.make_scorer(fn)
    })
    scoring.update({
        "cost_{0}_{1}".format(*x): metrics.make_scorer(cost, fp_cost=x[0], fn_cost=x[1])
        for x in product(range(1, 4), range(1, 4))
    })
    # scoring.update({"mse_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost, fp_cost=x[0], fn_cost=x[1], needs_proba=True)
    #                 for x in product(range(1, 4), range(1, 4))})
    # scoring.update({"mse1_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost1, fp_cost=x[0], fn_cost=x[1], needs_proba=True)
    #                 for x in product(range(1, 4), range(1, 4))})
    # scoring["mse"] = metrics.make_scorer(mse, needs_proba=True)
    # scoring["mse1"] = metrics.make_scorer(mse1, needs_proba=True)
    scores = cross_validate(self.get_classifier(),
                            self.get_training_featues(),
                            self.get_training_labels(),
                            cv=3,
                            scoring=scoring,
                            return_train_score=True)
    all_scores = dict()
    for score in scores:
        all_scores["{0}_mean".format(score)] = scores[score].mean()
        all_scores["{0}_std".format(score)] = scores[score].std()
    with open(os.path.join(
            DirStructure.mkdir(self.get_dir_id().classification_metrics),
            self.get_name()), "w") as f:
        json.dump(all_scores, f)
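# Hedged sketch (standalone toy, not the project's pipeline): the same pattern as
# cross_validation() above -- wrapping plain metric functions with make_scorer and
# passing them as a scoring dict to cross_validate. The classifier and dataset here
# are placeholders chosen only to keep the example self-contained.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import cross_validate

def false_positives(y_true, y_pred):
    # negatives predicted as positives
    return confusion_matrix(y_true, y_pred)[0, 1]

def false_negatives(y_true, y_pred):
    # positives predicted as negatives
    return confusion_matrix(y_true, y_pred)[1, 0]

X, y = make_classification(n_samples=200, random_state=0)
scoring = {
    "accuracy": "accuracy",
    "fp": make_scorer(false_positives),
    "fn": make_scorer(false_negatives),
}
scores = cross_validate(LogisticRegression(max_iter=1000), X, y,
                        cv=3, scoring=scoring, return_train_score=True)
# scores holds per-fold arrays such as scores["test_fp"], which the code above
# then collapses into <name>_mean / <name>_std entries.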
    with open(self.dir_id.experiments) as f:
        return json.loads(f.read())

@staticmethod
def experiment_classifiers(dir_id):
    from sanity_classify import SanityClassify, StaticClassify, RandomClassify, DoubleSanityClassify
    from learning_classify import LearningClassify
    experiment_matrix = ExperimentMatrix(dir_id)
    list(map(experiment_matrix.add_classifer,
             SanityClassify.get_all_sanity_classifers(dir_id)))
    list(map(experiment_matrix.add_classifer,
             StaticClassify.get_all_static_classifers(dir_id)))
    list(map(experiment_matrix.add_classifer,
             RandomClassify.get_all_random_classifers(dir_id)))
    # list(map(experiment_matrix.add_classifer, DoubleSanityClassify.get_all_double_classifers(dir_id)))
    list(map(experiment_matrix.add_classifer,
             LearningClassify.get_all_classifers(dir_id)))
    experiment_matrix.experiment()


if __name__ == "__main__":
    # ExperimentMatrix.experiment_classifiers(DirId(DirStructure(r"C:\amirelm\component_importnace\data\maven_3"), sys.argv[1]))
    # exit()
    ExperimentMatrix.experiment_classifiers(
        DirId(DirStructure(sys.argv[1]), sys.argv[2]))
    # Experiment(DirStructure(r"Z:\component_importance\TIKA")).experiment()
    # pass
def read_commit_db(dir_path, project='Lang'):
    d4j = json.load(open(D4JReproducer.D4J_JSON))
    project_data = dict(list(map(
        lambda x: (str(x['bugId']), x),
        filter(lambda x: x['project'].lower() == project.lower(), d4j))))
    projects = []
    with open(os.path.join(D4JReproducer.D4J_DIR, project, "commit-db")) as f:
        for id, buggy, fixed, bug_key, bug_url in csv.reader(f):
            failing_tests = list(map(
                lambda x: "{0}.{1}".format(x['className'], x['methodName']).strip().lower(),
                project_data[id]['failingTests']))
            projects.append(D4JReproducer.project_class(project)(
                id, fixed, failing_tests, DirId(DirStructure(dir_path), id)))
    return projects
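# Hedged sketch: the five-column commit-db layout that the csv.reader unpacking above
# assumes (bug id, buggy revision, fixed revision, bug key, bug report URL). The row
# below is invented (hypothetical hashes and issue key) purely to show the shape.
import csv
import io

sample_commit_db = io.StringIO(
    "1,aaaaaaa,bbbbbbb,PROJ-123,https://issues.example.org/browse/PROJ-123\n")
for bug_id, buggy, fixed, bug_key, bug_url in csv.reader(sample_commit_db):
    pass  # each row unpacks into the same five fields read_commit_db() expects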
def experiment_execute(project, dir_path):
    ExperimentMatrix.experiment_classifiers(DirId(DirStructure(dir_path), project))