Example #1
 def experiment(self):
     DirStructure.mkdir(self.dir_id.experiment_matrices)
     if not os.path.exists(self.dir_id.matrices):
         return
     if not self.bugs:
         return
     if not os.path.exists(self.dir_id.experiments):
         results = dict()
         with open(self.dir_id.matrices) as f:
             json_matrix = json.loads(f.read())
         for alpha in ExperimentMatrix.ALPHA_RANGE:
             for matrix_name, influence_data in self.generate_influence_data(
                     alpha):
                 matrix = copy.deepcopy(json_matrix)
                 matrix.update(influence_data)
                 with open(
                         os.path.join(self.dir_id.experiment_matrices,
                                      matrix_name + str(alpha)), "w") as f:
                     json.dump(matrix, f)
                 ei = read_json_planning_instance(matrix)
                 ei.diagnose()
                results.setdefault(matrix_name, dict())[alpha] = Diagnosis_Results(
                    ei.diagnoses, ei.initial_tests, ei.error).metrics
         with open(self.dir_id.experiments, "w") as f:
             json.dump(results, f)
     with open(self.dir_id.experiments) as f:
         return json.loads(f.read())
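A minimal standalone check of the deepcopy-then-update pattern used above: each alpha gets its own copy of the base matrix, so influence data never leaks between runs. The dictionary contents here are made up for illustration.

import copy

base_matrix = {"components": [0, 1], "influence": None}  # illustrative base matrix
variant = copy.deepcopy(base_matrix)
variant.update({"influence": {"0": 0.5}})  # stands in for one influence_data entry

assert base_matrix["influence"] is None  # the base matrix is untouched
assert variant["influence"] == {"0": 0.5}  # only the copy carries the update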
Example #2
 def call_graph(self):
     DirStructure.mkdir(self.get_dir_id().call_graphs)
     for trace in self.optimized_traces:
         g = networkx.DiGraph()
         g.add_edges_from(
             self.optimized_traces[trace].get_call_graph_edges())
         networkx.write_gexf(
             g, os.path.join(self.get_dir_id().call_graphs,
                             trace + ".gexf"))
Example #3
 def read_bugs_json(project, dir_path=None, index='3'):
     if dir_path is None:
         dir_path = os.path.join(r"C:\amirelm\component_importnace\data",
                                 project + "_" + index)
         if not os.path.exists(dir_path):
             os.mkdir(dir_path)
     assert dir_path
     bugs = list(
         filter(lambda b: b['project'].lower() == project.lower(),
                json.load(open(BugDotJar.BugDotJar_JSON))))
     projects = []
     for bug in bugs:
         commit = bug['commit']
         # Normalize entries like "testMethod(package.ClassName): message"
         # to "package.classname.testmethod".
         failing_tests = list(
             map(
                 lambda f: ".".join(
                     reversed(f.split()[0].split(':')[0].replace(')', '').replace(
                         '#', '.').split('('))).lower(), bug['failing_tests']))
         projects.append(
             BugDotJar(
                 bug['jira_id'], failing_tests,
                 DirId(DirStructure(dir_path),
                       bug['jira_id'] + "_" + commit), commit, project))
     return projects
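For reference, a standalone illustration of the failing-test normalization above, using the same expression on a made-up Bug.jar-style entry.

raw = "testLocaleIndependence(org.apache.commons.codec.language.SoundexTest): expected:<1> but was:<2>"  # made-up entry
normalized = ".".join(
    reversed(raw.split()[0].split(':')[0].replace(')', '').replace(
        '#', '.').split('('))).lower()
print(normalized)  # org.apache.commons.codec.language.soundextest.testlocaleindependence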
Example #4
 def trace(self, trace_failed=False):
     repo = Repo(self.get_dir_id().clones)
     DirStructure.mkdir(self.get_dir_id().traces)
     if self.test_traces:
         return
     tests_to_run = list(
         map(lambda t: ".".join(t.split('.')[:5]) + '*',
             self.failing_tests))
     tests = tests_to_run if trace_failed else None
     self.clear()
     traces = list(
         repo.run_under_jcov(self.get_dir_id().traces,
                             False,
                             instrument_only_methods=True,
                             short_type=True,
                             tests_to_run=tests,
                             check_comp_error=False))
     self.test_traces = dict(list(map(lambda t: (t.test_name, t), traces)))
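For reference, the tests_to_run mapping above keeps only the first five dot-separated parts of a fully qualified test name and appends a wildcard. A small standalone check with a made-up test name:

test_name = "org.apache.commons.lang3.StringUtilsTest.testAbbreviate"  # made-up test name
pattern = ".".join(test_name.split('.')[:5]) + '*'
print(pattern)  # org.apache.commons.lang3.StringUtilsTest*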
def full_experiment(bug_miner_project_name, dir_base_path=DIR_BASE_PATH, num_processes=NUM_PROCCESSES):
    dir_path = os.path.join(dir_base_path, bug_miner_project_name)
    projects = BugMinerReproducer.read_bug_miner_csv(dir_path, bug_miner_project_name)
    git_path, jira_path = list(filter(lambda x: os.path.basename(x[1]) == bug_miner_project_name, projects_names.values()))[0]
    os.system("git clone {0} repos\\{1}".format(git_path, bug_miner_project_name))
    execute(partial(exec_do_all, dir_path=dir_path, project_name=bug_miner_project_name), projects.keys(), num_processes)
    execute(partial(exec_training_set, dir_path=dir_path, project_name=bug_miner_project_name), projects.keys(), num_processes)
    execute(partial(exec_experiment, dir_path=dir_path), projects.keys(), num_processes)
    Experiment(DirStructure(dir_path)).experiment()
class ExperimentResults(object):
    def __init__(self, dir_name):
        self.dir_name = dir_name
        self.dir_structure = DirStructure(dir_name)
        self.ids = self.dir_structure.get_marked_ids()

    def results(self, num_processes=10):
        # map(lambda id: FeatureExtraction(DirId(self.dir_structure, id)).get_training_set(), self.ids)
        main_cmds(num_processes, list(map(lambda x: [sys.executable, "experiment.py", self.dir_name, x], self.ids)))
        Experiment(self.dir_structure).experiment()
Example #7
 def read_bug_miner_csv(dir_path, project_name):
     csv_path = os.path.join(BugMinerReproducer.BUG_MINER_DIR,
                             project_name + ".csv")
     df = pd.read_csv(csv_path)
     ans = dict()
     commits = dict()
     # Group the CSV rows by their parent commit hash.
     for _, row in df.iterrows():
         record = row.to_dict()
         commits.setdefault(record['parent'], []).append(record)
     for bug_data in commits:
         ans[bug_data] = BugMinerReproducer(
             bug_data,
             list(set(map(lambda x: x['testcase'], commits[bug_data]))),
             DirId(DirStructure(dir_path), bug_data),
             os.path.join(BugMinerReproducer.BUG_MINER_REPOS_DIR,
                          project_name),
             list(set(map(lambda x: x['diff'], commits[bug_data]))),
             list(
                 set(
                     map(lambda x: x['blamed_components'],
                         commits[bug_data]))),
             list(set(map(lambda x: x['commit'], commits[bug_data]))))
     return ans
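The per-parent grouping in read_bug_miner_csv could also be expressed with pandas' groupby. A sketch under the assumption that the CSV exposes the same 'parent' and 'testcase' columns; the path is hypothetical.

import pandas as pd

df = pd.read_csv("some_project.csv")  # hypothetical BugMiner CSV
for parent, group in df.groupby('parent'):
    rows = group.to_dict('records')  # same per-commit dicts as commits[parent] above
    failing_tests = sorted(set(r['testcase'] for r in rows))
    print(parent, len(rows), failing_tests[:3])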
 def __init__(self, dir_name):
     self.dir_name = dir_name
     self.dir_structure = DirStructure(dir_name)
     self.ids = self.dir_structure.get_marked_ids()
Example #9
            list(
                filter(lambda x: x[0] == x[1],
                       zip(test_commits, function_commits))))
        REMOVED = '- '
        ADDED = '+ '
        UNCHANGED = '  '
        NOT_IN_INPUT = '? '

        def count_type(diff, diff_type):
            return len(list(filter(lambda d: d.startswith(diff_type), diff)))

        test_str = "".join(list(map(str, test_commits)))
        function_str = "".join(list(map(str, function_commits)))
        diff = list(difflib.ndiff(test_str, function_str))
        seq_match = difflib.SequenceMatcher(None, test_str, function_str)
        match = seq_match.find_longest_match(0, len(test_commits), 0,
                                             len(function_commits))
        self.features["commits_removed"] = count_type(diff, REMOVED)
        self.features["commits_added"] = count_type(diff, ADDED)
        self.features["commits_unchanged"] = count_type(diff, UNCHANGED)
        self.features["commits_substring"] = match.size
        self.features["commits_ratio"] = seq_match.ratio()


if __name__ == "__main__":
    # FeatureExtraction(DirId(DirStructure(r"C:\amirelm\component_importnace\data\d4j_lang12"), sys.argv[1])).extract()
    FeatureExtraction(
        DirId(DirStructure(r"C:\amirelm\component_importnace\data\d4j_lang12"),
              sys.argv[1])).get_training_set(
                  DirStructure(
                      r"C:\amirelm\component_importnace\data\d4j_lang12"))
    def cross_validation(self):
        from itertools import product
        import sklearn.metrics as metrics
        from sklearn.metrics import get_scorer
        from sklearn.model_selection import cross_validate
        scores_names = [
            'accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score',
            'average_precision', 'completeness_score', 'f1', 'f1_macro',
            'f1_micro', 'f1_weighted', 'fowlkes_mallows_score',
            'homogeneity_score', 'mutual_info_score', 'neg_log_loss',
            'normalized_mutual_info_score', 'precision', 'precision_macro',
            'precision_micro', 'precision_weighted', 'recall', 'recall_macro',
            'recall_micro', 'recall_weighted', 'roc_auc', 'v_measure_score'
        ]
        metrics_functions = [
            metrics.cohen_kappa_score, metrics.hinge_loss,
            metrics.matthews_corrcoef, metrics.accuracy_score,
            metrics.f1_score, metrics.hamming_loss, metrics.log_loss,
            metrics.precision_score, metrics.recall_score,
            metrics.zero_one_loss, metrics.average_precision_score,
            metrics.roc_auc_score
        ]

        def pr_auc_score(y_true, y_score):
            """
            Generates the Area Under the Curve for precision and recall.
            """
            precision, recall, thresholds = \
                metrics.precision_recall_curve(y_true, y_score[:, 1])
            return metrics.auc(recall, precision, reorder=True)

        pr_auc_scorer = metrics.make_scorer(pr_auc_score,
                                            greater_is_better=True,
                                            needs_proba=True)
        scoring = {x: get_scorer(x) for x in scores_names}
        scoring.update(
            {x.__name__: metrics.make_scorer(x)
             for x in metrics_functions})

        # scoring["pr_auc"] = pr_auc_scorer

        def tn(y_true, y_pred):
            return metrics.confusion_matrix(y_true, y_pred)[0, 0]

        def fp(y_true, y_pred):
            return metrics.confusion_matrix(y_true, y_pred)[0, 1]

        def fn(y_true, y_pred):
            return metrics.confusion_matrix(y_true, y_pred)[1, 0]

        def tp(y_true, y_pred):
            return metrics.confusion_matrix(y_true, y_pred)[1, 1]

        def cost(y_true, y_pred, fp_cost=1, fn_cost=1):
            return fp(y_true, y_pred) * fp_cost + fn(y_true, y_pred) * fn_cost

        def mean_squared_error_cost(true_value,
                                    pred_value,
                                    fp_cost=1,
                                    fn_cost=1):
            # Errors on samples whose true value is positive are weighted by
            # fp_cost; errors on negative samples are weighted by fn_cost.
            from numpy import mean
            squares = []
            for t, p in zip(true_value, pred_value):
                diff = (t - p)**2
                if t:
                    diff *= fp_cost
                else:
                    diff *= fn_cost
                squares.append(diff)
            return mean(squares)
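
        # Worked example (illustrative values): for true_value=[1, 0],
        # pred_value=[0.8, 0.3], fp_cost=2, fn_cost=1 the squared errors are
        # [(1 - 0.8) ** 2 * 2, (0 - 0.3) ** 2 * 1] = [0.08, 0.09],
        # so mean_squared_error_cost returns their mean, 0.085.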

        def mse(y_true, y_pred):
            labels = [1 if x else 0 for x in y_true]
            return min(
                metrics.mean_squared_error(
                    labels, [p[1 if t else 0] for p, t in zip(y_pred, y_true)]),
                metrics.mean_squared_error(
                    labels, [p[0 if t else 1] for p, t in zip(y_pred, y_true)]))

        def mse_cost(y_true, y_pred, fp_cost=1, fn_cost=1):
            labels = [1 if x else 0 for x in y_true]
            return min(
                mean_squared_error_cost(
                    labels, [p[1 if t else 0] for p, t in zip(y_pred, y_true)],
                    fp_cost=fp_cost, fn_cost=fn_cost),
                mean_squared_error_cost(
                    labels, [p[0 if t else 1] for p, t in zip(y_pred, y_true)],
                    fp_cost=fp_cost, fn_cost=fn_cost))

        def mse1(y_true, y_pred):
            labels = [1 if x else 0 for x in y_true]
            return max(
                metrics.mean_squared_error(
                    labels, [p[1 if t else 0] for p, t in zip(y_pred, y_true)]),
                metrics.mean_squared_error(
                    labels, [p[0 if t else 1] for p, t in zip(y_pred, y_true)]))

        def mse_cost1(y_true, y_pred, fp_cost=1, fn_cost=1):
            labels = [1 if x else 0 for x in y_true]
            return max(
                mean_squared_error_cost(
                    labels, [p[1 if t else 0] for p, t in zip(y_pred, y_true)],
                    fp_cost=fp_cost, fn_cost=fn_cost),
                mean_squared_error_cost(
                    labels, [p[0 if t else 1] for p, t in zip(y_pred, y_true)],
                    fp_cost=fp_cost, fn_cost=fn_cost))

        scoring.update({
            'tp': metrics.make_scorer(tp),
            'tn': metrics.make_scorer(tn),
            'fp': metrics.make_scorer(fp),
            'fn': metrics.make_scorer(fn)
        })

        scoring.update({
            "cost_{0}_{1}".format(*x): metrics.make_scorer(cost,
                                                           fp_cost=x[0],
                                                           fn_cost=x[1])
            for x in product(range(1, 4), range(1, 4))
        })
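        # The comprehension above registers nine scorers, cost_1_1 through
        # cost_3_3; e.g. cost_2_3 weighs false positives by 2 and false
        # negatives by 3.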
        # scoring.update({"mse_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost, fp_cost=x[0], fn_cost=x[1], needs_proba=True) for x in
        #                 product(range(1, 4), range(1, 4))})

        # scoring.update({"mse1_cost_{0}_{1}".format(*x): metrics.make_scorer(mse_cost1, fp_cost=x[0], fn_cost=x[1], needs_proba=True) for x in
        #                 product(range(1, 4), range(1, 4))})
        #
        # scoring["mse"] = metrics.make_scorer(mse, needs_proba=True)
        # scoring["mse1"] = metrics.make_scorer(mse1, needs_proba=True)
        scores = cross_validate(self.get_classifier(),
                                self.get_training_featues(),
                                self.get_training_labels(),
                                cv=3,
                                scoring=scoring,
                                return_train_score=True)
        all_scores = dict()
        for score in scores:
            all_scores["{0}_mean".format(score)] = scores[score].mean()
            all_scores["{0}_std".format(score)] = scores[score].std()
        with open(
                os.path.join(
                    DirStructure.mkdir(
                        self.get_dir_id().classification_metrics),
                    self.get_name()), "w") as f:
            json.dump(all_scores, f)
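For context, a minimal self-contained version of the make_scorer/cross_validate pattern used above, with a toy dataset and classifier standing in for get_classifier() and the training-set accessors.

from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)  # toy data, not the real feature matrix
scoring = {
    "accuracy": "accuracy",
    "fp": metrics.make_scorer(lambda yt, yp: metrics.confusion_matrix(yt, yp)[0, 1]),
}
scores = cross_validate(DecisionTreeClassifier(random_state=0), X, y,
                        cv=3, scoring=scoring, return_train_score=True)
print({name: values.mean() for name, values in scores.items()})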
Example #11
        with open(self.dir_id.experiments) as f:
            return json.loads(f.read())

    @staticmethod
    def experiment_classifiers(dir_id):
        from sanity_classify import SanityClassify, StaticClassify, RandomClassify, DoubleSanityClassify
        from learning_classify import LearningClassify
        experiment_matrix = ExperimentMatrix(dir_id)
        list(
            map(experiment_matrix.add_classifer,
                SanityClassify.get_all_sanity_classifers(dir_id)))
        list(
            map(experiment_matrix.add_classifer,
                StaticClassify.get_all_static_classifers(dir_id)))
        list(
            map(experiment_matrix.add_classifer,
                RandomClassify.get_all_random_classifers(dir_id)))
        # list(map(experiment_matrix.add_classifer, DoubleSanityClassify.get_all_double_classifers(dir_id)))
        list(
            map(experiment_matrix.add_classifer,
                LearningClassify.get_all_classifers(dir_id)))
        experiment_matrix.experiment()


if __name__ == "__main__":
    # ExperimentMatrix.experiment_classifiers(DirId(DirStructure(r"C:\amirelm\component_importnace\data\maven_3"), sys.argv[1]))
    # exit()
    ExperimentMatrix.experiment_classifiers(
        DirId(DirStructure(sys.argv[1]), sys.argv[2]))
    # Experiment(DirStructure(r"Z:\component_importance\TIKA")).experiment()
    # pass
Example #12
 def read_commit_db(dir_path, project='Lang'):
     d4j = json.load(open(D4JReproducer.D4J_JSON))
     project_data = dict(list(map(lambda x: (str(x['bugId']), x), filter(lambda x: x['project'].lower() == project.lower(), d4j))))
     projects = []
     with open(os.path.join(D4JReproducer.D4J_DIR, project, "commit-db")) as f:
         for id, buggy, fixed, bug_key, bug_url in csv.reader(f):
             failing_tests = list(map(lambda x: "{0}.{1}".format(x['className'], x['methodName']).strip().lower(),
                                 project_data[id]['failingTests']))
             projects.append(D4JReproducer.project_class(project)(id, fixed, failing_tests, DirId(DirStructure(dir_path), id)))
     return projects
def experiment_execute(project, dir_path):
    ExperimentMatrix.experiment_classifiers(DirId(DirStructure(dir_path), project))