def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
    """Set up databases, feature extraction and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
        granularity: ``"label"`` or ``"group"``; picks the scheduling
            database, its evaluation databases and the extra feature
            extractors. Any other value adds none of them.
        failures_skip: Stored as-is on the instance.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.failures_skip = failures_skip

    is_label = granularity == "label"
    is_group = granularity == "group"

    # Databases used for training and, per training DB, for evaluation.
    commits_db = repository.COMMITS_DB
    self.training_dbs = [commits_db]
    self.eval_dbs[commits_db] = (
        commits_db,
        repository.COMMIT_EXPERIENCES_DB,
    )
    if is_label:
        scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif is_group:
        scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [test_scheduling_features.prev_failures()]
    if is_label:
        # NOTE: test_scheduling_features.chunk() is disabled here.
        extractors.extend(
            [
                test_scheduling_features.platform(),
                test_scheduling_features.suite(),
            ]
        )
    elif is_group:
        extractors.extend(
            [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]
        )

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, []),
    )
    union_step = (
        "union",
        ColumnTransformer([("data", DictVectorizer(), "data")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, interpretable=False):
    """Set up the required database, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
        interpretable: When falsy, a text vectorizer over the ``"desc"``
            column is added next to the ``"data"`` dict vectorizer.
    """
    CommitModel.__init__(self, lemmatization)

    self.required_dbs.append(BUG_INTRODUCING_COMMITS_DB)
    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The "data" column is always vectorized; "desc" only in the
    # non-interpretable configuration.
    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_column = ("desc", self.text_vectorizer(min_df=0.0001), "desc")
        columns.append(desc_column)

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, cleaners),
    )
    union_step = ("union", ColumnTransformer(columns))
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization: bool = False) -> None:
    """Set up the training databases, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.training_dbs += [bugzilla.BUGS_DB]
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.reviewers_num(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, cleaners),
    )
    # Both the structured "data" column and the textual "desc" column
    # are vectorized.
    columns = [
        ("data", DictVectorizer(), "data"),
        ("desc", self.text_vectorizer(min_df=0.0001), "desc"),
    ]
    union_step = ("union", ColumnTransformer(columns))
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the sampler, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.file_size(),
        commit_features.test_added(),
        commit_features.added(),
        commit_features.deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.files(),
        commit_features.files_modified_num(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, cleaners),
    )
    # Both the structured "data" column and the textual "desc" column
    # are vectorized.
    columns = [
        ("data", DictVectorizer(), "data"),
        ("desc", self.text_vectorizer(), "desc"),
    ]
    union_step = ("union", ColumnTransformer(columns))
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, granularity="label"):
    """Set up required databases, feature extraction and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
        granularity: ``"label"`` or ``"group"``; picks the scheduling
            database and the extra feature extractors. Any other value
            adds neither.
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity

    is_label = granularity == "label"
    is_group = granularity == "group"

    self.required_dbs = [repository.COMMITS_DB]
    if is_label:
        self.required_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
    elif is_group:
        self.required_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    # Extractors shared by both granularities.
    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        test_scheduling_features.name(),
        test_scheduling_features.prev_failures(),
    ]
    if is_label:
        extractors.extend(
            [
                test_scheduling_features.platform(),
                test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        )
    elif is_group:
        extractors.extend(
            [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
            ]
        )

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, []),
    )
    union_step = (
        "union",
        ColumnTransformer([("data", DictVectorizer(), "data")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up the training database, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
    self.sampler = RandomUnderSampler(random_state=0)

    # NOTE: author_experience, reviewer_experience, component_touched_prev,
    # directory_touched_prev and file_touched_prev are disabled here.
    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.reviewers_num(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
    ]

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, []),
    )
    union_step = (
        "union",
        ColumnTransformer([("data", DictVectorizer(), "data")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False):
    """Set up required databases, feature extraction and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
    """
    Model.__init__(self, lemmatization)

    self.required_dbs = [
        repository.COMMITS_DB,
        test_scheduling.TEST_SCHEDULING_DB,
    ]

    self.calculate_importance = False
    self.cross_validation_enabled = False

    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        test_scheduling_features.name(),
        test_scheduling_features.platform(),
        test_scheduling_features.chunk(),
        test_scheduling_features.suite(),
        test_scheduling_features.prev_failures(),
    ]

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, []),
    )
    union_step = (
        "union",
        ColumnTransformer([("data", DictVectorizer(), "data")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, bug_data=False):
    """Set up the sampler, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
        bug_data: Forwarded to ``CommitModel.__init__``; when truthy, the
            ``bug_features`` extractors are appended to the commit ones.
    """
    CommitModel.__init__(self, lemmatization, bug_data)

    self.calculate_importance = False
    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.components(),
        commit_features.directories(),
        commit_features.files(),
    ]
    if bug_data:
        extractors.extend(
            [
                bug_features.product(),
                bug_features.component(),
                bug_features.severity(),
                bug_features.priority(),
                bug_features.has_crash_signature(),
                bug_features.has_regression_range(),
                bug_features.whiteboard(),
                bug_features.keywords(),
                bug_features.number_of_bug_dependencies(),
                bug_features.blocked_bugs_number(),
            ]
        )

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, cleaners),
    )
    # Both the structured "data" column and the textual "desc" column
    # are vectorized.
    columns = [
        ("data", DictVectorizer(), "data"),
        ("desc", self.text_vectorizer(), "desc"),
    ]
    union_step = ("union", ColumnTransformer(columns))
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(
    self,
    lemmatization: bool = False,
    interpretable: bool = True,
    use_finder: bool = False,
    exclude_finder: bool = True,
    finder_regressions_only: bool = False,
) -> None:
    """Set up training databases, feature pipeline and classifier.

    Args:
        lemmatization: Forwarded unchanged to ``CommitModel.__init__``.
        interpretable: When falsy, a text vectorizer over the ``"desc"``
            column is added next to the ``"data"`` dict vectorizer.
        use_finder: Stored on the instance.
        exclude_finder: Stored on the instance.
        finder_regressions_only: Stored on the instance; when truthy,
            ``BUG_FIXING_COMMITS_DB`` is also added to the training
            databases.

    Raises:
        AssertionError: If ``use_finder ^ exclude_finder`` is falsy
            (only while assertions are enabled).
    """
    CommitModel.__init__(self, lemmatization)

    self.training_dbs += [BUG_INTRODUCING_COMMITS_DB, bugzilla.BUGS_DB]
    if finder_regressions_only:
        self.training_dbs.append(BUG_FIXING_COMMITS_DB)

    self.store_dataset = True
    self.sampler = RandomUnderSampler(random_state=0)

    self.use_finder = use_finder
    self.exclude_finder = exclude_finder
    finder_flags_ok = use_finder ^ exclude_finder
    assert (
        finder_flags_ok
    ), "Using both use_finder and exclude_finder option does not make a lot of sense"
    self.finder_regressions_only = finder_regressions_only

    extractors = [
        commit_features.source_code_file_size(),
        commit_features.other_file_size(),
        commit_features.test_file_size(),
        commit_features.source_code_added(),
        commit_features.other_added(),
        commit_features.test_added(),
        commit_features.source_code_deleted(),
        commit_features.other_deleted(),
        commit_features.test_deleted(),
        commit_features.author_experience(),
        commit_features.reviewer_experience(),
        commit_features.reviewers_num(),
        commit_features.component_touched_prev(),
        commit_features.directory_touched_prev(),
        commit_features.file_touched_prev(),
        commit_features.types(),
        commit_features.files(),
        commit_features.components(),
        commit_features.components_modified_num(),
        commit_features.directories(),
        commit_features.directories_modified_num(),
        commit_features.source_code_files_modified_num(),
        commit_features.other_files_modified_num(),
        commit_features.test_files_modified_num(),
        commit_features.functions_touched_num(),
        commit_features.functions_touched_size(),
        commit_features.source_code_file_metrics(),
    ]

    cleaners = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    # The "data" column is always vectorized; "desc" only in the
    # non-interpretable configuration.
    columns = [("data", DictVectorizer(), "data")]
    if not interpretable:
        desc_column = ("desc", self.text_vectorizer(min_df=0.0001), "desc")
        columns.append(desc_column)

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, cleaners),
    )
    union_step = ("union", ColumnTransformer(columns))
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, granularity="label", use_subset=False):
    """Set up databases, feature extraction and the classifier.

    Args:
        lemmatization: Forwarded unchanged to ``Model.__init__``.
        granularity: ``"label"`` or ``"group"``; picks the scheduling
            database, its evaluation databases and the extra feature
            extractors. Any other value adds none of them.
        use_subset: Stored on the instance. Meant for development: it
            avoids using too much memory by working on a subset of the
            dataset (dropping some passing runnables).
    """
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.use_subset = use_subset

    is_label = granularity == "label"
    is_group = granularity == "group"

    # Databases used for training and, per training DB, for evaluation.
    commits_db = repository.COMMITS_DB
    self.training_dbs = [commits_db]
    self.eval_dbs[commits_db] = (
        commits_db,
        repository.COMMIT_EXPERIENCES_DB,
    )
    if is_label:
        scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif is_group:
        scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        self.training_dbs.append(scheduling_db)
        self.eval_dbs[scheduling_db] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    extractors = [test_scheduling_features.prev_failures()]
    if is_label:
        # NOTE: test_scheduling_features.chunk() is disabled here.
        extractors.extend(
            [
                test_scheduling_features.platform(),
                test_scheduling_features.suite(),
            ]
        )
    elif is_group:
        extractors.extend(
            [
                commit_features.source_code_files_modified_num(),
                commit_features.other_files_modified_num(),
                commit_features.test_files_modified_num(),
                commit_features.source_code_file_size(),
                commit_features.other_file_size(),
                commit_features.test_file_size(),
                commit_features.source_code_added(),
                commit_features.other_added(),
                commit_features.test_added(),
                commit_features.source_code_deleted(),
                commit_features.other_deleted(),
                commit_features.test_deleted(),
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]
        )

    extractor_step = (
        "commit_extractor",
        commit_features.CommitExtractor(extractors, []),
    )
    union_step = (
        "union",
        ColumnTransformer([("data", DictVectorizer(), "data")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")