def algorithm_analysis():
    """
    Prepares a classifier based on non-validated data, and evaluates its performance in the valdated
    portion of the dataset.
    :return: Dictionary of classifiers
    """

    raw_dataframe = preprocessing.load_original_dataframe()
    issues_train, issues_train_std, priorities_train, issues_test, issues_test_std, priorities_test = get_training_datasets(
        raw_dataframe)

    unfiltered_dataframe = preprocessing.filter_issues_dataframe(raw_dataframe, priority_changer=False)
    filtered_dataframe = preprocessing.filter_issues_dataframe(raw_dataframe, priority_changer=True)

    train, test = train_test_split(unfiltered_dataframe, test_size=0.2, random_state=0)
    test_valid = preprocessing.filter_issues_dataframe(test, priority_changer=True)
    issues_test_valid, issues_test_valid_std, priorities_test_valid = prepare_for_classification(test_valid,
                                                                                                 issues_train)
    issues_valid, issues_valid_std, priorities_valid = prepare_for_classification(filtered_dataframe, issues_train)

    results = []
    estimators = {}
    for algorithm, grid_search in selector.get_algorithms():
        training_set = issues_train_std
        test_set = issues_test_std
        test_valid_set = issues_test_valid_std
        valid_set = issues_valid_std

        print "Current algorithm: ", algorithm

        if algorithm == "RandomForest":
            training_set = issues_train
            test_set = issues_test
            test_valid_set = issues_test_valid
            valid_set = issues_valid

        optimal_estimator, best_params = tuning.parameter_tuning(grid_search, training_set,
                                                                 priorities_train)
        estimators[algorithm] = optimal_estimator

        result = selector.analyse_performance(optimal_estimator, best_params, grid_search, algorithm, training_set,
                                              priorities_train,
                                              test_set,
                                              priorities_test)

        print "Evaluating on the valid portion of the test dataset ..."

        assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator,
                                      issues_test_std=test_valid_set, priority_test=priorities_test_valid)

        print "Evaluating on the complete valid dataset ..."

        assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator,
                                      issues_test_std=valid_set, priority_test=priorities_valid)

        if result:
            results.append(result + ("ALL", len(unfiltered_dataframe.index)))

    selector.write_results("all_experiment_results.csv", results)
    return estimators
def predict_priority():
    """
    Builds a random forest with fixed hyper-parameters and passes it to assigner.train_and_predict
    together with the original dataframe and the encoded, filtered issues.
    :return: None
    """
    # The class label and the forest hyper-parameters below come from earlier experiments.
    class_label = "Severe"
    forest = RandomForestClassifier(n_estimators=51, max_depth=21, random_state=0, n_jobs=-1)

    # Alternative classifier kept for reference:
    # forest = SVC(kernel='linear', C=1.0, class_weight='balanced')

    original_issues = preprocessing.load_original_dataframe()
    filtered_issues = preprocessing.filter_issues_dataframe(original_issues)

    numerical_features = preprocessing.NUMERICAL_FEATURES

    # Only numerical features are encoded: no nominal features and no text feature.
    encoded_issues, encoded_labels = preprocessing.encode_and_split(issues_dataframe=filtered_issues,
                                                                    class_label=class_label,
                                                                    numerical_features=numerical_features,
                                                                    nominal_features=[],
                                                                    text_feature=None)

    assigner.train_and_predict(forest, original_issues, encoded_issues, encoded_labels, class_label,
                               numerical_features, [])
def predict_priority():
    """
    Configures a random forest with fixed settings and delegates training and prediction
    to assigner.train_and_predict.
    :return: None
    """
    # These values are the product of the experiments.
    target_label = "Severe"
    forest_settings = {
        "n_estimators": 51,
        "max_depth": 21,
        "random_state": 0,
        "n_jobs": -1,
    }
    model = RandomForestClassifier(**forest_settings)

    # model = SVC(kernel='linear', C=1.0, class_weight='balanced')

    everything = preprocessing.load_original_dataframe()
    training_rows = preprocessing.filter_issues_dataframe(everything)

    # The training data is encoded using numerical features only.
    encoded = preprocessing.encode_and_split(
        issues_dataframe=training_rows,
        class_label=target_label,
        numerical_features=preprocessing.NUMERICAL_FEATURES,
        nominal_features=[],
        text_feature=None)
    training_issues, training_labels = encoded

    assigner.train_and_predict(model, everything, training_issues,
                               training_labels, target_label,
                               preprocessing.NUMERICAL_FEATURES, [])
Exemplo n.º 4
0
def main():
    original_dataframe = preprocessing.load_original_dataframe()
    issues_dataframe = preprocessing.filter_issues_dataframe(
        original_dataframe)

    # Plotting projects
    figure, axes = plt.subplots(1, 1)
    issues_dataframe['Git Repository'].value_counts(normalize=True).plot(
        kind='bar', ax=axes)
    plt.show()

    issues_dataframe, encoded_priorities = preprocessing.encode_and_split(
        issues_dataframe, preprocessing.CLASS_LABEL,
        preprocessing.NUMERICAL_FEATURES, preprocessing.NOMINAL_FEATURES)

    # Plotting priorities

    figure, axes = plt.subplots(1, 1)
    encoded_priorities.value_counts(normalize=True, sort=True).plot(kind='bar',
                                                                    ax=axes)
    plt.show()

    issues_train, issues_test, priority_train, priority_test = train_test_split(
        issues_dataframe, encoded_priorities, test_size=0.2, random_state=0)

    print len(issues_train.index), " issues on the train set."

    issues_train_std, issues_test_std = preprocessing.escale_numerical_features(
        preprocessing.NUMERICAL_FEATURES, issues_train, issues_test)

    logit_classifier = select_features_l1(issues_train_std, priority_train,
                                          issues_test_std, priority_test)
    knn_classifier = sequential_feature_selection(issues_train_std,
                                                  priority_train,
                                                  issues_test_std,
                                                  priority_test)

    print "Building Random Forest Classifier ..."
    rforest_classifier = RandomForestClassifier(n_estimators=10000,
                                                random_state=0,
                                                n_jobs=-1)
    rforest_classifier.fit(issues_train, priority_train)
    forest_classifier = feature_importance_with_forest(issues_train,
                                                       priority_train,
                                                       issues_test,
                                                       priority_test)

    rforest_classifier = RandomForestClassifier(n_estimators=10000,
                                                random_state=0,
                                                n_jobs=-1)

    train_and_predict(rforest_classifier, original_dataframe, issues_dataframe,
                      encoded_priorities, preprocessing.CLASS_LABEL,
                      preprocessing.NUMERICAL_FEATURES,
                      preprocessing.NOMINAL_FEATURES)
def main():
    """
    Initial execution point: runs algorithm_analysis, takes its "RandomForest" estimator and passes it
    to assigner.feature_importance_with_forest along with the training and test splits.
    :return: None.
    """

    estimators = algorithm_analysis()
    best_forest = estimators["RandomForest"]

    raw_dataframe = preprocessing.load_original_dataframe()
    datasets = get_training_datasets(raw_dataframe)

    # The two "_std" variants returned by get_training_datasets are not used here.
    issues_train, _, priorities_train, issues_test, _, priorities_test = datasets

    assigner.feature_importance_with_forest(best_forest, issues_train, priorities_train, issues_test, priorities_test)
def main():
    """
    Initial execution point.
    Obtains the "RandomForest" estimator from algorithm_analysis and passes it, together with the
    train/test issues and priorities, to assigner.feature_importance_with_forest.
    :return: None.
    """

    best_forest = algorithm_analysis()["RandomForest"]

    # Only the variants without the "_std" suffix are handed over; the other two are ignored.
    (issues_train, _ignored_train_std, priorities_train,
     issues_test, _ignored_test_std, priorities_test) = get_training_datasets(
        preprocessing.load_original_dataframe())

    assigner.feature_importance_with_forest(
        best_forest,
        issues_train, priorities_train,
        issues_test, priorities_test)
Exemplo n.º 7
0
def main():
    """
    Initial execution point
    :return: None
    """

    original_dataframe = preprocessing.load_original_dataframe()
    repositories = get_all_repositories(original_dataframe)

    results = []

    for repository in repositories:
        print "Working on repository ", repository, " ..."

        project_dataframe = preprocessing.filter_issues_dataframe(
            original_dataframe, repository=repository)

        # Threslhold taking into account considering the scikit-learn cheat sheet
        # http://scikit-learn.org/stable/tutorial/machine_learning_map/

        minimum_threshold = 50
        issues_found = len(project_dataframe.index)
        print issues_found, " issues found on repository ", repository

        if issues_found > minimum_threshold:

            # The Git Repository feauture is not needed since it is filtered.
            nominal_features = []
            issues, priorities = preprocessing.encode_and_split(
                project_dataframe, assigner.CLASS_LABEL,
                assigner.NUMERICAL_FEATURES, nominal_features)

            train_test = preprocessing.train_test_encode(
                repository, issues, priorities)
            if train_test:
                issues_train_std, priority_train, issues_test_std, priority_test = train_test
                result = run_algorithm_analysis(issues_train_std,
                                                priority_train,
                                                issues_test_std, priority_test,
                                                repository, issues_found)

                if result:
                    results.extend(result)

        else:
            print "Issues corresponding to repository ", repository, " are not enough for analysis."

    write_results("project_experiment_results.csv", results)
def algorithm_analysis():
    """
    Executes the analysis for finding the optimal class label and algorithm configuration.
    :return: None.
    """
    consolidated_results = []

    try:
        minimum_records = 50
        class_labels = [
            'Severe'
            # 'Blocker'
            # , 'Non-Severe', 'Trivial', 'Critical'
        ]

        original_dataframe = preprocessing.load_original_dataframe()

        repositories = []
        # valid_dataframe = preprocessing.filter_issues_dataframe(original_dataframe)
        # repositories.append(("VALID", valid_dataframe))

        for repo_name in selector.get_all_repositories(original_dataframe):
            project_dataframe = preprocessing.filter_issues_dataframe(
                original_dataframe, repository=repo_name)
            repositories.append((repo_name, project_dataframe))

        for class_label in class_labels:
            for repository_name, dataframe in repositories:
                print "Using ", class_label, " as the class feature."
                print "Working on repository ", repository_name, " with ", len(
                    dataframe.index), "Issues"

                if len(dataframe.index) >= minimum_records:
                    results = execute_analysis(
                        dataframe, class_label + "-" + repository_name,
                        class_label)
                    consolidated_results.extend(results)
                else:
                    print "Not enough issues for analysis: ", len(
                        dataframe.index)

    finally:
        selector.write_results("Binary_Classification.csv",
                               consolidated_results)
        winsound.Beep(2500, 1000)
Exemplo n.º 9
0
def main():
    """
    Initial execution point
    :return: None
    """

    original_dataframe = preprocessing.load_original_dataframe()
    repositories = get_all_repositories(original_dataframe)

    results = []

    for repository in repositories:
        print "Working on repository ", repository, " ..."

        project_dataframe = preprocessing.filter_issues_dataframe(original_dataframe, repository=repository)

        # Threslhold taking into account considering the scikit-learn cheat sheet
        # http://scikit-learn.org/stable/tutorial/machine_learning_map/

        minimum_threshold = 50
        issues_found = len(project_dataframe.index)
        print issues_found, " issues found on repository ", repository

        if issues_found > minimum_threshold:

            # The Git Repository feauture is not needed since it is filtered.
            nominal_features = []
            issues, priorities = preprocessing.encode_and_split(project_dataframe, assigner.CLASS_LABEL,
                                                                assigner.NUMERICAL_FEATURES, nominal_features)

            train_test = preprocessing.train_test_encode(repository, issues, priorities)
            if train_test:
                issues_train_std, priority_train, issues_test_std, priority_test = train_test
                result = run_algorithm_analysis(issues_train_std, priority_train, issues_test_std, priority_test,
                                                repository, issues_found)

                if result:
                    results.extend(result)

        else:
            print "Issues corresponding to repository ", repository, " are not enough for analysis."

    write_results("project_experiment_results.csv", results)
Exemplo n.º 10
0
def main():
    original_dataframe = preprocessing.load_original_dataframe()
    issues_dataframe = preprocessing.filter_issues_dataframe(original_dataframe)

    # Plotting projects
    figure, axes = plt.subplots(1, 1)
    issues_dataframe['Git Repository'].value_counts(normalize=True).plot(kind='bar', ax=axes)
    plt.show()

    issues_dataframe, encoded_priorities = preprocessing.encode_and_split(issues_dataframe, preprocessing.CLASS_LABEL,
                                                                          preprocessing.NUMERICAL_FEATURES,
                                                                          preprocessing.NOMINAL_FEATURES)

    # Plotting priorities

    figure, axes = plt.subplots(1, 1)
    encoded_priorities.value_counts(normalize=True, sort=True).plot(kind='bar', ax=axes)
    plt.show()

    issues_train, issues_test, priority_train, priority_test = train_test_split(issues_dataframe,
                                                                                encoded_priorities,
                                                                                test_size=0.2, random_state=0)

    print len(issues_train.index), " issues on the train set."

    issues_train_std, issues_test_std = preprocessing.escale_numerical_features(preprocessing.NUMERICAL_FEATURES,
                                                                                issues_train,
                                                                                issues_test)

    logit_classifier = select_features_l1(issues_train_std, priority_train, issues_test_std, priority_test)
    knn_classifier = sequential_feature_selection(issues_train_std, priority_train, issues_test_std, priority_test)

    print "Building Random Forest Classifier ..."
    rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    rforest_classifier.fit(issues_train, priority_train)
    forest_classifier = feature_importance_with_forest(issues_train, priority_train, issues_test, priority_test)

    rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

    train_and_predict(rforest_classifier, original_dataframe, issues_dataframe, encoded_priorities,
                      preprocessing.CLASS_LABEL,
                      preprocessing.NUMERICAL_FEATURES,
                      preprocessing.NOMINAL_FEATURES)
Exemplo n.º 11
0
def algorithm_analysis():
    """
    Executes the analysis for finding the optimal class label and algorithm configuration.
    :return: None.
    """
    consolidated_results = []

    try:
        minimum_records = 50
        class_labels = [
            'Severe'
            # 'Blocker'
            # , 'Non-Severe', 'Trivial', 'Critical'
        ]

        original_dataframe = preprocessing.load_original_dataframe()

        repositories = []
        # valid_dataframe = preprocessing.filter_issues_dataframe(original_dataframe)
        # repositories.append(("VALID", valid_dataframe))

        for repo_name in selector.get_all_repositories(original_dataframe):
            project_dataframe = preprocessing.filter_issues_dataframe(original_dataframe,
                                                                      repository=repo_name)
            repositories.append((repo_name, project_dataframe))

        for class_label in class_labels:
            for repository_name, dataframe in repositories:
                print "Using ", class_label, " as the class feature."
                print "Working on repository ", repository_name, " with ", len(dataframe.index), "Issues"

                if len(dataframe.index) >= minimum_records:
                    results = execute_analysis(dataframe, class_label + "-" + repository_name, class_label)
                    consolidated_results.extend(results)
                else:
                    print "Not enough issues for analysis: ", len(dataframe.index)

    finally:
        selector.write_results("Binary_Classification.csv", consolidated_results)
        winsound.Beep(2500, 1000)
def algorithm_analysis():
    """
    Prepares a classifier based on non-validated data, and evaluates its performance in the valdated
    portion of the dataset.
    :return: Dictionary of classifiers
    """

    raw_dataframe = preprocessing.load_original_dataframe()
    issues_train, issues_train_std, priorities_train, issues_test, issues_test_std, priorities_test = get_training_datasets(
        raw_dataframe)

    unfiltered_dataframe = preprocessing.filter_issues_dataframe(
        raw_dataframe, priority_changer=False)
    filtered_dataframe = preprocessing.filter_issues_dataframe(
        raw_dataframe, priority_changer=True)

    train, test = train_test_split(unfiltered_dataframe,
                                   test_size=0.2,
                                   random_state=0)
    test_valid = preprocessing.filter_issues_dataframe(test,
                                                       priority_changer=True)
    issues_test_valid, issues_test_valid_std, priorities_test_valid = prepare_for_classification(
        test_valid, issues_train)
    issues_valid, issues_valid_std, priorities_valid = prepare_for_classification(
        filtered_dataframe, issues_train)

    results = []
    estimators = {}
    for algorithm, grid_search in selector.get_algorithms():
        training_set = issues_train_std
        test_set = issues_test_std
        test_valid_set = issues_test_valid_std
        valid_set = issues_valid_std

        print "Current algorithm: ", algorithm

        if algorithm == "RandomForest":
            training_set = issues_train
            test_set = issues_test
            test_valid_set = issues_test_valid
            valid_set = issues_valid

        optimal_estimator, best_params = tuning.parameter_tuning(
            grid_search, training_set, priorities_train)
        estimators[algorithm] = optimal_estimator

        result = selector.analyse_performance(optimal_estimator, best_params,
                                              grid_search, algorithm,
                                              training_set, priorities_train,
                                              test_set, priorities_test)

        print "Evaluating on the valid portion of the test dataset ..."

        assigner.evaluate_performance(prefix=algorithm,
                                      classifier=optimal_estimator,
                                      issues_test_std=test_valid_set,
                                      priority_test=priorities_test_valid)

        print "Evaluating on the complete valid dataset ..."

        assigner.evaluate_performance(prefix=algorithm,
                                      classifier=optimal_estimator,
                                      issues_test_std=valid_set,
                                      priority_test=priorities_valid)

        if result:
            results.append(result + ("ALL", len(unfiltered_dataframe.index)))

    selector.write_results("all_experiment_results.csv", results)
    return estimators