'ch.uzh.ciclassifier.features.travisci.BuildTimeAverage',
    'ch.uzh.ciclassifier.features.travisci.BuildSuccessRatio',
    'ch.uzh.ciclassifier.features.travisci.BuildTimeLatestAverage',
    'ch.uzh.ciclassifier.features.travisci.ManualInteractionRatio',
    'ch.uzh.ciclassifier.features.travisci.PullRequestRatio',
    'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage',
    'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage',
]

# CSV that is loaded inside the loop below; its `actual` column supplies the
# labels and the columns named by get_features() supply the inputs.
FEATURES_FILE = 'data/truth.csv'
# Feature-type names; every non-empty combination of them is visited below.
TYPES = ['configuration', 'repository', 'travisci']
results = []
# i + 1 is the combination size, so sizes 1..len(TYPES) are covered.
for i in range(len(TYPES)):
    # Despite the variable name, these are itertools *combinations* of TYPES,
    # not permutations.
    for permutation in list(itertools.combinations(TYPES, i + 1)):
        # NOTE(review): the file is re-read for every combination; this looks
        # loop-invariant, but the rest of the loop body is not visible here,
        # so confirm nothing mutates raw_data before hoisting it.
        raw_data = pd.read_csv(FEATURES_FILE)
        features = raw_data[get_features()]

        # Remove every selected column whose feature type (as reported by
        # helper.type_from_feature_name) is not part of this combination.
        for feature in AVAILABLE_FEATURES:
            featureType = helper.type_from_feature_name(feature)
            if featureType not in permutation and feature in features:
                features = features.drop(feature, axis=1)

        # Convert labels and the remaining feature columns to arrays.
        labels = np.array(raw_data['actual'])
        features = np.array(features)

        # 10 splits x 10 repeats with a fixed random_state, so the splitter is
        # configured identically for every combination.
        cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
        # Classifier hyper-parameters come from get_rf_parameters().
        rf = RandomForestClassifier(**get_rf_parameters())
        scores = cross_val_score(rf,
                                 features,
                                 labels,
                                 scoring='accuracy',
Пример #2
0
import csv
import pickle
import pandas as pd
from parameters import get_features

# Input CSV with the raw feature values (must contain a 'project' column)
# and the CSV the scored results are exported to.
TARGET = 'results/ciclassifier_raw_new.csv'
EXPORT = 'results/ciclassifier_new.csv'

# Pickled model that is applied to the configuration features only.
MODEL_PATH = '../models/classifier_configuration.sav'
raw_data = pd.read_csv(TARGET)
to_predict = raw_data.drop('project', axis=1)
# Keep only the feature names that contain "configuration"; the same list is
# used to select the model's input columns.
features = [name for name in get_features() if "configuration" in name]
to_predict = to_predict[features]

# Load the model through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# model files that come from a trusted source.
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

predictions = model.predict(to_predict)
# One dict per input row, keyed by column name.
projects = raw_data.to_dict('records')

# Export the rows together with their predicted score as CSV.
# newline='' is what the csv module expects for files it writes to.
with open(EXPORT, 'w', newline='') as csvfile:
    # Column order of the export: project, the selected features, then score.
    fieldnames = ['project']
    fieldnames.extend(features)
    fieldnames.append('score')
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # `index` is the position in `predictions` that belongs to the current
    # record. NOTE(review): the increment and the row write are not visible in
    # this part of the file - confirm they follow.
    index = 0
    for project in projects:
        prediction = predictions[index]
        # Attach the prediction to the record under the 'score' key.
        project['score'] = prediction
Пример #3
0
from scipy.stats import stats

from helper import short_name
from parameters import get_features, beautify_feature_name

# Truth table; its `actual` column holds the label each row is split on.
DATA = '../data/truth.csv'
raw_data = pd.read_csv(DATA)

# Rows labelled 1 are bound to `good`, rows labelled 0 to `bad`.
actual_labels = raw_data['actual']
good = raw_data.loc[actual_labels == 1]
bad = raw_data.loc[actual_labels == 0]

# Grid of 6 x 4 axes that the per-feature loop indexes as axs[row, col].
fig, axs = plt.subplots(6, 4, figsize=(10, 12))

# Position counters consumed by the per-feature loop.
index = row = 0

# Visit every feature and pick the axes cell it belongs to.
# NOTE(review): the loop body continues beyond the visible part of the file;
# neither `index` nor `row` is advanced in the lines shown here.
for feature in get_features():
    # Display name for the feature (not used in the visible lines).
    beautiful_name = beautify_feature_name(feature)
    # Column inside the 4-wide axes grid.
    col = index % 4
    # Debug output of the current grid position.
    print(row, col)

    # Values of this feature for the rows labelled good.
    tmp_good = good[feature]
    # Disabled outlier filters (quantile cut / z-score), kept for reference:
    #tmp_good = tmp_good[tmp_good.between(tmp_good.quantile(.00), tmp_good.quantile(.95))]
    # tmp_good = tmp_good[(np.abs(stats.zscore(tmp_good)) < 3)]

    # Values of this feature for the rows labelled bad.
    tmp_bad = bad[feature]
    # Disabled outlier filters (quantile cut / z-score), kept for reference:
    # tmp_bad = tmp_bad[tmp_bad.between(tmp_bad.quantile(.00), tmp_bad.quantile(.95))]
    # tmp_bad = tmp_bad[(np.abs(stats.zscore(tmp_bad)) < 3)]

    # Disabled histogram calls, kept for reference:
    #axs[row, col].hist([good[feature],bad[feature]], 20, alpha=0.5)
    #axs[row, col].hist(, 20, facecolor='r', alpha=0.5)
Пример #4
0
    'ch.uzh.ciclassifier.features.travisci.PullRequestRatio',
    'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage',
    'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage',
]

# CSV loaded inside the loop below; its `language` column selects the subset
# and its `actual` column supplies the labels.
FEATURES_FILE = 'data/truth.csv'
# Languages evaluated one at a time.
# NOTE(review): the identifier is misspelled ("LANGUAGES"); left unchanged
# because code outside this view may reference it.
LANGAUGES = ['Ruby', 'JavaScript', 'Python', 'Java', 'C++', 'PHP']

results = []

# Not referenced in the visible part of this script.
NUMBER_OF_RUNS = 10

for language in LANGAUGES:
    # NOTE(review): the file is re-read for every language; this looks
    # loop-invariant, but the rest of the loop body is not visible here, so
    # confirm nothing mutates raw_data before hoisting it.
    raw_data = pd.read_csv(FEATURES_FILE)
    # Restrict the data to the rows of the current language.
    subset = raw_data.loc[raw_data['language'] == language]
    features = subset[get_features()]
    labels = np.array(subset['actual'])
    features = np.array(features)

    # 10 splits x 10 repeats with a fixed random_state.
    # NOTE(review): presumably every language has at least 10 rows in the
    # file; verify, since the splitter is asked for 10 splits.
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    # Classifier hyper-parameters come from get_rf_parameters().
    rf = RandomForestClassifier(**get_rf_parameters())
    # Accuracy for each split produced by `cv`; n_jobs=-1 requests all
    # available workers.
    scores = cross_val_score(rf,
                             features,
                             labels,
                             scoring='accuracy',
                             cv=cv,
                             n_jobs=-1)

    # Per-language accumulators; they are filled beyond the visible part of
    # the loop.
    accuracies = []
    precisions = []
    recalls = []