random_state=42, n_jobs=-1) # Fit the random search model rf_random.fit(features, labels) pprint(rf_random.best_params_) def evaluate(model, test_features, test_labels): predictions = model.predict(test_features) accuracy = accuracy_score(test_labels, predictions) print('Model Performance') print('Accuracy = {:0.2f}%.'.format(accuracy * 100)) return accuracy # Split the data into training and testing sets train_features, test_features, train_labels, test_labels = train_test_split( features, labels, test_size=0.25, random_state=43) base_model = RandomForestClassifier(**get_rf_parameters()) base_model.fit(train_features, train_labels) base_accuracy = evaluate(base_model, test_features, test_labels) best_random = rf_random.best_estimator_ random_accuracy = evaluate(best_random, test_features, test_labels) print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))
'ch.uzh.ciclassifier.features.configuration.UseCache', 'ch.uzh.ciclassifier.features.configuration.UseDeploy', # 'ch.uzh.ciclassifier.features.github.OwnerType', # 'ch.uzh.ciclassifier.features.github.PrimaryLanguage', # 'ch.uzh.ciclassifier.features.repository.CommitsUntilConfigAdded', # 'ch.uzh.ciclassifier.features.repository.ConfigChangeFrequency', # 'ch.uzh.ciclassifier.features.repository.DaysUntilConfigAdded', # 'ch.uzh.ciclassifier.features.repository.NumberOfConfigurationFileChanges', # 'ch.uzh.ciclassifier.features.repository.NumberOfContributorsOnConfigurationFile', # 'ch.uzh.ciclassifier.features.repository.ProjectName', # 'ch.uzh.ciclassifier.features.travisci.BuildSuccessRatio', # 'ch.uzh.ciclassifier.features.travisci.BuildTimeAverage', # 'ch.uzh.ciclassifier.features.travisci.BuildTimeLatestAverage', # 'ch.uzh.ciclassifier.features.travisci.ManualInteractionRatio', # 'ch.uzh.ciclassifier.features.travisci.PullRequestRatio', # 'ch.uzh.ciclassifier.features.travisci.TimeToFixAverage', # 'ch.uzh.ciclassifier.features.travisci.TimeToFixLatestAverage', ] # Labels are the values we want to predict labels = np.array(raw_data['actual']) features = raw_data[used_features] # Instantiate model with 1000 decision trees rf = RandomForestClassifier(**get_rf_parameters()) # Train the model on training data rf.fit(features, labels) pickle.dump(rf, open(MODEL_PATH, 'wb'))
FEATURES_FILE = 'data/truth.csv'
# BUGFIX: identifier was misspelled "LANGAUGES"; the name is local to this
# script, so renaming it is safe.
LANGUAGES = ['Ruby', 'JavaScript', 'Python', 'Java', 'C++', 'PHP']
results = []
NUMBER_OF_RUNS = 10

for language in LANGUAGES:
    # Within-language evaluation: repeated 10-fold CV (10 repeats) on the
    # subset of rows written in this language.
    raw_data = pd.read_csv(FEATURES_FILE)
    subset = raw_data.loc[raw_data['language'] == language]
    features = subset[get_features()]
    labels = np.array(subset['actual'])
    features = np.array(features)
    cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    rf = RandomForestClassifier(**get_rf_parameters())
    scores = cross_val_score(rf, features, labels, scoring='accuracy', cv=cv, n_jobs=-1)

    accuracies = []
    precisions = []
    recalls = []
    # Cross-language evaluation: train on every other language, test on
    # this one. NOTE(review): the loop body continues beyond this chunk;
    # the CSV is re-read each run — presumably because later (unseen) code
    # mutates raw_data — confirm before hoisting the read out of the loop.
    for run in range(NUMBER_OF_RUNS):
        raw_data = pd.read_csv(FEATURES_FILE)
        subset_train = raw_data.loc[raw_data['language'] != language]
        subset_test = raw_data.loc[raw_data['language'] == language]