#from sklearn.naive_bayes import BernoulliNB, MultinomialNB #from sklearn.neighbors import NearestCentroid from sklearn.grid_search import GridSearchCV from bagging import AsymBaggingRFCs, get_asym_task import classes from features import all_features from models import Repo import utils NGRAM_MIN = 1 NGRAM_MAX = 2 # not inclusive sorted_stdlib_names = sorted(list(utils.stdlib_module_names())) def ngrams(mods): iters = [] for i in range(NGRAM_MIN, NGRAM_MAX): iters.append(combinations(mods, i)) return chain(*iters) def RandomForest(): return RandomForestClassifier( n_estimators=200, max_depth=None, min_samples_split=1,
def get_classifier(X, y):
    """Build the random-forest classifier used by the __main__ script.

    X and y are accepted for interface uniformity but are not used to
    configure the model; fitting happens at the call site.
    NOTE(review): `compute_importances` and `min_samples_split=1` are
    only valid on old scikit-learn versions (consistent with the
    `sklearn.grid_search` import above) — confirm the pinned version.
    """
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()
    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    # One boolean feature per stdlib module: True iff the repo imports it.
    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}
        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    # Dense matrix so the estimator can consume it directly.
    vectorizer = DictVectorizer(sparse=False)

    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)

    clf = get_classifier(X, y)
    clf.fit(X, y)

    #function to use when evaluating results
    # NOTE(review): statement truncated in the visible source — the
    # functools.partial(...) call continues outside this chunk.
    score_func = functools.partial(metrics.classification_report,
def imported_stdlib_modules(repo):
    """Like imported_modules, but only keeps stdlib modules.

    Returns a tuple of the repo's imported modules that belong to the
    standard library, preserving their original order.
    """
    # Hoist the lookup out of the loop: the original re-invoked
    # utils.stdlib_module_names() for every module; a set also makes
    # each membership test O(1).
    stdlib_names = set(utils.stdlib_module_names())
    return tuple(mod for mod in repo._calc('imported_modules')
                 if mod in stdlib_names)