def __init__(self, sample_dict=None, class_names=None):
    """Collect per-class histogram data for self.feature_name.

    :param sample_dict: mapping of class name -> list of Repo objects;
                        defaults to the lazily-loaded module-wide sample.
    :param class_names: class names to include; defaults to sorted(classes).
    """
    global _sample
    if self.feature_name is None:
        raise Exception('Provide feature_name field in subclass.')

    # Load the shared sample once and cache it at module level.
    if sample_dict is None:
        if _sample is None:
            _sample = Repo.load_sample(separate=True)
        sample_dict = _sample

    self.class_names = sorted(classes) if class_names is None else class_names
    self.figure, self.ax = plt.subplots()

    per_class = []
    for name in self.class_names:
        # this class's value of the feature, one entry per repo
        values = np.array([getattr(repo, self.feature_name)
                           for repo in sample_dict[name]])
        if self.in_class_percentiles is not None:
            # optionally trim outliers to the configured percentile window
            low, high = [np.percentile(values, p)
                         for p in self.in_class_percentiles]
            values = np.array([v for v in values if low < v < high])
        per_class.append(values)
    self.hist_data = per_class
import classes
from models import Repo
import utils


def get_classifier(X, y):
    """Return the random forest used throughout; X and y are accepted
    for signature compatibility but are not consulted here."""
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()
    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    # One dict per repo: stdlib module name -> whether the repo imports it.
    feature_dicts = []
    for repo in repos:
        flags = dict.fromkeys(utils.stdlib_module_names(), False)
        for mod in repo.imported_stdlib_modules:
            flags[mod] = True
        feature_dicts.append(flags)

    vectorizer = DictVectorizer(sparse=False)
    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(feature_dicts)
def fetch(): sys.stdout.write('loading') sys.stdout.flush() repos = Repo.load_sample() authors = {author.login: author for author in Author.load(FILE)} seen = 0 total = len(repos) failures = [] last_write = datetime.datetime.now() el = Elaborator() for repo in repos: seen += 1 if repo.username in authors: logging.info("already fetched %s", repo.username) continue try: gh_data = el._gh_request( 'GET', '/users/' + repo.username ) except: #loop really needs to keep running logging.exception("problem! %s", repo) failures.append(repo) continue authors[repo.username] = Author(**{key: gh_data.get(key, None) for key in ['login', # "octocat" 'id', # 1 'avatar_url', # "https://github.com/images/error/octocat_happy.gif" 'gravatar_id', # "somehexcode" 'url', # "https://api.github.com/users/octocat" 'name', # "monalisa octocat" 'company', # "GitHub" 'blog', # "https://github.com/blog" 'location', # "San Francisco" 'email', # "*****@*****.**" 'hireable', # false 'bio', # "There once was..." 'public_repos', # 2 'public_gists', # 1 'followers', # 20 'following', # 0 'html_url', # "https://github.com/octocat" 'created_at', # "2008-01-14T04:33:35Z" 'type', # "User" ]}) logging.info("fetched %s", repo.username) progress_bar(seen, total) since_write = datetime.datetime.now() - last_write if since_write > datetime.timedelta(minutes=5): sys.stdout.write("\r(writing results)") sys.stdout.flush() Author.dump(authors.values(), FILE) last_write = datetime.datetime.now() print # from progress bar line if failures: print "%s failures:" % len(failures) for f in failures: print " %s" % f print print 'writing out...' Author.dump(authors.values(), FILE)
def calculate(f_to_calc, f_to_overwrite, console, download): """Calculate a list of features.""" sys.stdout.write('loading') sys.stdout.flush() repos = Repo.load_sample() seen = 0 total = len(repos) dl_failures = [] calc_failures = [] last_write = datetime.datetime.now() if f_to_calc or f_to_overwrite or download: for repo in repos: seen += 1 success = True if download: success = utils.clone(repo) if not success: dl_failures.append(repo) continue try: if f_to_calc: logging.info("calc: %s", repo) repo.calculate_features(f_to_calc) if f_to_overwrite: logging.info("calc: %s", repo) repo.calculate_features(f_to_overwrite, overwrite=True) repo._clear_support_features() # we're done with this repo now except: print # from status line logging.exception("!problem: %s", repo) calc_failures.append(repo) print progress_bar(seen, total) since_write = datetime.datetime.now() - last_write if since_write > datetime.timedelta(minutes=5): sys.stdout.write("\r(writing results)") sys.stdout.flush() Repo.write_update(repos) last_write = datetime.datetime.now() print # from progress bar line if dl_failures: print "%s failed to download:" % len(dl_failures) for f in dl_failures: print " %s" % f print if calc_failures: print "%s failed during calc:" % len(calc_failures) for f in calc_failures: print " %s" % f print if console: message = ('`repos` contains results;\n' 'use ^d to write out or `exit()` to cancel') code.interact(message, local=locals()) print 'writing out...' Repo.write_update(repos)
#benchmark(PassiveAggressiveClassifier(n_iter=50), X, y, feature_names)
#print 'kNN'
#benchmark(KNeighborsClassifier(n_neighbors=10), X, y, feature_names)
#print 'SGD'
#benchmark(SGDClassifier(n_jobs=-1, alpha=.0001, n_iter=np.ceil(10**3), penalty="elasticnet", shuffle=True),
#          X, y, feature_names)
#print 'nearest centroid'
#benchmark(NearestCentroid(), X, y, feature_names)
#print 'naive bayes'
#benchmark(MultinomialNB(alpha=.01), X, y, feature_names)
#print 'naive bayes (bernoulli)'
#benchmark(BernoulliNB(alpha=.01), X, y, feature_names)

#classify(X, y, id_to_class, vec)
# classify(select_by_pca(X, y), y, id_to_class, vec)


if __name__ == '__main__':
    # Features excluded from this run; toggle the commented lines below to
    # experiment with different feature sets.
    skipped = ['imported_modules']
    #skipped += ['imported_stdlib_modules']
    features = [name for name in all_features if name not in skipped]
    #features = ['imported_stdlib_modules']
    _run(Repo.load_sample(), features)
# NOTE(review): this chunk begins mid-function -- everything down to the
# print loop appears to be the interior of summarize_features (an enclosing
# per-feature loop, not visible here, supplies feature_name, class_summaries,
# funcs and feature_summaries); indentation is reconstructed from the
# collapsed source and must be confirmed against the full file.
for class_name in class_names:
    repos = class_map[class_name]
    # one value of the current feature per repo in this class
    feature_data = [getattr(repo, feature_name) for repo in repos]
    # apply each summary function (e.g. mean/median/...) to this class
    class_summaries.append([f(feature_data) for f in funcs])
# rows = classes, columns = summary functions, for the current feature
feature_summaries[feature_name] = np.array(class_summaries)

# Dump each feature's per-class summary table to stdout.
for feature_name, summary in feature_summaries.items():
    print feature_name
    for i, class_name in enumerate(class_names):
        print ' ', class_name
        # tab-separated summary values for this class
        print ' ', '\t'.join(str(e) for e in summary[i])
    print '-----'
    print


if __name__ == '__main__':
    class_map = Repo.load_sample(separate=True)
    summarize_features(class_map,
                       ['with_stmt_usage',
                        'compr_usage',
                        'lambda_usage',
                        'global_usage',
                        'gen_exp_usage',
                        'print_usage',
                        ],
                       sorted(classes))
    #summarize_imports(class_map)