def __init__(self, sample_dict=None, class_names=None):
        """Build per-class histogram data for this plot's feature.

        :param sample_dict: mapping of class name -> list of repos; defaults
            to a module-level cached sample (loaded once into `_sample`).
        :param class_names: iteration order of classes; defaults to
            sorted(classes) — NOTE(review): `classes` is not defined in this
            chunk; presumably a module-level iterable of class names. Confirm.
        """
        global _sample

        # Subclasses must set feature_name; refuse to run without it.
        if self.feature_name is None:
            raise Exception('Provide feature_name field in subclass.')

        if sample_dict is None:
            # Lazily load and cache the sample at module level so repeated
            # constructions don't re-read it from disk.
            if _sample is None:
                _sample = Repo.load_sample(separate=True)
            sample_dict = _sample

        if class_names is None:
            class_names = sorted(classes)

        self.class_names = class_names
        self.figure, self.ax = plt.subplots()

        hist_data = []
        for clsname in class_names:
            # ie 'class feature data'
            cfd = np.array([getattr(repo, self.feature_name) for repo in sample_dict[clsname]])

            # Optionally trim to the subclass-specified percentile band;
            # strict inequalities drop the boundary values themselves.
            if self.in_class_percentiles is not None:
                min_val, max_val = [np.percentile(cfd, i) for i in self.in_class_percentiles]
                cfd = np.array([e for e in cfd if min_val < e < max_val])

            hist_data.append(cfd)

        self.hist_data = hist_data
import classes
from models import Repo
import utils


def get_classifier(X, y):
    """Return the (unfitted) forest used for classification.

    X and y are accepted for interface symmetry with other classifier
    factories but are not used here.

    NOTE(review): `compute_importances` was removed in later scikit-learn
    releases, and `min_samples_split=1` is rejected there (must be >= 2);
    this only runs against the era-appropriate pinned sklearn. Confirm the
    pin before upgrading.
    """
    return RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()

    # Bidirectional class-name <-> integer-id maps for use as sklearn labels.
    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    # One boolean feature per stdlib module: True iff the repo imports it.
    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}

        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    # Dense matrix: rows = repos, columns = stdlib modules.
    vectorizer = DictVectorizer(sparse=False)

    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)
def fetch():
    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()
    authors = {author.login: author for author in Author.load(FILE)}

    seen = 0
    total = len(repos)
    failures = []
    last_write = datetime.datetime.now()

    el = Elaborator()

    for repo in repos:
        seen += 1

        if repo.username in authors:
            logging.info("already fetched %s", repo.username)
            continue

        try:
            gh_data = el._gh_request(
                'GET',
                '/users/' + repo.username
            )
        except:
            #loop really needs to keep running
            logging.exception("problem! %s", repo)
            failures.append(repo)
            continue

        authors[repo.username] = Author(**{key: gh_data.get(key, None) for key in
                                           ['login',  # "octocat"
                                            'id',  # 1
                                            'avatar_url',  # "https://github.com/images/error/octocat_happy.gif"
                                            'gravatar_id',  # "somehexcode"
                                            'url',  # "https://api.github.com/users/octocat"
                                            'name',  # "monalisa octocat"
                                            'company',  # "GitHub"
                                            'blog',  # "https://github.com/blog"
                                            'location',  # "San Francisco"
                                            'email',  # "*****@*****.**"
                                            'hireable',  # false
                                            'bio',  # "There once was..."
                                            'public_repos',  # 2
                                            'public_gists',  # 1
                                            'followers',  # 20
                                            'following',  # 0
                                            'html_url',  # "https://github.com/octocat"
                                            'created_at',  # "2008-01-14T04:33:35Z"
                                            'type',  # "User"
                                            ]})

        logging.info("fetched %s", repo.username)

        progress_bar(seen, total)

        since_write = datetime.datetime.now() - last_write

        if since_write > datetime.timedelta(minutes=5):
            sys.stdout.write("\r(writing results)")
            sys.stdout.flush()
            Author.dump(authors.values(), FILE)

            last_write = datetime.datetime.now()

    print  # from progress bar line

    if failures:
        print "%s failures:" % len(failures)
        for f in failures:
            print "  %s" % f
        print

    print 'writing out...'
    Author.dump(authors.values(), FILE)
def calculate(f_to_calc, f_to_overwrite, console, download):
    """Calculate a list of features."""

    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()

    seen = 0
    total = len(repos)
    dl_failures = []
    calc_failures = []
    last_write = datetime.datetime.now()

    if f_to_calc or f_to_overwrite or download:
        for repo in repos:
            seen += 1
            success = True

            if download:
                success = utils.clone(repo)

            if not success:
                dl_failures.append(repo)
                continue

            try:
                if f_to_calc:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_calc)

                if f_to_overwrite:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_overwrite, overwrite=True)

                repo._clear_support_features()  # we're done with this repo now
            except:
                print  # from status line
                logging.exception("!problem: %s", repo)
                calc_failures.append(repo)
                print

            progress_bar(seen, total)

            since_write = datetime.datetime.now() - last_write

            if since_write > datetime.timedelta(minutes=5):
                sys.stdout.write("\r(writing results)")
                sys.stdout.flush()
                Repo.write_update(repos)

                last_write = datetime.datetime.now()

    print  # from progress bar line

    if dl_failures:
        print "%s failed to download:" % len(dl_failures)
        for f in dl_failures:
            print "  %s" % f
        print

    if calc_failures:
        print "%s failed during calc:" % len(calc_failures)
        for f in calc_failures:
            print "  %s" % f
        print

    if console:
        message = ('`repos` contains results;\n'
                   'use ^d to write out or `exit()` to cancel')
        code.interact(message, local=locals())

    print 'writing out...'
    Repo.write_update(repos)
    #benchmark(PassiveAggressiveClassifier(n_iter=50), X, y, feature_names)

    #print 'kNN'
    #benchmark(KNeighborsClassifier(n_neighbors=10), X, y, feature_names)

    #print 'SGD'
    #benchmark(SGDClassifier(n_jobs=-1, alpha=.0001, n_iter=np.ceil(10**3), penalty="elasticnet", shuffle=True),
    #          X, y, feature_names)

    #print 'nearest centroid'
    #benchmark(NearestCentroid(), X, y, feature_names)

    #print 'naive bayes'
    #benchmark(MultinomialNB(alpha=.01), X, y, feature_names)

    #print 'naive bayes (bernoulli)'
    #benchmark(BernoulliNB(alpha=.01), X, y, feature_names)

    #classify(X, y, id_to_class, vec)
    # classify(select_by_pca(X, y), y, id_to_class, vec)


if __name__ == '__main__':
    # Features deliberately excluded from this run.
    ignore = ['imported_modules']
    #ignore += ['imported_stdlib_modules']

    # Everything known, minus the excluded ones.
    features = [name for name in all_features if name not in ignore]
    #features = ['imported_stdlib_modules']

    _run(Repo.load_sample(), features)
        # NOTE(review): this is the interior of a summarize-features routine
        # whose `def` line is outside this view — class_names, class_map,
        # feature_name, funcs, class_summaries and feature_summaries are all
        # bound by the missing enclosing code. Confirm against the full file.
        for class_name in class_names:
            repos = class_map[class_name]
            # Raw values of the current feature for every repo in this class.
            feature_data = [getattr(repo, feature_name) for repo in repos]

            # One summary row per class: each func applied to the raw values.
            class_summaries.append([f(feature_data) for f in funcs])

        feature_summaries[feature_name] = np.array(class_summaries)

    # Print a per-feature, per-class table of the summary statistics.
    for feature_name, summary in feature_summaries.items():
        print feature_name
        for i, class_name in enumerate(class_names):
            print '  ', class_name
            print '    ', '\t'.join(str(e) for e in summary[i])

        print '-----'
        print


if __name__ == '__main__':
    class_map = Repo.load_sample(separate=True)

    # Statement/expression usage rates to summarize for every class.
    feature_names = ['with_stmt_usage',
                     'compr_usage',
                     'lambda_usage',
                     'global_usage',
                     'gen_exp_usage',
                     'print_usage',
                     ]
    summarize_features(class_map, feature_names, sorted(classes))

    #summarize_imports(class_map)