#from sklearn.naive_bayes import BernoulliNB, MultinomialNB
#from sklearn.neighbors import NearestCentroid

import functools
from itertools import chain, combinations

import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV

from bagging import AsymBaggingRFCs, get_asym_task
import classes
from features import all_features
from models import Repo
import utils

# Module-import n-gram sizes: generate n-grams for n in [NGRAM_MIN, NGRAM_MAX).
NGRAM_MIN = 1
NGRAM_MAX = 2  # not inclusive

# Stable, alphabetized feature ordering for the stdlib module names.
# sorted() accepts any iterable, so the intermediate list() was redundant.
sorted_stdlib_names = sorted(utils.stdlib_module_names())


def ngrams(mods):
    """Yield all n-grams (as tuples) of the modules in `mods`.

    Produces every combination of size n for each n in
    [NGRAM_MIN, NGRAM_MAX) — with the defaults, just the 1-grams.

    NOTE: `chain` and `combinations` come from itertools; they were
    referenced here without ever being imported, which made every call
    raise NameError.

    :param mods: iterable of module names.
    :return: a single chained iterator over all the combination tuples.
    """
    iters = (combinations(mods, n) for n in range(NGRAM_MIN, NGRAM_MAX))
    return chain.from_iterable(iters)


def RandomForest():
    """Return a 200-tree random forest with a fixed random seed.

    The original definition was truncated mid-argument-list (a
    SyntaxError); reconstructed here with the same settings style as
    get_classifier below.
    """
    return RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


def get_classifier(X, y):
    """Return the classifier to train on (X, y).

    X and y are currently unused — the forest's hyperparameters are
    fixed — but the signature allows data-dependent model selection
    (e.g. GridSearchCV) later without changing callers.
    """
    return RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()

    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    # One boolean feature per stdlib module: True iff the repo imports it.
    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}

        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    # Dense output so the old-sklearn forest can consume it directly.
    vectorizer = DictVectorizer(sparse=False)

    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)

    clf = get_classifier(X, y)
    clf.fit(X, y)

    # Function to use when evaluating results.  The original statement was
    # truncated mid-call (a SyntaxError); closed here with no extra
    # arguments bound — pass target_names etc. at the call site.
    score_func = functools.partial(metrics.classification_report)
def imported_stdlib_modules(repo):
    """Like imported_modules, but only keeps stdlib modules."""

    kept = []
    for module_name in repo._calc('imported_modules'):
        # Membership is re-checked against utils per module, exactly as
        # the generator form did.
        if module_name in utils.stdlib_module_names():
            kept.append(module_name)
    return tuple(kept)