Exemplo n.º 1
0
def details_pp(prm):
    """Render the top-10 search rounds for one pre-processing type.

    :param prm: string packing "<dataset_id>;<process>"
    :return: rendered 'details_pp.html' template
    """
    dataset_id, process = prm.split(';')
    dataset = get_dataset(dataset_id)
    rounds = get_search_rounds(dataset.dataset_id)
    cols, best = get_best_details_pp(rounds, process)
    # keep only the 10 best rows for display
    top_rows = best.to_dict(orient='records')[:10]
    return render_template(
        'details_pp.html',
        dataset=dataset,
        process=process,
        best=top_rows,
        cols=cols,
        refresher=int(time.time()),
        config=get_config(),
    )
Exemplo n.º 2
0
def details(prm):
    """Render the top-10 search rounds for one model type.

    :param prm: string packing "<dataset_id>;<model>"
    :return: rendered 'details.html' template
    """
    dataset_id, model = prm.split(';')
    dataset = get_dataset(dataset_id)
    rounds = get_search_rounds(dataset.dataset_id)
    cols, best = get_best_details(rounds, model)
    # keep only the 10 best rows for display
    top_rows = best.to_dict(orient='records')[:10]
    return render_template(
        'details.html',
        dataset=dataset,
        model=model,
        best=top_rows,
        cols=cols,
        refresher=int(time.time()),
        config=get_config(),
    )
Exemplo n.º 3
0
def round(prm):
    # details of a search round (1 pre-processing + 1 model configuration)
    # NOTE(review): this view shadows the builtin round() at module scope
    # prm packs "<dataset_id>;<round_id>"
    dataset_id, round_id = prm.split(';')
    dataset = get_dataset(dataset_id)
    search = get_search_rounds(dataset.dataset_id)
    # single matching row of the search history, as a plain dict
    round = search[search.round_id == int(round_id)].to_dict(
        orient='records')[0]
    pipeline = round['pipeline']
    if len(pipeline) < 1:
        pipeline = []
    else:
        # exclude passthrough and no scaling for display
        pipeline = [s for s in pipeline if s[0] not in ['NO-SCALE', 'PASS']]
    params = get_round_params(search, round_id)
    features = get_feature_importance(dataset.dataset_id, round_id)
    # confusion matrix on the evaluation set
    y_names, cnf_matrix, sums_matrix = get_cnf_matrix(dataset_id, round_id,
                                                      'eval')
    form = DupplicateRound()
    form.set_choices(dataset.problem_type)
    if request.method == 'POST':
        # apply round parameters for searching in another dataset
        # SECURITY NOTE(review): eval() runs an arbitrary expression posted by
        # the client — consider ast.literal_eval if the payload is a dict repr
        lpush_key_store(DUPLICATE_QUEUE, {
            'round': eval(form.round.data),
            'dataset': form.dataset.data
        })
    else:
        # GET: pre-fill the form with the repr of the current round dict
        form.round.data = str(round)
    return render_template('round.html',
                           dataset=dataset,
                           round=round,
                           pipeline=pipeline,
                           form=form,
                           features=features,
                           params=params,
                           cols=params.keys(),
                           refresher=int(time.time()),
                           y_names=y_names,
                           cnf_matrix=cnf_matrix,
                           sums_matrix=sums_matrix,
                           config=get_config())
Exemplo n.º 4
0

def __select_cat(c, pipeline):
    """Return the first pipeline step whose category (element 1) equals c.

    Falls back to a 4-tuple of empty strings when no step matches.
    """
    matches = (step for step in pipeline if step[1] == c)
    return next(matches, ('', '', '', ''))


for dt in get_dataset_list(include_results=True):
    # make sure the dark-theme graph folder exists for every dataset
    graphs_dark = get_dataset_folder(dt.dataset_id) + '/graphs_dark'
    if not os.path.exists(graphs_dark):
        os.makedirs(graphs_dark)
    # regenerate best rounds only for datasets that have been searched
    if dt.status != 'created':
        print(dt.name)
        # full search history for this dataset
        df = get_search_rounds(dt.dataset_id)

        # plot search history per level (1 = models, 2 = ensembles)
        best = __get_best_models(df)
        graph_history_search(dt, df, best[best.level == 1], 1)
        graph_history_search(dt, df, best[best.level == 2], 2)

        # then persist best models & pre-processing into the key store
        set_key_store('dataset:%s:best' % dt.dataset_id,
                      best.to_dict(orient='records'))
        set_key_store('dataset:%s:best_pp' % dt.dataset_id, __get_best_pp(df))
Exemplo n.º 5
0
import pickle
from automlk.dataset import get_dataset_list, create_graph_data
from automlk.graphs import *
from automlk.worker import get_search_rounds
from automlk.models import get_pred_eval_test

"""
module specifically designed to update feature round graphs after new version
"""

for dataset in get_dataset_list():
    print('-'*60)
    print(dataset.name)
    # load the pickled evaluation set produced by the search workers
    # SECURITY NOTE(review): pickle.load on a file — safe only if this
    # folder is trusted/local
    ds = pickle.load(open(get_dataset_folder(dataset.dataset_id) + '/data/eval_set.pkl', 'rb'))

    # regenerate prediction graphs for every recorded search round
    for msg_search in get_search_rounds(dataset.dataset_id).to_dict(orient='records'):
        # NOTE(review): this try has no except/finally in this excerpt —
        # presumably truncated by the scrape; confirm against the original file
        try:
            print('round:', msg_search['round_id'])
            y_pred_eval, y_pred_test, y_pred_submit = get_pred_eval_test(dataset.dataset_id, msg_search['round_id'])

            # generate graphs (regression vs classification variants)
            if dataset.problem_type == 'regression':
                graph_predict_regression(dataset, msg_search['round_id'], ds.y_train, y_pred_eval, 'eval')
                graph_predict_regression(dataset, msg_search['round_id'], ds.y_test, y_pred_test, 'test')
                graph_histogram_regression(dataset, msg_search['round_id'], y_pred_eval, 'eval')
                graph_histogram_regression(dataset, msg_search['round_id'], y_pred_test, 'test')
            else:
                graph_predict_classification(dataset, msg_search['round_id'], ds.y_train, y_pred_eval, 'eval')
                graph_predict_classification(dataset, msg_search['round_id'], ds.y_test, y_pred_test, 'test')
                graph_histogram_classification(dataset, msg_search['round_id'], y_pred_eval, 'eval')
                graph_histogram_classification(dataset, msg_search['round_id'], y_pred_test, 'test')
Exemplo n.º 6
0
def dataset(dataset_id):
    # zoom on a specific dataset
    dataset = get_dataset(dataset_id)
    search = get_search_rounds(dataset.dataset_id)
    # pre-built sphinx documentation artefacts, if they exist
    doc_path = os.path.abspath(
        get_dataset_folder(dataset_id) + '/docs/_build/html/index.html')
    doc_pdf = os.path.abspath(
        get_dataset_folder(dataset_id) + '/docs/_build/latex/dataset.pdf')
    form = EditFeatureForm()
    form.set_ref_choices([(t.textset_id, t.name) for t in get_textset_list()])
    data_form = DataForm()
    # selectable features: kept columns, excluding the target column
    data_form.set_choices([
        f.name for f in dataset.features
        if f.to_keep and f.name != dataset.y_col
    ])
    fe_content = get_feature_engineering(dataset_id)
    metrics_name, metrics_best, metrics_content = get_specific_metrics(
        dataset_id)
    sample = get_dataset_sample(dataset_id)
    # blank out doc links when the build artefacts are missing
    if not os.path.exists(doc_path):
        doc_path = ''
    if not os.path.exists(doc_pdf):
        doc_pdf = ''
    if len(search) > 0:
        best = get_best_models(dataset_id)
        best_pp = get_best_pp(dataset_id)
        # separate models (level 1) from ensembles (level 2)
        best1 = [b for b in best if b['level'] == 1]
        best2 = [b for b in best if b['level'] == 2]
        return render_template('dataset.html',
                               dataset=dataset,
                               best1=best1,
                               best2=best2,
                               best_pp=best_pp,
                               n_searches1=len(search[search.level == 1]),
                               n_searches2=len(search[search.level == 2]),
                               form=form,
                               data_form=data_form,
                               doc_path=doc_path,
                               doc_pdf=doc_pdf,
                               fe_content=fe_content,
                               metrics_name=metrics_name,
                               metrics_best=metrics_best,
                               metrics_content=metrics_content,
                               sample=sample,
                               refresher=int(time.time()),
                               config=get_config())
    else:
        # no search round yet: render without best/ensemble sections
        return render_template('dataset.html',
                               dataset=dataset,
                               n_searches1=0,
                               doc_path=doc_path,
                               form=form,
                               data_form=data_form,
                               fe_content=fe_content,
                               metrics_name=metrics_name,
                               metrics_best=metrics_best,
                               metrics_content=metrics_content,
                               sample=sample,
                               refresher=int(time.time()),
                               config=get_config())
Exemplo n.º 7
0
def gener_doc(dataset):
    """
    generate the documentation of this dataset

    builds the sphinx sources (conf, index, dataset and best-round pages),
    runs the platform-specific build script, then zips the html output.

    :param dataset: dataset object
    :return:
    """
    # check or create doc folder
    folder = get_dataset_folder(dataset.dataset_id) + '/docs'
    if not os.path.exists(folder):
        os.makedirs(folder)
        os.makedirs(folder + '/_build')
        os.makedirs(folder + '/_static')
        os.makedirs(folder + '/_templates')

    # generate sphinx configuration and build scripts
    render('conf.txt', folder + '/conf.py', dataset=dataset)
    render('make.bat', folder + '/make.bat', dataset=dataset)
    render('makefile.txt', folder + '/Makefile', dataset=dataset)

    # generate index
    render('index.rst', folder + '/index.rst', dataset=dataset)

    # dataset data and features
    search = get_search_rounds(dataset.dataset_id)
    if len(search) > 0:
        best = get_best_models(dataset.dataset_id)
        best_pp = get_best_pp(dataset.dataset_id)
        # separate models (level 1) from ensembles (level 2)
        best1 = [b for b in best if b['level'] == 1]
        best2 = [b for b in best if b['level'] == 2]
        print(len(best1), len(best2))
        print(best1[:2])
        render('dataset.rst',
               folder + '/dataset.rst',
               dataset=dataset,
               best1=best1,
               best2=best2,
               best_pp=best_pp,
               n_searches1=len(search[search.level == 1]),
               n_searches2=len(search[search.level == 2]))

        # then one page per best round of each level
        N_ROUNDS = 5
        top_round_ids = [b['round_id'] for b in best1[:N_ROUNDS]] + \
                        [b['round_id'] for b in best2[:N_ROUNDS]]
        for round_id in top_round_ids:
            # single matching search row as a dict
            # (renamed from 'round' to avoid shadowing the builtin)
            round_data = search[search.round_id == int(round_id)].to_dict(
                orient='records')[0]
            # exclude passthrough and no scaling for display
            pipeline = [
                s for s in round_data['pipeline']
                if s[0] not in ['NO-SCALE', 'PASS']
            ]
            params = get_round_params(search, round_id)
            features = get_feature_importance(dataset.dataset_id, round_id)
            render('round.rst',
                   folder + '/round_%s.rst' % round_id,
                   dataset=dataset,
                   round=round_data,
                   pipeline=pipeline,
                   features=features,
                   params=params,
                   cols=params.keys())
    else:
        # no search results yet: minimal dataset page
        render('dataset.rst',
               folder + '/dataset.rst',
               dataset=dataset,
               n_searches1=0)

    # then generate html and pdf with make
    if sys.platform == 'linux':
        subprocess.call([
            'sh', '../scripts/gen_doc.sh',
            os.path.abspath(get_dataset_folder(dataset.dataset_id) + '/docs')
        ])
    else:
        # NOTE(review): shell string with a concatenated path — command
        # injection risk if dataset ids were ever user-controlled
        os.system(
            'call ../scripts/gen_doc ' +
            os.path.abspath(get_dataset_folder(dataset.dataset_id) + '/docs'))

    # generate zip file of the html site
    with zipfile.ZipFile(
            get_dataset_folder(dataset.dataset_id) + '/doc.zip', 'w') as z:
        root = get_dataset_folder(dataset.dataset_id) + '/docs/_build/html/'
        # 'subdir' renamed from 'dir' to avoid shadowing the builtin
        for subdir in ['', '_static/', '_images/', '_sources/']:
            for f in glob.glob(root + subdir + '*.*'):
                z.write(f,
                        dataset.dataset_id + '/' + subdir + os.path.basename(f))