Exemplo n.º 1
0
import json
import os
import pprint
import sys
# Put the directory named by the CCETC_ROOT environment variable on the
# import path (presumably so the ccetc_py import that follows resolves).
# Raises KeyError if CCETC_ROOT is not set.
sys.path.append(os.path.join(os.environ['CCETC_ROOT']))
import ccetc_py.info
from lib.project import project

# Project registry: read the list of project descriptions from
# res/project_list.json and index the resulting project objects by the
# "name" field of each description (a later duplicate name replaces an
# earlier one).
with open("res/project_list.json") as f:
    projects_json = json.load(f)

projects = {}
for entry in projects_json:
    # Build the project object before reading the name key.
    instance = project(entry)
    projects[entry["name"]] = instance

# Node and group collections, as returned by ccetc_py.info.
nodes = ccetc_py.info.nodes()
groups = ccetc_py.info.groups()

def getProject(project):
    '''Look up ``project`` in the module-level ``projects`` registry.

    Returns the registered project object, or None when no project is
    stored under that key.
    '''
    return projects.get(project)

def getNode(node):
    '''Look up ``node`` in the module-level ``nodes`` collection.

    Returns the matching node object, or None when ``node`` is not
    contained in ``nodes``.
    '''
    return nodes[node] if node in nodes else None
Exemplo n.º 2
0
    logging.info('Loading test dataset')
    test_df = load_test_df(conf['svdff.dataset'])

    logging.info('Computing test features')
    X = compute_feature_matrix(test_df, vectorizer, combine='stack')

    logging.info('Computing test SVD features')
    U = X.dot(VT.transpose().dot(Sinv))

    logging.info('Symmetrizing input features')
    Uq1, Uq2 = np.vsplit(U, 2)
    U = np.hstack([(Uq1 + Uq2) / 2.0, (Uq1 - Uq2) / 2.0])

    logging.info('Applying models to test dataset')
    test_df['svdff'] = np.zeros(U.shape[0])
    for q in quality['folds']:
        f = load_model(q['dump'])
        p = f.predict_proba(U).flatten()
        test_df['svdff'] = test_df['svdff'] + logit(p)
    test_df['svdff'] = test_df['svdff'] / len(quality['folds'])

    logging.info('Writing test dataset')
    test_df[[
        FieldsTest.test_id,
        'svdff',
    ]].to_csv(join_path(dump_dir, 'test.csv'), index=False)


# Script entry point: when this file is executed directly (not imported),
# construct a project() and pass its `conf` attribute to main().
if __name__ == '__main__':
    main(project().conf)
    with open('hyperopt_trials.json', 'w') as f:
        json.dump(trials.results, f, indent=4)

    logging.info('Best parameters: %s', opt)

    best_trial, best_trial_result = min(enumerate(trials.results), key=lambda r: r[1]['loss'])
    logging.info('Best model %d: AUC=%s, model=%s' % (
        best_trial, best_trial_result['quality']['valid']['auc'], best_trial_result['model']['file']))

    best_model = CatBoostClassifier()
    best_model.load_model(best_trial_result['model']['file'])
    return best_trial_result['quality']['train'], best_trial_result['quality']['valid'], best_model


if __name__ == '__main__':
    conf = project().conf

    dump_dir = abspath(conf['catboost']['dump']['dir'])
    makedirs(dump_dir)

    write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon')
    write_config(conf, join_path(dump_dir, 'application.json'), 'json')
    logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log')))

    logging.info('Kaggle Talking Data')
    logging.info('Train Catboost')
    logging.info('Dump: %s', dump_dir)

    target = conf['catboost']['target']
    features = conf['catboost']['features']
    categorical_features = conf['catboost']['categorical_features']