示例#1
0
def contract_view():
    """Render the ``contract_view.html`` page.

    Loads the records and the districts from the ``data`` module, passes
    the records through ``data.dollars_by_product_service_code`` and
    ``data.contracts_by_city``, and gives all four results to the template.

    Returns:
        Whatever ``render_template`` returns for the populated template.
    """
    records = data.load_data()
    # Dict entries are evaluated top to bottom, so the ``data`` helpers run
    # in the order they are listed here.
    context = {
        'object_list': records,
        'districts_list': data.load_districts(),
        'products_list': data.dollars_by_product_service_code(records),
        'recipient_city_list': data.contracts_by_city(records),
    }
    return render_template('contract_view.html', **context)
示例#2
0
def overview_data():
    """Render the ``company_overview.html`` page.

    Loads the records from the ``data`` module and passes them, together
    with the results of ``data.dollars_by_product_service_code`` and
    ``data.contracts_by_city``, to the template.

    Returns:
        Whatever ``render_template`` returns for the populated template.
    """
    records = data.load_data()
    by_product = data.dollars_by_product_service_code(records)
    by_city = data.contracts_by_city(records)
    return render_template(
        'company_overview.html',
        object_list=records,
        products_list=by_product,
        recipient_city_list=by_city,
    )
示例#3
0
def list(city_id):
    """Render ``geo-list.html`` for the records of one city.

    The entries returned by ``data.contracts_by_city`` are scanned for the
    first one whose element at index 5 equals ``city_id``. The records
    whose ``"RecipientCity"`` equals that same element are then rendered.

    Args:
        city_id: Value compared against index 5 of each city entry.

    Returns:
        The rendered template for the first matching city entry. When no
        entry matches, ``abort(404)`` is called instead.
    """
    records = data.load_data()
    for city in data.contracts_by_city(records):
        if city[5] != city_id:
            continue
        in_city = [row for row in records if city[5] == row["RecipientCity"]]
        return render_template('geo-list.html', object_list=in_city)
    abort(404)
示例#4
0
def contract_list(product_id):
    """Render ``list.html`` for the records of one product/service entry.

    The entries returned by ``data.dollars_by_product_service_code`` are
    scanned for the first one whose element at index 4 equals
    ``product_id``. The records whose ``"ProductorServiceCode"`` equals
    element 0 of that entry are then rendered.

    Args:
        product_id: Value compared against index 4 of each product entry.

    Returns:
        The rendered template for the first matching product entry. When
        no entry matches, ``abort(404)`` is called instead.
    """
    records = data.load_data()
    for product in data.dollars_by_product_service_code(records):
        if product[4] != product_id:
            continue
        selected = [
            row for row in records
            if product[0] == row["ProductorServiceCode"]
        ]
        return render_template('list.html', object_list=selected)
    abort(404)
示例#5
0
def check_agreement(model, variables):
    """Compute and log the KS metric on the ``check_agreement`` sample.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the score.
        variables: Column selection applied to the loaded sample before
            predicting.

    Returns:
        The value returned by ``evaluation.compute_ks``.
    """
    sample = load_data('check_agreement')
    scores = model.predict_proba(sample[variables])[:, 1]

    labels = sample['signal']
    zero_mask = labels.values == 0
    one_mask = labels.values == 1

    # Scores and weights are split by the value of the 'signal' column
    # (0 first, then 1) before being handed to the KS computation.
    ks = evaluation.compute_ks(
        scores[zero_mask],
        scores[one_mask],
        sample[labels == 0]['weight'].values,
        sample[labels == 1]['weight'].values)

    # The second field logs whether the metric is below the 0.09 threshold.
    message = 'KS metric %s %s' % (ks, ks < 0.09)
    bot.info(message)
    return ks
示例#6
0
def main(CONFIG):
    """
    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.
    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is read for these attributes: use_cache, grid_search, stack,
    fwls, model_selection, iter, verbose, diagnostics and outputfile.

    The function builds the base models, prepares the datasets, runs
    CONFIG.iter random 80/20 train/validation splits while logging the AUC
    of each, and, when CONFIG.outputfile is set, refits on all the data and
    writes the predictions to "<outputfile>.csv".
    """
    SEED = 42
    selected_models = [
        "LR:tuples_sf",
        "LR:greedy_sfl",
        "LR:greedy2_sfl",
        "LR:greedy3_sf",
        "RFC:basic_b",
        "RFC:tuples_f",
        "RFC:tuples_fd",
        "RFC:greedy_f",
        "RFC:greedy2_f",
        "GBC:basic_f",
        "GBC:tuples_f",
        "LR:greedy_sbl",
        "GBC:greedy_c",
        "GBC:tuples_cf",
        #"RFC:effects_f",  # experimental; added after the competition
    ]

    # Algorithm initials -> estimator class (built once, not per model).
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'ETC': ensemble.ExtraTreesClassifier,
    }

    # Create the models on the fly
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    datasets = [dataset for model, dataset in models]

    logger.info("loading data")
    y, X = load_data('train.csv')
    X_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(X, X_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(model, feature_set, y,
                                          grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(
        models, stack=CONFIG.stack, fwls=CONFIG.fwls,
        model_selection=CONFIG.model_selection,
        use_cached_models=CONFIG.use_cache)

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        train, cv = cross_validation.train_test_split(
            range(len(y)), test_size=.20, random_state=1+i*SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first fold
            logger.info("plotting learning curve")
            diagnostics.learning_curve(clf, y, train, cv)
            diagnostics.plot_roc(fpr, tpr)

    # Average once so that every later log line reports the mean AUC; the
    # submission message used to print the running sum instead.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")
示例#7
0
def check_correlation(model, variables):
    """Compute and log the CvM metric on the ``check_correlation`` sample.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the score.
        variables: Column selection applied to the loaded sample before
            predicting.

    Returns:
        The value returned by ``compute_cvm`` for the scores and the
        sample's ``'mass'`` column.
    """
    sample = load_data('check_correlation')
    scores = model.predict_proba(sample[variables])[:, 1]
    cvm = compute_cvm(scores, sample['mass'])
    # The second field logs whether the metric is below the 0.002 threshold.
    message = 'CvM metric %s %s' % (cvm, cvm < 0.002)
    bot.info(message)
    return cvm
示例#8
0
def main(CONFIG):
    """
    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is read for these attributes: use_cache, grid_search, stack,
    fwls, model_selection, iter, verbose, diagnostics and outputfile.

    The function builds the base models, prepares the datasets, runs
    CONFIG.iter random 80/20 train/validation splits while logging the AUC
    of each, and, when CONFIG.outputfile is set, refits on all the data and
    writes the predictions to "<outputfile>.csv".
    """
    SEED = 42
    selected_models = [
        "GBC:expanded",
    ]

    # Algorithm initials -> estimator class (built once, not per model).
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'MLP': neural_network.MLPClassifier,
        'ETC': ensemble.ExtraTreesClassifier,
    }

    # Create the models on the fly
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    # The dataset list is fixed here rather than derived from `models`.
    datasets = ["basic", "residuals", "stats", "expanded"]

    logger.info("loading data")
    y, x = load_data('train.csv')
    x_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(x, x_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(
            model, feature_set, y, grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(models,
                               stack=CONFIG.stack,
                               fwls=CONFIG.fwls,
                               model_selection=CONFIG.model_selection,
                               use_cached_models=CONFIG.use_cache)

    # Results
    # Basic dataset
    # GBC:basic - 5 it: 0.54569
    # LR:basic - 5 it: 0.49412

    # Series dataset
    # GBC:stats - 5 it: 0.76338
    # MLP:stats - 5 it: 0.62772
    # GBC:stats + RFC:stats - 5 it: 0.73487

    # Expanded dataset
    # GBC:expanded - 5 it: 0.88390
    # RFC:expanded - 5 it: 0.87114
    # ETC:expanded - 5 it: 0.86704
    # GBC + RFC:expanded - 5 it: 0.87741

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        train, cv = model_selection.train_test_split(range(len(y)),
                                                     test_size=.20,
                                                     random_state=1 + i * SEED)

        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (it %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first it
            logger.info("plotting learning curve")
            # Caching is switched off while the learning curve is computed,
            # then put back to the configured value (it used to be forced
            # to True, silently enabling the cache for later iterations).
            clf.use_cached_models = False
            try:
                diagnostics.learning_curve(clf, y, train, cv, n=10)
            finally:
                clf.use_cached_models = CONFIG.use_cache
            diagnostics.plot_roc(fpr, tpr)

    # Average once so that every later log line reports the mean AUC; the
    # submission message used to print the running sum instead.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")
示例#9
0
#!/usr/bin/env python

from sklearn.ensemble import GradientBoostingClassifier
from helpers.data import load_data
from helpers.logger import bot
import pandas

#bot.debug("Hello, here is a debug message!")
#bot.debug("Hello, here is a warning message!")
#bot.debug("Hello, here is an error message!")
#bot.debug("Hello, here is an info message!")

# Training sample, fetched through the project's data helper.
train = load_data(name="training")

## BUILD MODEL HERE ####################################

# Baseline is provided as an example

# Columns of the training sample that are fed to the classifier.
variables = ['LifeTime', 'FlightDistance', 'pt']

baseline = GradientBoostingClassifier(
    random_state=11,
    max_depth=7,
    min_samples_leaf=10,
    subsample=0.7,
    learning_rate=0.01,
    n_estimators=40,
)

# Fit on the selected columns against the 'signal' column.
baseline.fit(train[variables], train['signal'])

# MODEL TESTING ########################################
示例#10
0
def main(CONFIG):
    """
    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is read for these attributes: use_cache, grid_search, stack,
    fwls, model_selection, iter, verbose, diagnostics and outputfile.

    The function builds the base models, prepares the datasets, runs
    CONFIG.iter random 80/20 train/validation splits while logging the AUC
    of each, and, when CONFIG.outputfile is set, refits on all the data and
    writes the predictions to "<outputfile>.csv".
    """
    SEED = 42
    selected_models = [
        "LR:tuples_sf",
        "LR:greedy_sfl",
        "LR:greedy2_sfl",
        "LR:greedy3_sf",
        "RFC:basic_b",
        "RFC:tuples_f",
        "RFC:tuples_fd",
        "RFC:greedy_f",
        "RFC:greedy2_f",
        "GBC:basic_f",
        "GBC:tuples_f",
        "LR:greedy_sbl",
        "GBC:greedy_c",
        "GBC:tuples_cf",
        #"RFC:effects_f",  # experimental; added after the competition
    ]

    # Algorithm initials -> estimator class (built once, not per model).
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'ETC': ensemble.ExtraTreesClassifier,
    }

    # Create the models on the fly
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    datasets = [dataset for model, dataset in models]

    logger.info("loading data")
    y, X = load_data('train.csv')
    X_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(X, X_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(model, feature_set, y,
                                          grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(
        models, stack=CONFIG.stack, fwls=CONFIG.fwls,
        model_selection=CONFIG.model_selection,
        use_cached_models=CONFIG.use_cache)

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        train, cv = cross_validation.train_test_split(
            range(len(y)), test_size=.20, random_state=1+i*SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first fold
            logger.info("plotting learning curve")
            diagnostics.learning_curve(clf, y, train, cv)
            diagnostics.plot_roc(fpr, tpr)

    # Average once so that every later log line reports the mean AUC; the
    # submission message used to print the running sum instead.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")
示例#11
0
    def clamp(x):
        return min(max(x, .00000001), .99999999)
    return np.vectorize(lambda x: -math.log((1 - clamp(x))/clamp(x)))(X)


def print_param(obj, params, prefix=''):
    """Print the attributes of ``obj`` named in ``params``.

    Attributes that ``obj`` does not have are skipped. Any attribute whose
    name contains ``"coef"`` is printed divided by its sum, so the printed
    values add up to 1.

    Args:
        obj: Object whose attributes are inspected.
        params: Iterable of attribute names to look up on ``obj``.
        prefix: String written in front of every printed line.
    """
    for param in params:
        if not hasattr(obj, param):
            continue
        paramvalue = getattr(obj, param)
        if "coef" in param:
            # Divide into a new value rather than with ``/=``: in-place
            # division on an array would rescale the coefficients stored
            # on ``obj`` itself, changing the object being inspected.
            paramvalue = paramvalue / np.sum(paramvalue)
        # Single-argument call form prints the same on Python 2 and 3.
        print(prefix + param + ": " + str(paramvalue))


# Script section: collect the per-model prediction files into one matrix.
mean_prediction = 0.0
# First element of what load_data returns for the training file
# (presumably the labels -- TODO confirm against load_data).
y = load_data('train.csv')[0]
# Keep only the entries at the last 7770 positions
# (assumes len(y) >= 7770 -- TODO confirm).
y = y[range(len(y) - 7770, len(y))]

# Base names of the prediction files read below.
files = ["log75", "paul", "enp", "NB55", "NB45"]
# Names listed here get inverse_transform applied after loading;
# the list is currently empty, so no file is transformed.
totransform = []

preds = []
for filename in files:
    with open("internal/new/final/%s.csv" % filename) as f:
        # Read the second column only, skipping the first line of the file.
        pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1)
        if filename in totransform:
            pred = inverse_transform(pred)
        preds.append(pred)
# Transpose so that each file contributes one column
# (assumes all files yield the same number of rows -- TODO confirm).
X = np.array(preds).T

standardizer = preprocessing.StandardScaler()
示例#12
0
def city_contract(city_id, contract_id):
    """Render ``contract.html`` for the records matching ``contract_id``.

    Args:
        city_id: Accepted but not used by this function.
        contract_id: Value compared against each record's
            ``'record_count'`` entry.

    Returns:
        The rendered template; its ``object`` variable is the list of all
        matching records (possibly empty).
    """
    matches = []
    for row in data.load_data():
        if row['record_count'] == contract_id:
            matches.append(row)
    return render_template('contract.html', object=matches)