Python load_data 예제들, helpers.data.load_data Python 예제들

예제 #1

0

파일 보기

파일: app.py 프로젝트: dannguyen/design_prototype

def contract_view():
    """Render ``contract_view.html`` from the loaded contract records.

    The template receives the raw records plus three derived lists: the
    districts, the per-product-service-code aggregation and the per-city
    aggregation, each produced by the ``data`` helper module.
    """
    contracts = data.load_data()
    # Dict entries are evaluated top to bottom, so the helper calls run in
    # the same order as the keys are listed.
    context = {
        'object_list': contracts,
        'districts_list': data.load_districts(),
        'products_list': data.dollars_by_product_service_code(contracts),
        'recipient_city_list': data.contracts_by_city(contracts),
    }
    return render_template('contract_view.html', **context)

예제 #2

0

파일 보기

파일: app.py 프로젝트: dannguyen/design_prototype

def overview_data():
    """Render ``company_overview.html`` from the loaded contract records.

    The template receives the raw records together with the
    per-product-service-code and per-city aggregations computed by the
    ``data`` helper module.
    """
    contracts = data.load_data()
    # Dict entries are evaluated top to bottom, so the helper calls run in
    # the same order as the keys are listed.
    context = {
        'object_list': contracts,
        'products_list': data.dollars_by_product_service_code(contracts),
        'recipient_city_list': data.contracts_by_city(contracts),
    }
    return render_template('company_overview.html', **context)

예제 #3

0

파일 보기

파일: app.py 프로젝트: dannguyen/design_prototype

def list(city_id):
    """Render ``geo-list.html`` with the contracts of one city.

    The per-city aggregation is scanned for the first entry whose element at
    index 5 equals ``city_id``; the page then lists every record whose
    ``"RecipientCity"`` equals that same element. When no entry matches,
    the request is aborted with a 404.
    """
    all_contracts = data.load_data()
    for city in data.contracts_by_city(all_contracts):
        if not city[5] == city_id:
            continue
        in_city = [
            row for row in all_contracts
            if city[5] == row["RecipientCity"]
        ]
        return render_template('geo-list.html', object_list=in_city)
    abort(404)

예제 #4

0

파일 보기

파일: app.py 프로젝트: dannguyen/design_prototype

def contract_list(product_id):
    """Render ``list.html`` with the contracts of one product/service code.

    The per-product aggregation is scanned for the first entry whose element
    at index 4 equals ``product_id``; the page then lists every record whose
    ``"ProductorServiceCode"`` equals that entry's element at index 0. When
    no entry matches, the request is aborted with a 404.
    """
    all_contracts = data.load_data()
    for product in data.dollars_by_product_service_code(all_contracts):
        if not product[4] == product_id:
            continue
        for_product = [
            row for row in all_contracts
            if product[0] == row["ProductorServiceCode"]
        ]
        return render_template('list.html', object_list=for_product)
    abort(404)

예제 #5

0

파일 보기

파일: metrics.py 프로젝트: pombredanne/cftw

def check_agreement(model, variables):
    """Compute and log the KS metric of ``model`` on the agreement dataset.

    The dataset named ``'check_agreement'`` is loaded, the model's
    ``predict_proba`` column 1 is taken as the score, and the scores and
    ``'weight'`` values are split by ``'signal'`` == 0 / == 1 before being
    handed to ``evaluation.compute_ks``.

    Returns the KS value; the log line also reports whether it is below 0.09.
    """
    frame = load_data('check_agreement')
    scores = model.predict_proba(frame[variables])[:, 1]

    signal = frame['signal']
    zero_mask = signal.values == 0
    one_mask = signal.values == 1

    ks = evaluation.compute_ks(
        scores[zero_mask],
        scores[one_mask],
        frame[signal == 0]['weight'].values,
        frame[signal == 1]['weight'].values)

    bot.info('KS metric %s %s' % (ks, ks < 0.09))
    return ks

예제 #6

0

파일 보기

파일: class.py 프로젝트: Jshahan/Amazon-Employee-Access

def main(CONFIG):
    """
    Build the stacked model, report its cross-validated AUC and optionally
    write test-set predictions.

    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.
    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is a settings object; the attributes read here are ``use_cache``,
    ``grid_search``, ``stack``, ``fwls``, ``model_selection``, ``iter``,
    ``verbose``, ``diagnostics`` and ``outputfile``.
    """
    SEED = 42
    selected_models = [
        "LR:tuples_sf",
        "LR:greedy_sfl",
        "LR:greedy2_sfl",
        "LR:greedy3_sf",
        "RFC:basic_b",
        "RFC:tuples_f",
        "RFC:tuples_fd",
        "RFC:greedy_f",
        "RFC:greedy2_f",
        "GBC:basic_f",
        "GBC:tuples_f",
        "LR:greedy_sbl",
        "GBC:greedy_c",
        "GBC:tuples_cf",
        #"RFC:effects_f",  # experimental; added after the competition
    ]

    # Create the models on the fly. The lookup table does not change between
    # iterations, so it is built once outside the loop.
    model_classes = {'LR': linear_model.LogisticRegression,
                     'GBC': ensemble.GradientBoostingClassifier,
                     'RFC': ensemble.RandomForestClassifier,
                     'ETC': ensemble.ExtraTreesClassifier}
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    datasets = [dataset for _, dataset in models]

    logger.info("loading data")
    y, X = load_data('train.csv')
    X_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(X, X_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(model, feature_set, y,
                                          grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(
        models, stack=CONFIG.stack, fwls=CONFIG.fwls,
        model_selection=CONFIG.model_selection,
        use_cached_models=CONFIG.use_cache)

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        # A different but reproducible 80/20 split for every iteration.
        train, cv = cross_validation.train_test_split(
            range(len(y)), test_size=.20, random_state=1+i*SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first fold
            logger.info("plotting learning curve")
            diagnostics.learning_curve(clf, y, train, cv)
            diagnostics.plot_roc(fpr, tpr)

    # Average once so both log lines below report the mean rather than the
    # running sum; stays 0.0 when no CV iteration was requested.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        # Without train/cv indices fit_predict is left to its defaults
        # (presumably: fit on all labelled rows, predict the test set --
        # confirm against helpers.ml).
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")

예제 #7

0

파일 보기

파일: metrics.py 프로젝트: sci-f/flavours-of-physics.scif

def check_correlation(model,variables):
    """Compute and log the CvM metric of ``model`` on the correlation dataset.

    The dataset named ``'check_correlation'`` is loaded, the model's
    ``predict_proba`` column 1 is taken as the score, and the scores are
    passed to ``compute_cvm`` together with the ``'mass'`` column.

    Returns the CvM value; the log line also reports whether it is below
    0.002.
    """
    frame = load_data('check_correlation')
    features = frame[variables]
    scores = model.predict_proba(features)[:, 1]
    cvm = compute_cvm(scores, frame['mass'])
    bot.info('CvM metric %s %s' %(cvm, cvm < 0.002))
    return cvm

예제 #8

0

파일 보기

def main(CONFIG):
    """
    Build the stacked model, report its cross-validated AUC and optionally
    write test-set predictions.

    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is a settings object; the attributes read here are ``use_cache``,
    ``grid_search``, ``stack``, ``fwls``, ``model_selection``, ``iter``,
    ``verbose``, ``diagnostics`` and ``outputfile``.
    """
    SEED = 42
    selected_models = [
        "GBC:expanded",
    ]

    # Create the models on the fly. The lookup table does not change between
    # iterations, so it is built once outside the loop.
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'MLP': neural_network.MLPClassifier,
        'ETC': ensemble.ExtraTreesClassifier
    }
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    # Every dataset is prepared, not only those named in selected_models.
    datasets = ["basic", "residuals", "stats", "expanded"]

    logger.info("loading data")
    y, x = load_data('train.csv')
    x_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(x, x_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(
            model, feature_set, y, grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(models,
                               stack=CONFIG.stack,
                               fwls=CONFIG.fwls,
                               model_selection=CONFIG.model_selection,
                               use_cached_models=CONFIG.use_cache)

    # Results
    # Basic dataset
    # GBC:basic - 5 it: 0.54569
    # LR:basic - 5 it: 0.49412

    # Series dataset
    # GBC:stats - 5 it: 0.76338
    # MLP:stats - 5 it: 0.62772
    # GBC:stats + RFC:stats - 5 it: 0.73487

    # Expanded dataset
    # GBC:expanded - 5 it: 0.88390
    # RFC:expanded - 5 it: 0.87114
    # ETC:expanded - 5 it: 0.86704
    # GBC + RFC:expanded - 5 it: 0.87741

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        # A different but reproducible 80/20 split for every iteration.
        train, cv = model_selection.train_test_split(range(len(y)),
                                                     test_size=.20,
                                                     random_state=1 + i * SEED)

        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (it %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first it
            logger.info("plotting learning curve")
            # Cached models are switched off only for the learning-curve run
            # and switched back on right after it.
            clf.use_cached_models = False
            diagnostics.learning_curve(clf, y, train, cv, n=10)
            clf.use_cached_models = True
            diagnostics.plot_roc(fpr, tpr)

    # Average once so both log lines below report the mean rather than the
    # running sum; stays 0.0 when no CV iteration was requested.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        # Without train/cv indices fit_predict is left to its defaults
        # (presumably: fit on all labelled rows, predict the test set --
        # confirm against helpers.ml).
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")

예제 #9

0

파일 보기

파일: main.py 프로젝트: vsoch/flavours-of-physics-ftw

#!/usr/bin/env python

from sklearn.ensemble import GradientBoostingClassifier
from helpers.data import load_data
from helpers.logger import bot
import pandas

#bot.debug("Hello, here is a debug message!")
#bot.debug("Hello, here is a warning message!")
#bot.debug("Hello, here is an error message!")
#bot.debug("Hello, here is an info message!")

# Load the dataset registered under the name "training".
# NOTE(review): the column indexing below suggests a pandas DataFrame --
# confirm against helpers.data.load_data.
train = load_data(name="training")

## BUILD MODEL HERE ####################################

# Baseline is provided as an example

# Columns of ``train`` used as model inputs.
variables = ['LifeTime', 'FlightDistance', 'pt']

# Gradient-boosted trees with a fixed random_state, so repeated runs use the
# same seed.
baseline = GradientBoostingClassifier(n_estimators=40,
                                      learning_rate=0.01,
                                      subsample=0.7,
                                      min_samples_leaf=10,
                                      max_depth=7,
                                      random_state=11)

# Fit on the selected input columns, with the 'signal' column as the target.
baseline.fit(train[variables], train['signal'])

# MODEL TESTING ########################################

예제 #10

0

파일 보기

def main(CONFIG):
    """
    Build the stacked model, report its cross-validated AUC and optionally
    write test-set predictions.

    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    CONFIG is a settings object; the attributes read here are ``use_cache``,
    ``grid_search``, ``stack``, ``fwls``, ``model_selection``, ``iter``,
    ``verbose``, ``diagnostics`` and ``outputfile``.
    """
    SEED = 42
    selected_models = [
        "LR:tuples_sf",
        "LR:greedy_sfl",
        "LR:greedy2_sfl",
        "LR:greedy3_sf",
        "RFC:basic_b",
        "RFC:tuples_f",
        "RFC:tuples_fd",
        "RFC:greedy_f",
        "RFC:greedy2_f",
        "GBC:basic_f",
        "GBC:tuples_f",
        "LR:greedy_sbl",
        "GBC:greedy_c",
        "GBC:tuples_cf",
        #"RFC:effects_f",  # experimental; added after the competition
    ]

    # Create the models on the fly. The lookup table does not change between
    # iterations, so it is built once outside the loop.
    model_classes = {'LR': linear_model.LogisticRegression,
                     'GBC': ensemble.GradientBoostingClassifier,
                     'RFC': ensemble.RandomForestClassifier,
                     'ETC': ensemble.ExtraTreesClassifier}
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    datasets = [dataset for _, dataset in models]

    logger.info("loading data")
    y, X = load_data('train.csv')
    X_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(X, X_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(model, feature_set, y,
                                          grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(
        models, stack=CONFIG.stack, fwls=CONFIG.fwls,
        model_selection=CONFIG.model_selection,
        use_cached_models=CONFIG.use_cache)

    #  Metrics
    logger.info("computing cv score")
    auc_total = 0.0
    for i in range(CONFIG.iter):
        # A different but reproducible 80/20 split for every iteration.
        train, cv = cross_validation.train_test_split(
            range(len(y)), test_size=.20, random_state=1+i*SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_total += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first fold
            logger.info("plotting learning curve")
            diagnostics.learning_curve(clf, y, train, cv)
            diagnostics.plot_roc(fpr, tpr)

    # Average once so both log lines below report the mean rather than the
    # running sum; stays 0.0 when no CV iteration was requested.
    mean_auc = auc_total / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        # Without train/cv indices fit_predict is left to its defaults
        # (presumably: fit on all labelled rows, predict the test set --
        # confirm against helpers.ml).
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")

예제 #11

0

파일 보기

파일: combine.py 프로젝트: bigbear2017/amazonaccess

    def clamp(x):
        return min(max(x, .00000001), .99999999)
    return np.vectorize(lambda x: -math.log((1 - clamp(x))/clamp(x)))(X)


def print_param(obj, params, prefix=''):
    """Print the attributes of ``obj`` named in ``params``.

    Names that ``obj`` does not have are skipped silently. Attributes whose
    name contains ``"coef"`` are shown divided by their sum, so the printed
    values add up to 1. ``obj`` itself is never modified.

    Args:
        obj: object whose attributes are inspected.
        params: iterable of attribute names to print.
        prefix: text put in front of every printed line.
    """
    for param in params:
        if not hasattr(obj, param):
            continue
        paramvalue = getattr(obj, param)
        if "coef" in param:
            # Normalise a copy: an in-place ``/=`` on an array would rescale
            # the coefficients stored on ``obj`` as a side effect of printing.
            paramvalue = paramvalue / np.sum(paramvalue)
        # Single-argument call form works as a Python 2 statement and as a
        # Python 3 function.
        print(prefix + param + ": " + str(paramvalue))


mean_prediction = 0.0
# Only the first element returned by load_data is kept.
# NOTE(review): presumably the training labels -- confirm against load_data.
y = load_data('train.csv')[0]
# Keep only the last 7770 entries of y.
y = y[range(len(y) - 7770, len(y))]

# Base names of the prediction files to combine (read from
# internal/new/final/<name>.csv below).
files = ["log75", "paul", "enp", "NB55", "NB45"]
# Files whose predictions go through inverse_transform first; empty here, so
# no file is transformed.
totransform = []

preds = []
for filename in files:
    with open("internal/new/final/%s.csv" % filename) as f:
        # Skip the first row and read only the second column.
        pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1)
        if filename in totransform:
            pred = inverse_transform(pred)
        preds.append(pred)
# Transpose so each file's predictions form one column of X.
X = np.array(preds).T

standardizer = preprocessing.StandardScaler()

예제 #12

0

파일 보기

파일: app.py 프로젝트: dannguyen/design_prototype

def city_contract(city_id, contract_id):
    """Render ``contract.html`` for the records matching ``contract_id``.

    Every loaded record whose ``'record_count'`` equals ``contract_id`` is
    collected, and the resulting list (possibly empty) is passed to the
    template as ``object``. ``city_id`` is accepted but not used here.
    """
    matches = []
    for entry in data.load_data():
        if entry['record_count'] == contract_id:
            matches.append(entry)
    return render_template('contract.html', object=matches)