# === Example #1 (snippet separator; original marker: "示例#1" / "0") ===
def load_data(data_path, processor_paths, fit=True):
    """Load and preprocess a learning dataset in a single call.

    Thin convenience wrapper around LearningDataAdapter and PreProcess.

    Parameters
    ----------
    data_path : str
        Path to the data file (currently only .csv is supported).
    processor_paths : dict
        Save/load locations for each processor, e.g.
        {'imputer': 'imputer_save_path',
         'scaler' : 'scaler_save_path',
         'encoder': 'encoder_save_path'}.
    fit : bool, default True
        When True, fit the imputer/scaler/encoder on this data and save
        them to ``processor_paths``; when False, load previously fitted
        processors from those paths instead.

    Returns
    -------
    tuple
        (X_trans, y, w): transformed features, labels, and sample weights.
    """
    # Parse the raw file into numeric/categorical features plus labels/weights.
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file(data_path)

    # Either fit fresh processors (and persist them) or reload saved ones.
    processor = PreProcess()
    if fit:
        processor.fit(adapter.X_num, adapter.X_cat, processor_paths)
    else:
        processor.load(processor_paths)

    features = processor.transform(adapter.X_num, adapter.X_cat)
    return features, adapter.y, adapter.w
    # NOTE(review): everything below follows the `return` above and is
    # unreachable here — it looks like a fragment of a separate DB-scoring
    # script that was concatenated into this file. Preserved byte-for-byte.
    print
    print "+ Connecting to {0} to populate column in table {1}".format(args.dbname, args.input_table_name)
    print

    print "  Loading models."

    # Each evaluator bundles a saved imputer/scaler/encoder with a fitted
    # model (paths suggest logistic regression and a GBDT — TODO confirm
    # against LoadModelEvaluator's signature).
    logre_evaluator = LoadModelEvaluator(
        "models/logre_imputer.pkl", "models/logre_scaler.pkl", "models/logre_encoder.pkl", "models/logre.pkl"
    )

    gbdt300_evaluator = LoadModelEvaluator(
        "models/gbdt300_imputer.pkl", "models/gbdt300_scaler.pkl", "models/gbdt300_encoder.pkl", "models/gbdt300.pkl"
    )

    # for_learning=False: adapter is configured for scoring, not training.
    adapter = LearningDataAdapter(for_learning=False)

    print

    # Output CSV: header declares one score column per model.
    f = open(args.output_fname, "w")
    f.write("eid,logre_signal_score,gbdt300\n")

    print "  Predicting and updating."
    print "  Started on {0}".format(time.ctime(time.time()))
    # NOTE(review): this `with` statement is truncated — the remainder of
    # the call and its body are missing from this fragment.
    with PsqlReader(
        database=args.dbname,
        select_table_name=args.input_table_name,
        itersize=200000,
        arraysize=200000,
        rollback=False,
        debug=False,
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from LearningDataAdapter import LearningDataAdapter
from ModelEvaluator import ModelEvaluator

if __name__ ==  '__main__':

    # data preprocessing

    # Python 2 training-prep script: load the training sample through the
    # project's adapter, then fit and persist the preprocessing chain.
    print 'Importing training data... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/train.csv')
    X_num, X_cat = adapter.X_num, adapter.X_cat  # numeric / categorical feature blocks
    w, y = adapter.w, adapter.y  # sample weights and labels

    print 'Fitting preprocessors... '

    # Imputer: replace NaNs in numeric columns with the per-column mean
    # (axis=0). Persisted so the identical transform can be replayed later.
    imp = preprocessing.Imputer(
        missing_values='NaN', strategy='mean', axis=0
    )
    imp.fit(X_num)
    X_num_trans = imp.transform(X_num)
    joblib.dump(imp, 'models/gbdt300_imputer.pkl')

    # Scaler
    # NOTE(review): fragment is truncated mid-call here; the StandardScaler
    # construction and the rest of the script continue beyond this snippet.
    scaler = preprocessing.StandardScaler(
        with_mean=True,
        with_std=True
# === Example #4 (snippet separator; original marker: "示例#4" / "0") ===
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from LearningDataAdapter import LearningDataAdapter
from ModelEvaluator import ModelEvaluator

if __name__ == '__main__':

    # data preprocessing

    # Python 2 training-prep script (a tighter duplicate of the previous
    # fragment): fit the imputer and scaler on the training sample and
    # persist both for scoring time.
    print 'Importing training data... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/train.csv')
    X_num, X_cat = adapter.X_num, adapter.X_cat  # numeric / categorical feature blocks
    w, y = adapter.w, adapter.y  # sample weights and labels

    print 'Fitting preprocessors... '

    # Imputer: fill NaNs in numeric columns with per-column means, then
    # save the fitted imputer so the same transform can be replayed later.
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_num)
    X_num_trans = imp.transform(X_num)
    joblib.dump(imp, 'models/gbdt300_imputer.pkl')

    # Scaler: zero-mean / unit-variance standardization of the imputed
    # numeric features; fitted scaler is persisted alongside the imputer.
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    scaler.fit(X_num_trans)
    X_num_trans = scaler.transform(X_num_trans)
    joblib.dump(scaler, 'models/gbdt300_scaler.pkl')
import matplotlib.pyplot as plt
from LearningDataAdapter import LearningDataAdapter
from ModelEvaluator import ModelEvaluator
from sklearn.metrics import roc_curve
import glob
from LearningScore import learning_curve

if __name__ ==  '__main__':


    # Non-interactive backend so figures can render without a display.
    plt.switch_backend('agg')


    # obtain valid data
    print 'Importing test sample... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/validate.csv')
    X_num, X_cat = adapter.X_num, adapter.X_cat

    # NOTE(review): `joblib` is not imported in this fragment's visible
    # import block — presumably `from sklearn.externals import joblib`
    # appears elsewhere; verify before running.
    imp = joblib.load("./models/imputer.pkl")
    scaler = joblib.load("./models/scaler.pkl")
    enc = joblib.load("./models/encoder.pkl")
    evaluator = ModelEvaluator(
    imputer=imp, scaler=scaler,
    encoder=enc)

    # Apply the loaded preprocessing chain to the validation features.
    pred_x = evaluator.preprocess(X_num,X_cat)
    pred_w, pred_y = adapter.w, adapter.y
    print

# === Example #6 (snippet separator; original marker: "示例#6" / "0") ===
    # NOTE(review): fragment begins mid-function — the enclosing `def` and
    # the `args` parsing are not visible in this snippet.
    print

    print '  Adding new column to table.'
    add_table_column(args.dbname, args.table_name)
    print

    print '  Loading models.'
    # Load the persisted preprocessing chain and the random-forest model.
    imp = joblib.load('models/imputer.pkl')
    scaler = joblib.load('models/scaler.pkl')
    enc = joblib.load('models/encoder.pkl')
    rf = joblib.load('models/rf.pkl')
    evaluator = ModelEvaluator(imputer=imp,
                               scaler=scaler,
                               encoder=enc,
                               model=rf)
    # for_learning=False: adapt raw DB records for scoring, not training.
    adapter = LearningDataAdapter(for_learning=False)
    print

    print '  Predicting and updating.'
    print '  Started on {0}'.format(time.ctime(time.time()))
    # Stream the table in 200k-row batches with auto-rollback disabled
    # (semantics of SqlDataLoader's flags presumed — TODO confirm).
    with SqlDataLoader(database=args.dbname,
                       table_name=args.table_name,
                       itersize=200000,
                       arraysize=200000,
                       rollback=False,
                       debug=False) as sql_loader:
        sql_loader.start()
        while sql_loader.curr_records:
            # One dot per batch as a lightweight progress indicator.
            sys.stdout.write('.')
            sys.stdout.flush()
            # NOTE(review): loop body is truncated here by the fragment.
            adapter.adapt_records(sql_loader.curr_records)
# === Example #7 (snippet separator; original marker: "示例#7" / "0") ===
    # NOTE(review): these three lines are the tail of a function whose
    # definition lies outside this fragment (n_success / n_hot undefined here).
    print '  Number of events with successful choices: {0}'.format(n_success)
    # float() forces true division under Python 2 integer semantics.
    print '  Ratio: {0}'.format(n_success / float(n_hot))
    print


if __name__ == '__main__':

    print '+ Loading pickled objects... '
    # Persisted preprocessing chain plus the fitted random-forest model.
    imp = joblib.load('models/imputer.pkl')
    scaler = joblib.load('models/scaler.pkl')
    enc = joblib.load('models/encoder.pkl')
    rf = joblib.load('models/rf.pkl')
    print

    print '+ Importing test sample... '
    adapter = LearningDataAdapter(for_learning=True)
    adapter.adapt_file('data/validate.csv')
    print

    print '+ Predicting test sample candidate scores... '
    evaluator = ModelEvaluator(imputer=imp,
                               scaler=scaler,
                               encoder=enc,
                               model=rf)
    # [:, 1] presumably selects the positive-class probability column of an
    # sklearn-style predict_proba output — TODO confirm in ModelEvaluator.
    score = evaluator.predict_proba(adapter.X_num, adapter.X_cat)[:, 1]
    print

    print '+ Assessing model results... '
    print

    # NOTE(review): fragment is truncated after this line.
    print '  Selecting best candidate... '