示例#1
0
def model_train_validation(ins_file, oos_file, classifier, var_list_filename,
                           result_dir, output_suffix):
    """
    Train a classifier and evaluate it on the train and validation data.

    Steps, in order:
      1. load the train (``ins_file``) and validation (``oos_file``) data with
         ``load_data``, using the variables named in ``var_list_filename`` and
         the target column ``'target'``;
      2. fit ``classifier`` on the train data and pickle it to
         ``result_dir + "model.p"``;
      3. score both data sets with ``predict_proba`` and hand column 1 of the
         result (presumably the positive-class probability -- confirm against
         the classifier in use) to ``performance_eval_train_validation``;
      4. if the fitted model exposes ``feature_importances_``, write them to
         ``result_dir + 'feature_import_' + str(output_suffix) + '.csv'``.

    Args:
        ins_file: train data file, passed through to ``load_data``.
        oos_file: validation data file, passed through to ``load_data``.
        classifier: estimator providing ``fit`` and ``predict_proba``; it is
            fitted in place.
        var_list_filename: CSV file whose first column holds the variable
            names.
        result_dir: output location. It is concatenated directly with the file
            names, so it must already end with a path separator.
        output_suffix: value appended (via ``str``) to the output file names.

    Returns:
        None.
    """
    #################### Load train and validation data ####################
    print('Loading data for modeling starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data(ins_file, var_list_filename, target_name)
    Xv, yv = load_data(oos_file, var_list_filename, target_name)
    # Formatted as a single string so the line prints the same text whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("Loading data done, taking  %s secs" % (time.time() - t0))

    # Train Model
    print('\nModel training starts...')
    t0 = time.time()
    model = classifier
    model.fit(X, y)
    print("Model training done, taking  %s secs" % (time.time() - t0))
    # Save the model to disk; the context manager guarantees the handle is
    # flushed and closed even if pickling fails.
    with open(result_dir + "model.p", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Predict Train (only the probabilities are needed for the evaluation)
    p_pred = model.predict_proba(X)[:, 1]

    # Predict Validation
    pv_pred = model.predict_proba(Xv)[:, 1]

    # Performance Evaluation: Train and Validation
    performance_eval_train_validation(y, p_pred, yv, pv_pred, result_dir,
                                      output_suffix)

    #################### Random Forest Feature Importance ######################
    # Look the attribute up before touching any file: a model without
    # importances must not leave an empty CSV behind, and only the missing
    # attribute should produce the "Not RandomForest" message.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        print("Not RandomForest classifier, var importance not created")
    else:
        try:
            # 'rU' / 'wb' are the Python 2 csv-module file modes used here.
            with open(var_list_filename, 'rU') as varlist_file:
                # Skip blank lines so they cannot raise IndexError.
                var_list = [row[0] for row in csv.reader(varlist_file) if row]
            out_path = (result_dir + 'feature_import_' + str(output_suffix)
                        + '.csv')
            with open(out_path, 'wb') as out_feat_import:
                feat_import_csv = csv.writer(out_feat_import)
                feat_import_csv.writerow(
                    ['var seq num', 'var name', 'importance'])
                for seq, (name, importance) in enumerate(
                        zip(var_list, importances)):
                    feat_import_csv.writerow((seq, name, importance))
            print("RandomForest classifier, var importance was output")
        except (EnvironmentError, csv.Error) as err:
            # The export stays best-effort, but report the real cause instead
            # of blaming the classifier type.
            print("Var importance not created: %s" % err)
def model_train_validation(ins_file, oos_file, classifier, var_list_filename, output_dir, outpu):
    """
    Train a classifier, evaluate it on train and validation data, and return
    the evaluation metrics.

    Steps, in order:
      1. load the train (``ins_file``) and validation (``oos_file``) data with
         ``load_data_fast``, using the variables named in
         ``var_list_filename`` and the target column ``'target'``;
      2. pickle the per-column median of the train inputs to
         ``output_dir + 'trivial_input_values.p'`` (used for generating reason
         codes in production);
      3. fit ``classifier`` on the train data and pickle it to
         ``output_dir + "model.p"``;
      4. score both data sets with ``predict_proba`` and hand column 1 of the
         result (presumably the positive-class probability -- confirm against
         the classifier in use) to ``performance_eval_train_validation``;
      5. if the fitted model exposes ``feature_importances_``, write them to
         ``output_dir + 'feature_import_' + str(suffix) + '.csv'``.

    Args:
        ins_file: train data file, passed through to ``load_data_fast``.
        oos_file: validation data file, passed through to ``load_data_fast``.
        classifier: estimator providing ``fit`` and ``predict_proba``; it is
            fitted in place.
        var_list_filename: CSV file whose first column holds the variable
            names.
        output_dir: output location. It is concatenated directly with the file
            names, so it must already end with a path separator.
        outpu: suffix appended (via ``str``) to the output file names.

    Returns:
        The ``(ks, auc, lorenz_curve_capt_rate)`` tuple produced by
        ``performance_eval_train_validation``.
    """
    # The body refers to the suffix as ``output_suffix``, but the parameter is
    # spelled ``outpu``; bind it here so the name resolves instead of raising
    # NameError, without changing the signature callers use.
    output_suffix = outpu

    #################### Load train and validation data ####################
    print('Loading data for modeling starts ...')
    t0 = time.time()
    target_name = 'target'
    X, y = load_data_fast(ins_file, var_list_filename, target_name)
    Xv, yv = load_data_fast(oos_file, var_list_filename, target_name)
    # Formatted as a single string so the line prints the same text whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("Loading data done, taking  %s secs" % (time.time() - t0))

    # prepare trivial input values for generating reason code in production
    trivial_input_values_file = output_dir + 'trivial_input_values.p'
    trivial_input_values = median(X, axis=0)
    with open(trivial_input_values_file, 'wb') as trivial_file:
        pickle.dump(trivial_input_values, trivial_file)

    # Train Model
    print('\nModel training starts...')
    t0 = time.time()
    model = classifier
    model.fit(X, y)
    print("Model training done, taking  %s secs" % (time.time() - t0))
    # Save the model to disk; the context manager guarantees the handle is
    # flushed and closed even if pickling fails.
    with open(output_dir + "model.p", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Disabled: export to tree graph in DOT format (single trees only)
    #   tree.export_graphviz(model, out_file=output_dir + 'tree.dot')
    #   os.system("dot -Tpng " + output_dir + "tree.dot -o " + output_dir + "tree.png")

    # Predict Train (only the probabilities are needed for the evaluation)
    p_pred = model.predict_proba(X)[:, 1]

    # Predict Validation
    pv_pred = model.predict_proba(Xv)[:, 1]

    # Performance Evaluation: Train and Validation
    ks, auc, lorenz_curve_capt_rate = performance_eval_train_validation(
        y, p_pred, yv, pv_pred, output_dir, output_suffix)

    #################### Random Forest Feature Importance ######################
    # Look the attribute up before touching any file: a model without
    # importances must not leave an empty CSV behind, and only the missing
    # attribute should produce the "Not RandomForest" message.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        print("Not RandomForest classifier, var importance not created")
    else:
        try:
            # 'rU' / 'wb' are the Python 2 csv-module file modes used here.
            with open(var_list_filename, 'rU') as varlist_file:
                # Skip blank lines so they cannot raise IndexError.
                var_list = [row[0] for row in csv.reader(varlist_file) if row]
            out_path = (output_dir + 'feature_import_' + str(output_suffix)
                        + '.csv')
            with open(out_path, 'wb') as out_feat_import:
                feat_import_csv = csv.writer(out_feat_import)
                feat_import_csv.writerow(
                    ['var seq num', 'var name', 'importance'])
                for seq, (name, importance) in enumerate(
                        zip(var_list, importances)):
                    feat_import_csv.writerow((seq, name, importance))
            print("RandomForest classifier, var importance was output")
        except (EnvironmentError, csv.Error) as err:
            # The export stays best-effort so the metrics below are still
            # returned, but report the real cause instead of blaming the
            # classifier type.
            print("Var importance not created: %s" % err)

    return ks, auc, lorenz_curve_capt_rate