def contract_view():
    """Render ``contract_view.html`` with the contract data sets.

    The records returned by ``data.load_data()`` are passed to the template
    as ``object_list``, together with the results of
    ``data.load_districts()``, ``data.dollars_by_product_service_code()``
    and ``data.contracts_by_city()`` (the latter two computed from the
    records).
    """
    records = data.load_data()
    # Dict entries are evaluated top to bottom, so the helpers run in the
    # order: districts, products, cities.
    context = {
        'object_list': records,
        'districts_list': data.load_districts(),
        'products_list': data.dollars_by_product_service_code(records),
        'recipient_city_list': data.contracts_by_city(records),
    }
    return render_template('contract_view.html', **context)
def overview_data():
    """Render ``company_overview.html`` with the contract data sets.

    The template receives the loaded records as ``object_list`` plus the
    results of ``data.dollars_by_product_service_code()`` and
    ``data.contracts_by_city()`` computed from those records.
    """
    # NOTE: the search form is currently disabled:
    # form = searchForm(request.form)
    records = data.load_data()
    context = {
        'object_list': records,
        'products_list': data.dollars_by_product_service_code(records),
        'recipient_city_list': data.contracts_by_city(records),
    }
    return render_template('company_overview.html', **context)
def list(city_id):
    """Render ``geo-list.html`` with the records of one city.

    ``city_id`` is compared against element 5 of each row returned by
    ``data.contracts_by_city()``.  For the first matching row the loaded
    records are narrowed to those whose ``"RecipientCity"`` equals that same
    element and the template is rendered; if no row matches, ``abort(404)``
    is called.

    NOTE: this function's name shadows the builtin ``list`` for the rest of
    the module.
    """
    records = data.load_data()
    city_rows = data.contracts_by_city(records)

    # First summary row whose element 5 equals the requested id, if any.
    match = next((row for row in city_rows if row[5] == city_id), None)
    if match is not None:
        in_city = [rec for rec in records if match[5] == rec["RecipientCity"]]
        return render_template('geo-list.html', object_list=in_city)

    abort(404)
def contract_list(product_id):
    """Render ``list.html`` with the records of one product/service code.

    ``product_id`` is compared against element 4 of each row returned by
    ``data.dollars_by_product_service_code()``.  For the first matching row
    the loaded records are narrowed to those whose ``"ProductorServiceCode"``
    equals element 0 of that row and the template is rendered; if no row
    matches, ``abort(404)`` is called.
    """
    records = data.load_data()
    product_rows = data.dollars_by_product_service_code(records)

    # First summary row whose element 4 equals the requested id, if any.
    match = next((row for row in product_rows if row[4] == product_id), None)
    if match is not None:
        for_product = [
            rec for rec in records if match[0] == rec["ProductorServiceCode"]
        ]
        return render_template('list.html', object_list=for_product)

    abort(404)
def check_agreement(model, variables):
    """Compute and log the KS metric on the ``check_agreement`` sample.

    The model's second ``predict_proba`` column is evaluated on the
    ``variables`` columns of the sample, split by whether ``signal`` is 0 or
    1, and handed to ``evaluation.compute_ks`` together with the matching
    ``weight`` values.

    Returns the KS value; the log line also reports whether it is below 0.09.
    """
    sample = load_data('check_agreement')
    scores = model.predict_proba(sample[variables])[:, 1]

    labels = sample['signal']
    label_is_0 = labels == 0
    label_is_1 = labels == 1

    ks = evaluation.compute_ks(
        scores[labels.values == 0],
        scores[labels.values == 1],
        sample[label_is_0]['weight'].values,
        sample[label_is_1]['weight'].values)

    bot.info('KS metric %s %s' % (ks, ks < 0.09))
    return ks
def main(CONFIG):
    """Build the stacked model, report its CV AUC and write a submission.

    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    ``CONFIG`` must provide the attributes read below: ``use_cache``,
    ``grid_search``, ``stack``, ``fwls``, ``model_selection``, ``iter``,
    ``verbose``, ``diagnostics`` and ``outputfile``.
    """
    SEED = 42
    selected_models = [
        "LR:tuples_sf",
        "LR:greedy_sfl",
        "LR:greedy2_sfl",
        "LR:greedy3_sf",
        "RFC:basic_b",
        "RFC:tuples_f",
        "RFC:tuples_fd",
        "RFC:greedy_f",
        "RFC:greedy2_f",
        "GBC:basic_f",
        "GBC:tuples_f",
        "LR:greedy_sbl",
        "GBC:greedy_c",
        "GBC:tuples_cf",
        #"RFC:effects_f",  # experimental; added after the competition
    ]

    # Create the models on the fly
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'ETC': ensemble.ExtraTreesClassifier,
    }
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    datasets = [dataset for _, dataset in models]

    logger.info("loading data")
    y, X = load_data('train.csv')
    X_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(X, X_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(model, feature_set, y,
                                          grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(
        models, stack=CONFIG.stack, fwls=CONFIG.fwls,
        model_selection=CONFIG.model_selection,
        use_cached_models=CONFIG.use_cache)

    # Metrics
    logger.info("computing cv score")
    auc_sum = 0.0
    for i in range(CONFIG.iter):
        # A different but reproducible 80/20 split on every iteration.
        train, cv = cross_validation.train_test_split(
            range(len(y)), test_size=.20, random_state=1+i*SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (fold %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_sum += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first fold
            logger.info("plotting learning curve")
            diagnostics.learning_curve(clf, y, train, cv)
            diagnostics.plot_roc(fpr, tpr)

    # Average once so both log lines below report the same figure.  The
    # submission message used to print the running sum, which overstated
    # the AUC by a factor of CONFIG.iter.
    mean_auc = auc_sum / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")
def check_correlation(model, variables):
    """Compute and log the CvM metric on the ``check_correlation`` sample.

    The model's second ``predict_proba`` column, evaluated on the
    ``variables`` columns of the sample, is passed to ``compute_cvm``
    together with the sample's ``mass`` column.

    Returns the CvM value; the log line also reports whether it is below
    0.002.
    """
    sample = load_data('check_correlation')
    scores = model.predict_proba(sample[variables])[:, 1]
    cvm = compute_cvm(scores, sample['mass'])

    report = (cvm, cvm < 0.002)
    bot.info('CvM metric %s %s' % report)
    return cvm
def main(CONFIG):
    """Build the stacked model, report its CV AUC and write a submission.

    The final model is a combination of several base models, which are then
    combined using StackedClassifier defined in the helpers.ml module.

    The list of models and associated datasets is generated automatically
    from their identifying strings. The format is as follows:
    A:b_c where A is the initials of the algorithm to use, b is the base
    dataset, and c is the feature set and the variants to use.

    ``CONFIG`` must provide the attributes read below: ``use_cache``,
    ``grid_search``, ``stack``, ``fwls``, ``model_selection``, ``iter``,
    ``verbose``, ``diagnostics`` and ``outputfile``.
    """
    SEED = 42
    selected_models = [
        "GBC:expanded",
    ]

    # Create the models on the fly
    model_classes = {
        'LR': linear_model.LogisticRegression,
        'GBC': ensemble.GradientBoostingClassifier,
        'RFC': ensemble.RandomForestClassifier,
        'MLP': neural_network.MLPClassifier,
        'ETC': ensemble.ExtraTreesClassifier,
    }
    models = []
    for item in selected_models:
        model_id, dataset = item.split(':')
        model = model_classes[model_id]()
        model.set_params(random_state=SEED)
        models.append((model, dataset))

    # All datasets are prepared regardless of which models are selected.
    #datasets = [dataset for model, dataset in models]
    datasets = ["basic", "residuals", "stats", "expanded"]

    logger.info("loading data")
    y, x = load_data('train.csv')
    x_test = load_data('test.csv', return_labels=False)

    logger.info("preparing datasets (use_cache=%s)", str(CONFIG.use_cache))
    create_datasets(x, x_test, y, datasets, CONFIG.use_cache)

    # Set params
    for model, feature_set in models:
        model.set_params(**ml.find_params(
            model, feature_set, y, grid_search=CONFIG.grid_search))
    clf = ml.StackedClassifier(models,
                               stack=CONFIG.stack,
                               fwls=CONFIG.fwls,
                               model_selection=CONFIG.model_selection,
                               use_cached_models=CONFIG.use_cache)

    # Results
    # Basic dataset
    #   GBC:basic - 5 it: 0.54569
    #   LR:basic - 5 it: 0.49412
    # Series dataset
    #   GBC:stats - 5 it: 0.76338
    #   MLP:stats - 5 it: 0.62772
    #   GBC:stats + RFC:stats - 5 it: 0.73487
    # Expanded dataset
    #   GBC:expanded - 5 it: 0.88390
    #   RFC:expanded - 5 it: 0.87114
    #   ETC:expanded - 5 it: 0.86704
    #   GBC + RFC:expanded - 5 it: 0.87741

    # Metrics
    logger.info("computing cv score")
    auc_sum = 0.0
    for i in range(CONFIG.iter):
        # A different but reproducible 80/20 split on every iteration.
        train, cv = model_selection.train_test_split(range(len(y)),
                                                     test_size=.20,
                                                     random_state=1 + i * SEED)
        cv_preds = clf.fit_predict(y, train, cv, show_steps=CONFIG.verbose)

        fpr, tpr, _ = metrics.roc_curve(y[cv], cv_preds)
        roc_auc = metrics.auc(fpr, tpr)
        logger.info("AUC (it %d/%d): %.5f", i + 1, CONFIG.iter, roc_auc)
        auc_sum += roc_auc

        if CONFIG.diagnostics and i == 0:  # only plot for first it
            logger.info("plotting learning curve")
            # Caching is switched off for the learning curve and then put
            # back to what it was.  It used to be forced to True afterwards,
            # which enabled caching for the final fit even when
            # CONFIG.use_cache was false.
            # NOTE(review): assumes StackedClassifier keeps the constructor
            # flag in ``use_cached_models`` - confirm in helpers.ml.
            previous = getattr(clf, 'use_cached_models', CONFIG.use_cache)
            clf.use_cached_models = False
            try:
                diagnostics.learning_curve(clf, y, train, cv, n=10)
            finally:
                clf.use_cached_models = previous
            diagnostics.plot_roc(fpr, tpr)

    # Average once so both log lines below report the same figure.  The
    # submission message used to print the running sum, which overstated
    # the AUC by a factor of CONFIG.iter.
    mean_auc = auc_sum / CONFIG.iter if CONFIG.iter else 0.0
    if CONFIG.iter:
        logger.info("Mean AUC: %.5f", mean_auc)

    # Create submissions
    if CONFIG.outputfile:
        logger.info("making test submissions (CV AUC: %.4f)", mean_auc)
        preds = clf.fit_predict(y, show_steps=CONFIG.verbose)
        save_results(preds, CONFIG.outputfile + ".csv")
#!/usr/bin/env python from sklearn.ensemble import GradientBoostingClassifier from helpers.data import load_data from helpers.logger import bot import pandas #bot.debug("Hello, here is a debug message!") #bot.debug("Hello, here is a warning message!") #bot.debug("Hello, here is an error message!") #bot.debug("Hello, here is an info message!") train = load_data(name="training") ## BUILD MODEL HERE #################################### # Baseline is provided as an example variables = ['LifeTime', 'FlightDistance', 'pt'] baseline = GradientBoostingClassifier(n_estimators=40, learning_rate=0.01, subsample=0.7, min_samples_leaf=10, max_depth=7, random_state=11) baseline.fit(train[variables], train['signal']) # MODEL TESTING ########################################
def clamp(x): return min(max(x, .00000001), .99999999) return np.vectorize(lambda x: -math.log((1 - clamp(x))/clamp(x)))(X) def print_param(obj, params, prefix=''): for param in params: if hasattr(obj, param): paramvalue = getattr(obj, param) if "coef" in param: paramvalue /= np.sum(paramvalue) print prefix + param + ": " + str(paramvalue) mean_prediction = 0.0 y = load_data('train.csv')[0] y = y[range(len(y) - 7770, len(y))] files = ["log75", "paul", "enp", "NB55", "NB45"] totransform = [] preds = [] for filename in files: with open("internal/new/final/%s.csv" % filename) as f: pred = np.loadtxt(f, delimiter=',', usecols=[1], skiprows=1) if filename in totransform: pred = inverse_transform(pred) preds.append(pred) X = np.array(preds).T standardizer = preprocessing.StandardScaler()
def city_contract(city_id, contract_id):
    """Render ``contract.html`` for the records matching ``contract_id``.

    Every loaded record whose ``'record_count'`` equals ``contract_id`` is
    collected, and the resulting list (possibly empty) is passed to the
    template as ``object``.  ``city_id`` is accepted but not used here.
    """
    matches = []
    for row in data.load_data():
        if row['record_count'] == contract_id:
            matches.append(row)
    return render_template('contract.html', object=matches)