def main():
    """Run the modeling pipeline: load the config, guard against duplicate
    experiment logs, build the train/test datasets, and optionally dump
    them to CSV.

    Relies on module-level state: ``args`` (CLI options), ``cfg_main``
    (parsed main config), ``logger``, and ``path_to_dumps``.
    """
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
                                                             test.x.shape))

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        datasets = [(train, 'train'), (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
                else:
                    # Write only on success. The original ran to_csv in a
                    # `finally` clause, which raised NameError on `df`
                    # whenever to_df() had failed — defeating the handler.
                    df.to_csv(os.path.join(path_to_dumps, filename))
def get_config(model_id):
    """Return the complete MongoDB log document for *model_id*.

    *model_id* may be a string or an ObjectId.
    """
    log_cfg = cfg['logger']
    mongo = Logger(host=log_cfg['uri'],
                   db=log_cfg['db'],
                   collection=log_cfg['collection'])
    logging.info("Fetched model_id %s." % str(model_id))
    return mongo.get_doc_from_id(model_id)
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    """Log model results to a MongoDB database, dump predictions to CSV,
    and optionally pickle the fitted model, imputer and scaler.
    """
    # Build the MongoDB logger from the main configuration
    log_cfg = cfg_main['logger']
    mongo_logger = Logger(log_cfg['uri'], log_cfg['db'], log_cfg['collection'])

    # Precision (and score cutoff) at the top 1% and 10% of predictions
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)

    # Experiment name, when one was configured (falsy -> None)
    experiment_name = config["experiment_name"] or None

    # Logging the model records its name, parameters and datetime;
    # the extra named parameters are stored alongside it
    mongo_id = mongo_logger.log_model(
        model,
        features=list(test.feature_names),
        feature_importances=list(feature_importances),
        config=config,
        prec_at_1=prec_at_1,
        prec_at_10=prec_at_10,
        cutoff_at_1=cutoff_at_1,
        cutoff_at_10=cutoff_at_10,
        experiment_name=experiment_name,
        feature_mapping=test.feature_mapping)

    # Dump test labels, predictions and parcels to a csv file named
    # after the MongoDB document id
    dump = pd.DataFrame({
        'parcel_id': [record[0] for record in test.parcels],
        'inspection_date': [record[1] for record in test.parcels],
        'viol_outcome': test.y,
        'prediction': predictions,
    })
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))

    # Pickle the fitted artifacts next to their MongoDB id
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)
        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)
        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
def main():
    """Run the modeling pipeline: load the config, guard against duplicate
    experiment logs, build the train/test datasets, validate their NA
    proportions, and optionally dump them to CSV.

    Relies on module-level state: ``args`` (CLI options), ``cfg_main``
    (parsed main config), ``logger``, ``NAS_PROPORTION_THRESHOLD`` and
    ``path_to_dumps``.
    """
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
                                                             test.x.shape))

    # Check percentage of NAs for every feature,
    # raise an error if at least one feature has more NAs than the
    # acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        datasets = [(train, 'train'), (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
                else:
                    # Write only on success. The original ran to_csv in a
                    # `finally` clause, which raised NameError on `df`
                    # whenever to_df() had failed — defeating the handler.
                    df.to_csv(os.path.join(path_to_dumps, filename))
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    """Log model results to a MongoDB database, dump predictions to CSV,
    and optionally pickle the fitted model, imputer and scaler.

    Relies on module-level state: ``cfg_main``, ``args``, ``logger`` and the
    ``path_to_*`` output directories.
    """
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)

    # Compute precision (and score cutoff) at the top 1%, 5%, 10% and 20%
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_5, cutoff_at_5 = precision_at(test.y, predictions, 0.05)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    prec_at_20, cutoff_at_20 = precision_at(test.y, predictions, 0.2)

    # Add the name of the experiment if available
    experiment_name = (config["experiment_name"]
                       if config["experiment_name"] else None)

    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters
    ft_imp = list(feature_importances)
    ft_map = test.feature_mapping
    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=ft_imp,
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      cutoff_at_1=cutoff_at_1,
                                      prec_at_5=prec_at_5,
                                      cutoff_at_5=cutoff_at_5,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_10=cutoff_at_10,
                                      prec_at_20=prec_at_20,
                                      cutoff_at_20=cutoff_at_20,
                                      experiment_name=experiment_name,
                                      feature_mapping=ft_map)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})

    # Dump predictions to CSV, named after the MongoDB document id
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))

    # Pickle model, imputer and scaler next to their MongoDB id
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)
        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        # The original source broke this string literal across a physical
        # line, which is a SyntaxError; the literal is rejoined here.
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)
        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
# top_percent - percent taken from the top # used for random sampling percent = 4 # n_inspections - list length n_inspections = 1000 # nei_schema - db schema used to look for the neighborhood score # table nei_schema = 'features_01may2015' # nei_table - db table used to get the neighborhood scores from nei_table = 'neighborhood_score_500m_6months' # step 1 - load model # load information for the best model logger = Logger(host=main['logger']['uri'], db=main['logger']['db'], collection=main['logger']['collection']) model = logger.get_best_from_experiment(exp_name, metric) # training window train_start = datetime.strptime(model['config']['start_date'], '%d%b%Y') train_end = datetime.strptime(model['config']['fake_today'], '%d%b%Y') # change the format to match the one needed for the db train_start = train_start.strftime('%Y-%m-%d') train_end = train_end.strftime('%Y-%m-%d') today = datetime.utcnow().strftime('%Y-%m-%d') # step 2 - rank all parcels in cincinnati # use model to predict on every parcel in a given list preds = predict_on_schema(str(model['_id']), schema)
def main():
    """Run the modeling pipeline: attach optional file log handlers, load
    the config, guard against duplicate experiment logs, build the
    train/test (and optionally prediction) datasets, validate NA
    proportions, and optionally dump the datasets to CSV.

    Relies on module-level state: ``args`` (CLI options), ``cfg_main``,
    ``logger``, ``NAS_PROPORTION_THRESHOLD`` and ``path_to_dumps``.
    """
    # Optional file handlers for warning- and debug-level log output
    if args.warninglog:
        myhandler = logging.FileHandler(os.path.abspath(args.warninglog))
        myhandler.setLevel('WARNING')
        logger.addHandler(myhandler)
    if args.debuglog:
        myhandler2 = logging.FileHandler(os.path.abspath(args.debuglog))
        myhandler2.setLevel('DEBUG')
        logger.addHandler(myhandler2)

    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # Saving top predictions requires a MongoDB record to attach them to
    if args.predicttop and args.notlog:
        raise ValueError("You cannot save the top X predictions "
                         "on all parcels without also logging.")

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            # if the user hasn't selected to overwrite the record, throw error
            if not args.overwritelog:
                raise ExperimentExists(config['experiment_name'])
            else:
                mongo_logger.delete_experiment(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    if not args.predicttop:
        train, test = make_datasets(config, predictset=False)
        logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
                                                                 test.x.shape))
    else:
        train, test, preds = make_datasets(config, predictset=True)
        logger.debug('Train x shape: {} Test x shape: {} Prediction x shape {}'
                     .format(train.x.shape, test.x.shape, preds.x.shape))

    # Check percentage of NAs for every feature,
    # raise an error if at least one feature has more NAs than the
    # acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and tests sets')
        # The original source broke this statement across a physical line
        # (`datasets = ` / `[(train, ...`), a SyntaxError; rejoined here.
        datasets = [(train, 'train'), (test, 'test')]
        if args.predicttop:
            logger.info('Dumping prediction sets')
            datasets.append((preds, 'prediction'))
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    # `except Exception, e` (Python 2-only) replaced with the
                    # `as` form, valid in Python 2.6+ and 3.
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
            else:
                logger.info('{} is None, skipping dump...'.format(name))
from lib_cinci.config import load
import os
import yaml
import itertools

# Top-level script: pull all logged model results from the database so a
# group number can be assigned to each model.

# Resolve input/output locations from the environment
folder = os.environ['ROOT_FOLDER']
output_folder = os.environ['OUTPUT_FOLDER']
path_to_output = os.path.join(output_folder, 'model_groups.csv')

# Load the main YAML configuration file.
# (The original opened the file without ever closing it; `with` fixes the
# handle leak.)
name = 'config.yaml'
path = "%s/%s" % (folder, name)
with open(path, 'r') as f:
    text = f.read()
# NOTE(review): yaml.load without an explicit Loader is unsafe on untrusted
# input and deprecated in newer PyYAML; config.yaml is assumed trusted here —
# consider yaml.safe_load.
main = yaml.load(text)

# MongoDB logger holding model metadata.
# NOTE(review): `Logger`, `create_engine` and `pd` are not imported in this
# chunk — they must come from elsewhere in the file; confirm before running
# this chunk standalone.
logger = Logger(host=main['logger']['uri'],
                db=main['logger']['db'],
                collection=main['logger']['collection'])

# Build both SQLAlchemy- and libpq-style connection strings from db config
connparams = load('config.yaml')['db']
uri = '{dialect}://{user}:{password}@{host}:{port}/{database}'.format(
    **connparams)
libpq_uri = 'dbname={database} user={user} host={host} password={password} port={port}'.\
    format(**connparams)

engine = create_engine(uri)

# Pull every logged model result from the database
query = 'SELECT * FROM model_results.all_models;'
all_models = pd.read_sql(query, engine, index_col='model_id')
engine.dispose()

# add group number to all models