Example #1
def main():
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    # If logging is enabled, check that there are no records for
    # the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))
    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and test sets')
        datasets = [(train, 'train'), 
                    (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Converting to a dataframe fails if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
def get_config(model_id):
    """ Give a model_id (string or ObjectId), return the entire MongoDB log
        for that model.
    """

    logger = Logger(host=cfg['logger']['uri'],
                    db=cfg['logger']['db'],
                    collection=cfg['logger']['collection'])

    logging.info("Fetched model_id %s." % str(model_id))

    return logger.get_doc_from_id(model_id)
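Logger itself is defined elsewhere in the project. Assuming it is a thin wrapper around pymongo, a minimal sketch consistent with how it is used in these examples might look like the following (illustrative only; it covers just the two methods used above, and the document schema is an assumption):

from bson.objectid import ObjectId
from pymongo import MongoClient


class Logger(object):
    """Illustrative stand-in for the project's MongoDB logging helper."""

    def __init__(self, host, db, collection):
        # host is a MongoDB URI; db and collection are plain names
        self.collection = MongoClient(host)[db][collection]

    def experiment_exists(self, experiment_name):
        # True if any document was already logged under this experiment name
        return self.collection.find_one(
            {'experiment_name': experiment_name}) is not None

    def get_doc_from_id(self, model_id):
        # Accept either a string or an ObjectId
        return self.collection.find_one({'_id': ObjectId(str(model_id))})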
Example #3
def log_results(model, config, test, predictions, feature_importances,
    imputer, scaler):
    '''
        Log results to a MongoDB database
    '''
    #Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)
    #Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    #Add the name of the experiment if available
    experiment_name = config["experiment_name"] if config["experiment_name"] else None
    #Sending model will log model name, parameters and datetime
    #Also log other important things by sending named parameters
    mongo_id = mongo_logger.log_model(model, features=list(test.feature_names),
        feature_importances=list(feature_importances),
        config=config, prec_at_1=prec_at_1,
        prec_at_10=prec_at_10, cutoff_at_1=cutoff_at_1,
        cutoff_at_10=cutoff_at_10, experiment_name=experiment_name,
        feature_mapping=test.feature_mapping)

    #Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
        'inspection_date': inspection_date,
        'viol_outcome': test.y,
        'prediction': predictions})
    #Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))
    #Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
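precision_at is another project helper not shown here. Judging from the calls above, it appears to take the true labels, the predicted scores, and a proportion, and to return the precision within the top slice of scores together with the score cutoff for that slice. A minimal sketch under that assumption (not the project's actual implementation):

import numpy as np


def precision_at(y_true, scores, proportion):
    """Precision among the top `proportion` of observations ranked by score,
    plus the score cutoff defining that slice. Illustrative sketch only."""
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)
    n_top = max(int(round(proportion * len(scores))), 1)
    order = np.argsort(scores)[::-1]   # highest scores first
    top = order[:n_top]
    cutoff = scores[top[-1]]           # lowest score still inside the top slice
    precision = y_true[top].mean()     # fraction of positives in the slice
    return precision, cutoff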
Example #4
def main():
    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    #If logging is enabled, check that there are no records for
    #the selected experiment
    if not args.notlog:
        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)
        if mongo_logger.experiment_exists(config['experiment_name']):
            raise ExperimentExists(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    train, test = make_datasets(config)
    logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
        test.x.shape))

    #Check percentage of NAs for every feature,
    #raise an error if at least one feature has more NAs than the
    #acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and test sets')
        datasets = [(train, 'train'), 
                    (test, 'test')]
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    # Converting to a dataframe fails if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
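check_nas_threshold and NAS_PROPORTION_THRESHOLD are also defined elsewhere in the project. From the way they are used, the function presumably returns the per-column proportion of missing values and fails when any column exceeds the threshold; a rough sketch under that assumption (the project's version, including the exception it raises, may differ):

def check_nas_threshold(df, threshold):
    """Return the proportion of NAs per column and raise if any column
    exceeds `threshold`. Illustrative sketch only."""
    prop = df.isnull().mean()          # proportion of missing values per column
    offenders = prop[prop > threshold]
    if not offenders.empty:
        raise ValueError('Columns above the NA threshold ({}): {}'.format(
            threshold, ', '.join(str(c) for c in offenders.index)))
    return prop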
Example #5
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    '''
        Log results to a MongoDB database
    '''
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)
    # Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_5, cutoff_at_5 = precision_at(test.y, predictions, 0.05)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    prec_at_20, cutoff_at_20 = precision_at(test.y, predictions, 0.2)

    # Add the name of the experiment if available
    experiment_name = (config["experiment_name"] if config["experiment_name"]
                       else None)
    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters

    ft_imp = list(feature_importances)
    ft_map = test.feature_mapping

    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=ft_imp,
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      cutoff_at_1=cutoff_at_1,
                                      prec_at_5=prec_at_5,
                                      cutoff_at_5=cutoff_at_5,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_10=cutoff_at_10,
                                      prec_at_20=prec_at_20,
                                      cutoff_at_20=cutoff_at_20,
                                      experiment_name=experiment_name,
                                      feature_mapping=ft_map)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})
    # Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))
    # Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
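The pickled artifacts can later be restored with joblib.load, for instance when scoring new parcels with an already-logged model. The snippet below reuses the same path variables and mongo_id as above and assumes joblib is importable directly (older code may go through sklearn.externals.joblib):

import os
import joblib

# Load the model and its preprocessing objects back from disk
model = joblib.load(os.path.join(path_to_pickled_models, mongo_id))
imputer = joblib.load(os.path.join(path_to_pickled_imputers, mongo_id))
scaler = joblib.load(os.path.join(path_to_pickled_scalers, mongo_id))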
Example #6
# percent - percent taken from the top,
# used for random sampling
percent = 4
# n_inspections - list length
n_inspections = 1000
# nei_schema - db schema used to look for the neighborhood score
# table
nei_schema = 'features_01may2015'
# nei_table - db table used to get the neighborhood scores from
nei_table = 'neighborhood_score_500m_6months'

# step 1 - load model

# load information for the best model
logger = Logger(host=main['logger']['uri'],
                db=main['logger']['db'],
                collection=main['logger']['collection'])
model = logger.get_best_from_experiment(exp_name, metric)

# training window
train_start = datetime.strptime(model['config']['start_date'], '%d%b%Y')
train_end = datetime.strptime(model['config']['fake_today'], '%d%b%Y')
# change the format to match the one needed for the db
train_start = train_start.strftime('%Y-%m-%d')
train_end = train_end.strftime('%Y-%m-%d')
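# e.g. a config value like '01May2015' parsed with '%d%b%Y' becomes '2015-05-01'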
today = datetime.utcnow().strftime('%Y-%m-%d')

# step 2 - rank all parcels in cincinnati

# use model to predict on every parcel in a given list
preds = predict_on_schema(str(model['_id']), schema)
def main():

    if args.warninglog:
        myhandler = logging.FileHandler(os.path.abspath(args.warninglog))
        myhandler.setLevel('WARNING')
        logger.addHandler(myhandler)
    if args.debuglog:
        myhandler2 = logging.FileHandler(os.path.abspath(args.debuglog))
        myhandler2.setLevel('DEBUG')
        logger.addHandler(myhandler2)

    config_file = args.path_to_config_file
    config, config_raw = configure_model(config_file)

    if args.predicttop and args.notlog:
        raise ValueError("You cannot save the top X predictions "
                "on all parcels without also logging.")

    #If logging is enabled, check that there are no records for
    #the selected experiment
    if not args.notlog:

        logger_uri = cfg_main['logger']['uri']
        logger_db = cfg_main['logger']['db']
        logger_collection = cfg_main['logger']['collection']
        mongo_logger = Logger(logger_uri, logger_db, logger_collection)

        if mongo_logger.experiment_exists(config['experiment_name']):

            # if the user hasn't selected to overwrite the record, throw error
            if not args.overwritelog:
                raise ExperimentExists(config['experiment_name'])
            else:
                mongo_logger.delete_experiment(config['experiment_name'])

    # datasets
    logger.info('Loading datasets...')
    if not args.predicttop:
        train, test = make_datasets(config, predictset=False)
        logger.debug('Train x shape: {} Test x shape: {}'.format(train.x.shape,
            test.x.shape))
    else:
        train, test, preds = make_datasets(config, predictset=True)
        logger.debug('Train x shape: {} Test x shape: {} Prediction x shape {}'\
                .format(train.x.shape, test.x.shape, preds.x.shape))

    #Check percentage of NAs for every feature,
    #raise an error if at least one feature has more NAs than the
    #acceptable threshold
    logger.info('Checking training set NAs...')
    prop = check_nas_threshold(train.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)
    logger.info('Checking testing set NAs...')
    prop = check_nas_threshold(test.to_df(), NAS_PROPORTION_THRESHOLD)
    logger.debug(prop)

    # Dump datasets if dump option was selected
    if args.dump:
        logger.info('Dumping train and test sets')
        datasets = [(train, 'train'), 
                    (test, 'test')]
        if args.predicttop:
            logger.info('Dumping prediction sets')
            datasets.append((preds, 'prediction'))
        for data, name in datasets:
            if data is not None:
                filename = '{}_{}.csv'.format(config["experiment_name"], name)
                try:
                    #Try to convert to dataframe, it will fail if data is empty
                    df = data.to_df()
                    df.to_csv(os.path.join(path_to_dumps, filename))
                except Exception as e:
                    logger.info('Error saving {} as csv: {}'.format(filename, e))
            else:
                logger.info('{} is None, skipping dump...'.format(name))
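ExperimentExists is raised in several of these examples but never defined in them; presumably it is a small custom exception along these lines (sketch only, the actual message may differ):

class ExperimentExists(Exception):
    """Raised when an experiment name already has records in the log."""
    def __init__(self, experiment_name):
        super(ExperimentExists, self).__init__(
            'Records already exist for experiment {!r}; use the overwrite '
            'option or pick another name.'.format(experiment_name))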
from lib_cinci.config import load
import os
import yaml
import itertools
import pandas as pd
from sqlalchemy import create_engine

folder = os.environ['ROOT_FOLDER']
output_folder = os.environ['OUTPUT_FOLDER']
path_to_output = os.path.join(output_folder, 'model_groups.csv')

name = 'config.yaml'
path = "%s/%s" % (folder, name)
with open(path, 'r') as f:
    text = f.read()
main = yaml.load(text)
logger = Logger(host=main['logger']['uri'],
                db=main['logger']['db'],
                collection=main['logger']['collection'])

connparams = load('config.yaml')['db']
uri = '{dialect}://{user}:{password}@{host}:{port}/{database}'.format(
    **connparams)
libpq_uri = 'dbname={database} user={user} host={host} password={password} port={port}'.\
                format(**connparams)
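# With purely illustrative values these might expand to, e.g.:
#   uri:       postgresql://cincy_user:secret@dbhost:5432/cincinnati
#   libpq_uri: dbname=cincinnati user=cincy_user host=dbhost password=secret port=5432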

engine = create_engine(uri)

query = 'SELECT * FROM model_results.all_models;'
all_models = pd.read_sql(query, engine, index_col='model_id')
engine.dispose()

# add group number to all models