@classmethod
def setUpClass(cls):
    print('Setting up TestMakeDataset class...')
    main()
    cls.data = pd.read_csv(os.path.join(OUTPUT_DIR,
                                        "owid-energy-data.csv"))
    cls.codebook = pd.read_csv(
        os.path.join(OUTPUT_DIR, "owid-energy-codebook.csv"))
    cls.index_cols = ['country', 'year', 'iso_code']
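# A minimal, self-contained sketch of the unittest scaffolding a setUpClass
# fixture like the one above would live in; the class name, the test method,
# and the data it checks are illustrative assumptions, not part of the
# original suite.
import unittest


class TestMakeDatasetSketch(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # class-level setup runs once, before any test method in the class
        cls.index_cols = ['country', 'year', 'iso_code']

    def test_index_cols_defined(self):
        # attributes assigned in setUpClass are visible to every test method
        self.assertEqual(self.index_cols, ['country', 'year', 'iso_code'])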
def main():
    # create logger with 'my_application'
    logger = logging.getLogger('my_application')
    logger.setLevel(logging.DEBUG)
    # create file handler which logs even debug messages
    fh = logging.FileHandler('./myapp.log')
    fh.setLevel(logging.DEBUG)
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    ch.setLevel(logging.ERROR)
    # create formatter and add it to the handlers
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.info('Execution started')
    #make_dataset
    data_files_list_path = './data/interim/LCA_files_list.txt'
    input_df = make_dataset.main(data_files_list_path)

    # Ensure input_df is not empty
    if input_df.empty:
        logger.error('Input DataFrame is empty')
        raise ValueError('Input DataFrame is empty')

    logger.info('Dataset imported')
    #build_features
    X, y = build_features.main(input_df)
    logger.info('Features built, X shape %d %d, y shape %d', X.shape[0],
                X.shape[1], y.shape[0])

    #if mode==Train:
    #train_model
    logger.info('Model trained')
    #elif mode==Predict:
    #make_prediction
    logger.info('Prediction made')
    #explain_model_prediction
    logger.info('Model explained')
    logger.info('Building features complete.')

    #save pipeline when method is fit, fit_transform
    if method in ['fit', 'fit_transform']:
        dump(build_feature_pipe, open('./models/build_feature_pipe.pkl', 'wb'))
        dump(all_preprocess, open('./models/preprocess_pipe.pkl', 'wb'))
        module_logger.info('Pipeline saved.')

    return X, y


if __name__ == '__main__':
    import make_dataset
    from pickle import load, dump

    data_files_list_path = './data/interim/LCA_files_list.txt'

    input_df = make_dataset.main(path=data_files_list_path,
                                 file_type='file_list')

    #For training the very first time
    #X,y = main(input_df, build_feature_pipe=None, all_preprocess=None, method='fit')
    #print(X.shape)
    #print(y.shape)

    #For iterative training
    build_feature_pipe = load(open('./models/build_feature_pipe.pkl', 'rb'))
    all_preprocess = load(open('./models/preprocess_pipe.pkl', 'rb'))
    X, y = main(input_df, build_feature_pipe, all_preprocess, method='inverse')
    print(X.shape)
    print(y.shape)
###############################################################################

# need to make a dataset directory where the new datasets will live
check_make('{0}/dataset'.format(job_directory))
# need to make a config directory where the oodles of new configs live
check_make('{0}/configs'.format(job_directory))
# need to make a logs directory to check out how my jobs performed
check_make('{0}/logs'.format(job_directory))
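# check_make is not defined in this excerpt; a minimal sketch of what such a
# helper usually does (create the directory only if it is missing) could be:
import os


def check_make(path):
    """Create the directory at `path` if it does not already exist (sketch)."""
    os.makedirs(path, exist_ok=True)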

###############################################################################
# make fits dataset
###############################################################################

if args.dataset:
    from make_dataset import main
    main(job_directory + '/dataset/', **make_dataset_kwargs)
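# `args` is likewise not defined in this excerpt; a hedged sketch of the
# argparse setup that could produce it (the flag name and help text are
# assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', action='store_true',
                    help='build the fits dataset before generating configs')
args = parser.parse_args()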

###############################################################################
# make wizard dataset (needed to determine number of jobs to submit)
###############################################################################

# edit config to only run make_directories
make_directories_config = config.copy()
make_directories_config['run'] = ['make_directories']
# execute config file with wizard
wizard(make_directories_config)

if args.dataset:
    # edit config to only run dataset
    dataset_config = config.copy()
    dataset_config['run'] = ['make_directories', 'dataset']
###############################################################################
# Example #5
###############################################################################
        print('f1-score:', f1)
        module_logger.info('Model evaluation metrics: [%.4f, %.4f]', f1[0], f1[1])
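# f1 is indexed as f1[0] and f1[1] above, which suggests one score per class;
# a hedged sketch of how such values can be obtained with scikit-learn
# (the toy labels are illustrative only):
from sklearn.metrics import f1_score

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]
per_class_f1 = f1_score(y_true, y_pred, average=None)   # array: one f1 per class
print('f1-score:', per_class_f1)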


if __name__ == '__main__':
    import make_dataset
    import build_features
    from pickle import load

    #data_files_list_path='./data/interim/LCA_files_list.txt'
    #input_df=make_dataset.main(path=data_files_list_path,file_type='file_list')

    #For training the very first time
    #'''
    data_path = './data/interim/LCA_dataset_sample10000.xlsx'
    input_df = make_dataset.main(path=data_path, file_type='data_file')
    X, y = build_features.main(input_df,
                               build_feature_pipe=None,
                               all_preprocess=None,
                               method='fit_transform')
    main(model=None, action='train', X=X, y=y)
    #'''

    # For incremental training of an existing model
    '''
    data_path = './data/interim/LCA_dataset_sample1000.xlsx'
    input_df = make_dataset.main(path=data_path, file_type='data_file')
    X, y = build_features.main(input_df,
                               build_feature_pipe=None,
                               all_preprocess=None,
                               method='fit_transform')
    model = load(open('./models/adaboost_batch_train.pkl', 'rb'))
    main(model=model, action='train', X=X, y=y)
    '''
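    # Hedged sketch: the commented incremental-training branch above loads
    # './models/adaboost_batch_train.pkl'; the matching save step is not shown
    # in this excerpt, but it would presumably pickle the fitted estimator,
    # roughly like this (with `model` being the fitted estimator):
    #
    #     from pickle import dump
    #     with open('./models/adaboost_batch_train.pkl', 'wb') as f:
    #         dump(model, f)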