def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load all trained models and the engineered feature data.

    prefix   -- filename prefix selecting which saved models to load
    data_dir -- directory holding the training data; defaults to
                ./data/cs-train under the current working directory
    training -- forwarded to engineer_features (controls target creation)

    Returns (all_data, all_models):
      all_data   -- {country: {"X": features, "y": target, "dates": str dates}}
      all_models -- {country: fitted model}
    Raises Exception when no model file matches the prefix.
    """
    if not data_dir:
        # data_dir = os.path.join("..","data","cs-train")
        data_dir = os.path.join(os.getcwd(), "data", "cs-train")

    # BUG FIX: the original searched for the literal "sl", silently
    # ignoring the `prefix` argument; filter on the actual prefix.
    models = [
        f for f in os.listdir(os.path.join(".", "models"))
        if re.search(prefix, f)
    ]

    if len(models) == 0:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {}
    for model in models:
        # filenames look like "<prefix>-<country>-...": key on the country part
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def model_train(data_dir, test=False):
    """
    Train one model per country time series found in `data_dir`.

    test -- when True, restrict training to the 'all' and
            'united_kingdom' series (quick smoke-test run).
    """
    # make sure the model output directory exists
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        print("... test flag on")
        print("...... subseting data")
        print("...... subseting countries")

    ## fetch time-series formatted data
    ts_data = fetch_ts(data_dir)

    ## train a different model for each country's data
    smoke_test_series = ('all', 'united_kingdom')
    for country, df in ts_data.items():
        if test and country not in smoke_test_series:
            continue
        _model_train(df, country, test=test)
def model_predict(country, year, month, day):
    """
    Look up the forecast revenue for `country` on a given date.

    year, month, day -- date components; accepted as str or int.
                        BUG FIX: the original concatenated them with '+',
                        which raised TypeError for integer inputs.

    Returns the matching forecast row (DataFrame) on success, or an
    explanatory string when the country or date is not available.
    """
    time_start = time.time()

    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)

    # membership test directly on the dict keys instead of building a
    # throwaway list of countries first
    if country not in ts_data:
        text = "Could not find country called " + country
        return (text)

    filename = "./data/forecasts/forecast_" + country
    forecasts = pd.read_csv(filename)

    # str.format coerces int components as well as strings; forecast 'ds'
    # values are assumed to be "YYYY-MM-DD" -- callers should pass
    # zero-padded month/day (TODO confirm against stored CSVs)
    date_str = "{}-{}-{}".format(year, month, day)
    row = forecasts.loc[forecasts['ds'] == date_str]
    if len(row) == 0:
        return "Date not available"

    # update the log file
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)
    test = False
    update_predict_log(row.yhat.values[0], runtime, MODEL_VERSION,
                       MODEL_VERSION_NOTE, test)
    return row
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load saved models and engineered features built from cleaned data.

    prefix   -- filename prefix selecting which saved models to load
    data_dir -- training data directory (defaults to ../capstone-w/cs-train)
    training -- forwarded to engineer_features

    Returns (all_data, all_models).
    Raises Exception when no matching model file exists.
    """
    if not data_dir:
        data_dir = os.path.join("..", "capstone-w", "cs-train")

    # BUG FIX: the original filtered on the literal "sl", silently
    # ignoring the `prefix` argument
    models = [
        f for f in os.listdir(os.path.join(".", "models"))
        if re.search(prefix, f)
    ]

    if len(models) == 0:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {}
    for model in models:
        # filenames look like "<prefix>-<country>-...": key on the country part
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        df = clean_data(df)
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def model_train():
    """
    Train a Prophet forecaster per country and persist 120-day forecasts.

    Writes one CSV per country to data/forecasts/forecast_<country>,
    logs the run via update_train_log, and returns True.
    """
    ## start timer for runtime
    time_start = time.time()

    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)

    for country, df in ts_data.items():
        # Prophet expects a frame with columns 'ds' (date) and 'y' (target).
        # Named `model` so it no longer shadows the minutes variable `m`
        # used by divmod below.
        model = Prophet()
        df2 = df[["date", "revenue"]]
        df2.columns = ['ds', 'y']
        model.fit(df2)

        future = model.make_future_dataframe(periods=120)
        forecast = model.predict(future)
        # BUG FIX: removed a dead `forecast[...].tail()` expression whose
        # result was silently discarded
        filename = "data/forecasts/forecast_" + country
        forecast.to_csv(filename)

    ## update the log file
    # NOTE(review): `forecast.shape` here refers to the LAST country's
    # forecast only (pre-existing behavior) -- confirm this is intended
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)
    test = False
    update_train_log(forecast.shape, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test)
    return True
def model_train(data_dir, test=False, country=None):
    """
    Train a model for each country time series in `data_dir`.

    test    -- when True, only train the 'all' and 'united_kingdom' models
    country -- optional country filter forwarded to fetch_ts
    """
    ## create storage folder if needed
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    ## on test, use only two countries and small fraction of data
    if test:
        print("... test flag on")
        print("...... subseting data")
        print("...... subseting countries")

    ## fetch time-series formatted data
    ts_data = fetch_ts(data_dir, country=country, clean=False)

    ## train a different model for each data set
    ## for test, only train "all" and "UK"
    # BUG FIX: loop variable renamed so it no longer shadows the `country`
    # parameter that was passed to fetch_ts above
    for ts_country, df in ts_data.items():
        if test and ts_country not in ['all', 'united_kingdom']:
            continue
        ## train specific model
        _model_train(df, ts_country, test=test)
def get_latest_train_data(country):
    """
    Return the time-series DataFrame used in the latest training for
    `country`, or None when the country is not present.
    """
    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)
    # direct dict lookup instead of a linear scan over every country;
    # .get() preserves the original implicit-None-on-miss behavior
    return ts_data.get(country)
def model_load(prefix='sl', data_dir=None, training=True, country=None):
    """
    Load trained model(s) and their engineered feature data.

    prefix   -- filename prefix selecting which saved models to load
    data_dir -- training data directory (defaults to ./data/cs-train)
    training -- forwarded to engineer_features
    country  -- when given, load only that country's model and data

    Returns (all_data, all_models).
    Raises Exception when no matching model file exists.
    """
    ## if data path not specified, use generic
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## load all models (or filter for country)
    if country is None:
        models = [
            f for f in os.listdir(os.path.join(".", "models"))
            if re.search(prefix, f)
        ]
    else:
        # normalize e.g. "United Kingdom" -> "united_kingdom"
        # (raw string so \s is a regex whitespace class, not an escape)
        country_id = re.sub(r"\s+", "_", country.lower())
        models = [
            f for f in os.listdir(os.path.join(".", "models"))
            if (re.search(prefix, f) and re.search(country_id, f))
        ]

    if len(models) == 0:
        if country is None:
            raise Exception(
                "Models with prefix '{}' cannot be found, did you train?".
                format(prefix))
        else:
            # BUG FIX: the original message said "predix" and passed the
            # format arguments in the wrong order ({0} got the prefix and
            # {1} got the country, contradicting the message text)
            raise Exception(
                "Model for '{0}' with prefix '{1}' cannot be found, did you train it?"
                .format(country, prefix))

    ## store model in dictionary
    ## key = model name
    ## value = model
    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir, country=country)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
def ingest():
    """
    Re-ingest the original invoicing data, rebuilding the time-series
    datasets, and report each resulting series' shape.

    Returns a JSON-serialized True on completion.
    """
    print('### API ingest entering ###')

    data_dir = os.path.join("data", "cs-train")
    print("...fetching data")

    ts_all = fetch_ts(data_dir, clean=True)
    for name, frame in ts_all.items():
        print(name, frame.shape)

    print("... re-ingesting complete")
    return (jsonify(True))
def model_train(data_dir, test=False):
    """
    Train one model per country dataset under `data_dir`.

    test -- when True, train only the 'all' and 'united_kingdom' series.
    """
    ## multiple models were compared
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        for msg in ("... test flag on",
                    "...... subseting data",
                    "...... subseting countries"):
            print(msg)

    for country, df in fetch_ts(data_dir).items():
        # in test mode, only the two smoke-test series are trained
        if not test or country in ('all', 'united_kingdom'):
            _model_train(df, country, test=test)
def model_train(data_dir, test=False):
    """
    Fit a model for every country time series found in `data_dir`.

    test -- when True, limit training to the 'all' and
            'united_kingdom' series.
    """
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        print("... test flag on")
        print("...... subsetting data")
        print("...... subsetting countries")

    # fetch time-series formatted data, then fit country by country
    ts_data = fetch_ts(data_dir)

    def skipped(name):
        # in test mode, every series outside the smoke-test pair is skipped
        return test and name not in ['all', 'united_kingdom']

    for country, df in ts_data.items():
        if skipped(country):
            continue
        _model_train(df, country, test=test)
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load previously trained models (via model_load_only) together with
    engineered feature data for every country in the training set.

    Returns (all_data, all_models).
    """
    data_dir = data_dir or os.path.join(PARENT_DIR, "Final_Capstone/cs-train")

    all_models = model_load_only(prefix=prefix)

    ## load data
    all_data = {}
    for country, df in fetch_ts(data_dir).items():
        X, y, dates = engineer_features(df, training=training)
        all_data[country] = {
            "X": X,
            "y": y,
            "dates": np.array([str(d) for d in dates]),
        }

    return (all_data, all_models)
def model_train(prefix='sl',
                data_dir=DATA_DIR,
                test=False,
                countries=False,
                model=DEFAULT_MODEL,
                model_param_grid=DEFAULT_PARAM_GRID,
                scaler=DEFAULT_SCALER):
    """
    Train a model for each country data set found under `data_dir`.

    test      -- train only the 'all' and 'united_kingdom' series
    countries -- optional collection restricting which countries train
    The remaining arguments configure the estimator and are passed
    straight through to _model_train.
    """
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        print("... test flag on")
        print("...... subseting data")
        print("...... subseting countries")

    ## fetch time-series formatted data
    ts_data = fetch_ts(data_dir)

    for country, df in ts_data.items():
        # test mode: restrict to the two smoke-test series
        if test and country not in ('all', 'united_kingdom'):
            continue
        # honour an explicit country whitelist when one was supplied
        if countries and country not in countries:
            continue
        _model_train(prefix,
                     df,
                     country,
                     test=test,
                     model=model,
                     model_param_grid=model_param_grid,
                     scaler=scaler)
def model_load(country, prefix='sl', data_dir=None, training=True):
    """
    Load the trained model file(s) whose names match `prefix`-`country`,
    plus engineered feature data for every country in the training set.

    Returns (all_data, all_models).
    Raises Exception when no matching model file exists.
    """
    warnings.filterwarnings("ignore")

    data_dir = data_dir or os.path.join(DATA_DIR)

    # country selects that country's model; 'all' matches the aggregate
    # model files
    model_name = prefix + '-' + country
    models = [
        f for f in os.listdir(os.path.join(MODEL_DIR))
        if re.search(model_name, f)
    ]

    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    all_models = {}
    for fname in models:
        # filenames look like "<prefix>-<country>-...": key on the country part
        all_models[re.split("-", fname)[1]] = joblib.load(
            os.path.join(MODEL_DIR, fname))

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for c, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        all_data[c] = {
            "X": X,
            "y": y,
            "dates": np.array([str(d) for d in dates]),
        }

    return (all_data, all_models)
# NOTE(review): this collapsed line holds three distinct fragments:
#   (1) the TAIL of a feature-engineering function whose `def` header lies
#       before this chunk -- it appears to drop the last 30 rows of X/y/dates
#       before returning (X, y, dates); left untouched since the header is
#       not visible here,
#   (2) a __main__ script that times fetch_ts over ../data/cs-train and
#       prints the shape of each resulting series,
#   (3) module-level model metadata constants (MODEL_DIR, MODEL_VERSION,
#       MODEL_VERSION_NOTE) -- "learing" in the note is presumably a typo
#       for "learning"; confirm nothing parses this string before fixing.
mask = np.arange(X.shape[0]) < np.arange(X.shape[0])[-30] X = X[mask] y = y[mask] dates = dates[mask] X.reset_index(drop=True, inplace=True) return (X, y, dates) if __name__ == "__main__": run_start = time.time() data_dir = os.path.join("..", "data", "cs-train") print("...fetching data") ts_all = fetch_ts(data_dir, clean=False) m, s = divmod(time.time() - run_start, 60) h, m = divmod(m, 60) print("load time:", "%d:%02d:%02d" % (h, m, s)) for key, item in ts_all.items(): print(key, item.shape) ## the data ingestion exists as a function or script to facilitate automation MODEL_DIR = "models" MODEL_VERSION = 0.1 MODEL_VERSION_NOTE = "supervised learing model for time-series"