def main(targets):
    if 'test-project' in targets:
        cfg = load_params(TEST_PARAMS)
        prep_dir(**cfg)
        get_data(**cfg)
        build_features(**cfg)
        reduce_apis()
        node2vec_main(**cfg)
        word2vec_main(**cfg)
        metapath2vec_main(**cfg)
        return

    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
    elif 'data-test' in targets:
        cfg = load_params(TEST_PARAMS)
    else:
        return

    prep_dir(**cfg)

    if 'clean' in targets:
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        return

    if 'ingest' in targets:
        get_data(**cfg)
    if 'process' in targets:
        build_features(**cfg)
    if 'reduce' in targets:
        reduce_apis()
    if 'node2vec' in targets:
        node2vec_main(**cfg)
    if 'word2vec' in targets:
        word2vec_main(**cfg)
    if 'model' in targets:
        hindroid.run(**cfg)
    return
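# A minimal entry-point sketch for invoking the dispatcher above from the
# command line (assumption: build targets are passed as CLI arguments; this
# __main__ guard is illustrative and not taken from the source).
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])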
def parameterized_test(self, model, mode):
    # given:
    data_dir = "test-data"
    interim_dir = self.test_dir + "/interim"
    processed_dir = self.test_dir + "/processed"
    model_dir = self.test_dir + "/model"
    model_path = model_dir + ("" if mode == "full" else "_" + mode) + "/0001.txt"
    submission_dir = self.test_dir + "/submissions"
    submission_path = submission_dir + "/submission.csv"

    # data preparation
    # when:
    make_dataset(data_dir, interim_dir)
    # then:
    self.assertTrue(os.path.exists(interim_dir + "/test_data.pkl"))
    self.assertTrue(os.path.exists(interim_dir + "/test_data.pkl"))

    # feature engineering
    # when:
    build_features(data_dir, processed_dir)
    # then:
    self.assertTrue(os.path.exists(processed_dir + "/test_data.pkl"))
    self.assertTrue(os.path.exists(processed_dir + "/test_data.pkl"))

    # model training
    # when:
    train_model(model, mode, processed_dir, model_dir)
    # then:
    self.assertTrue(os.path.exists(model_path))

    # model prediction
    # when:
    predict_model(processed_dir, model, model_path, submission_path)
    # then:
    self.assertTrue(os.path.exists(submission_path))
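# A hedged usage sketch: concrete unittest methods would drive the
# parameterized test above. The model name "lgbm" and the mode values
# below are illustrative assumptions, not taken from the source.
#
#   def test_pipeline_full(self):
#       self.parameterized_test("lgbm", "full")
#
#   def test_pipeline_small(self):
#       self.parameterized_test("lgbm", "small")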
def main(targets):
    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
    elif 'data-test' in targets:
        cfg = load_params(TEST_PARAMS)
    elif 'test-project' in targets:
        cfg = load_params(TEST_PARAMS)
        prep_dir(**cfg)
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        prep_dir(**cfg)
        get_data(**cfg)
        build_features(**cfg)
        hindroid.run(**cfg)
        return
    else:
        return

    prep_dir(**cfg)

    # make the clean target
    if 'clean' in targets:
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        return

    # make the ingest, process, and model targets
    if 'ingest' in targets:
        get_data(**cfg)
    if 'process' in targets:
        build_features(**cfg)
    if 'model' in targets:
        hindroid.run(**cfg)
    return
def train(self, stock_df, label='label', test_percent=0.2):
    # data cleansing
    clean_stock_df = stock_df.dropna(inplace=False)

    # derive features and outcome from the data to be trained on
    self.x = build_features(clean_stock_df, label)
    self.y = outcome(clean_stock_df, label)

    # split data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        self.x, self.y, test_size=test_percent)

    # fit the classifier on the training set
    self.classifier.fit(x_train, y_train)

    # report the accuracy of the classifier on the held-out test set
    print(f'Accuracy: {self.classifier.score(x_test, y_test)}')
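# A hedged usage sketch, assuming the enclosing class wraps a scikit-learn
# classifier (the class name StockPredictor and the classifier choice are
# illustrative, not confirmed by the source):
#
#   from sklearn.ensemble import RandomForestClassifier
#   predictor = StockPredictor(classifier=RandomForestClassifier())
#   predictor.train(stock_df, label='label', test_percent=0.2)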
def builder():
    """
    Processes the dataset, builds the features, and trains all the models,
    which are then accessed while api.py is in use.
    """
    print("making dataset...")
    logger.info("making dataset...")
    df = make_dataset()

    print("build features...")
    logger.info("build features...")
    processed_data_with_features = build_features(df, True)

    print("train model...")
    logger.info("train model...")
    execute_models(processed_data_with_features)

    print("done")
    logger.info("done")
def predict_model(url):
    """
    Returns the result of the prediction for the given URL.

    Args:
        url: A string containing the URL to classify.

    Returns:
        A dict with the keys: url, domain, model, prediction_score,
        confidence_score, and features.
    """
    df = pd.DataFrame([url], columns=['url'])
    dataframe = build_features(df, False)
    domain = dataframe['domain'][0]
    features = dataframe.to_dict(orient="records")[0]
    dataframe = model_utils.load_data(dataframe)

    # Find the prediction score and confidence score of the model
    random_forest_pred_score = rf.predict(dataframe).tolist()[0]
    random_forest_confidence_score = round(
        rf.predict_proba(dataframe)[0][random_forest_pred_score], 2)

    result = {
        'url': url,
        'domain': domain,
        'model': 'Random Forest',
        'prediction_score': random_forest_pred_score,
        'confidence_score': random_forest_confidence_score,
        'features': features
    }
    print(result)
    metric_logger(url, result)
    return result
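# A hedged usage sketch (the URL below is purely illustrative):
#
#   result = predict_model("http://example.com/login")
#   print(result['model'], result['prediction_score'], result['confidence_score'])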
def train_and_predict():
    """Train models and make predictions using data prepared by _build_features_."""
    ROOT_DIR = Path(__file__).resolve().parent.parent.parent
    DATA_DIR = ROOT_DIR.joinpath("data/")

    submission = pd.read_csv(f"{DATA_DIR}/raw/sample_submission.csv").pipe(
        reduce_mem_usage)

    if "data.csv" not in os.listdir(f"{DATA_DIR}/processed/"):
        print("Creating dataset...")
        data = build_features()
    else:
        print("Loading dataset...")
        data = pd.read_csv(f"{DATA_DIR}/processed/data.csv")
        print("Dataset loaded...")

    print("Building cross-validator...")
    day_col = "d"
    dt_col = "date"
    cv = prepare_cross_validation(5, int(365 * 1.5), DAYS_PRED, day_col)

    print("Preparing training and test data...")
    X_train, y_train, X_test, id_date = prepare_train_test_data(data, day_col)
    del data
    gc.collect()

    print("Train models...")
    models = train_models(day_col, X_train, y_train, cv)
    del X_train, y_train
    gc.collect()

    print("Make predictions...")
    preds = make_predictions(X_test, models, cv)

    print("Create submission...")
    make_submission(id_date.assign(demand=preds), submission, DATA_DIR)
    return
def start(self):
    clean_data.clean()
    build_features.build_features()
    cv_scores = train_model.train()
    self.cv_scores = cv_scores
def dataset_with_features(preprocessed_data):
    df_with_features = build_features(preprocessed_data, True)
    return df_with_features
def predict_energy_consumption(buildings):
    """
    Predicts energy consumption for a provided list of buildings. The model is
    served as a REST endpoint.

    :param buildings: List of buildings for which the prediction should be done.
    :return: Data frame with the predicted readings.
    """
    forecasts = [
        forecast_for_building(building) for i, building in buildings.iterrows()
    ]
    df = pd.concat(forecasts)
    df.drop(columns="id", inplace=True)
    df = buildings.merge(df, left_on="id", right_on="building_id")

    # map the building and weather columns onto the feature names the model expects
    df["meter"] = 0
    df["floor_count"] = df["floorcount"]
    df["air_temperature"] = df["temp"]
    df["relative_humidity"] = df["humidity"]
    df["dew_temperature"] = df["air_temperature"] - (
        (100 - df["relative_humidity"]) / 5)
    df["precip_depth_1_hr"] = np.nan
    df["timestamp"] = pd.to_datetime(df["date"])
    df["wind_direction"] = df["deg"]
    df["wind_speed"] = df["speed"]
    df.drop(columns=[
        "id", "name", "floorcount", "latitude", "longitude", "user_id", "temp",
        "feels_like", "temp_min", "temp_max", "pressure", "sea_level",
        "grnd_level", "humidity", "temp_kf", "main", "description", "icon",
        "speed", "deg", "date"
    ], inplace=True)

    # replicate the rows for the remaining meter types (1-3)
    df_temp = df.copy(deep=True)
    for i in range(1, 4):
        df_temp["meter"] += 1
        df = pd.concat([df, df_temp])
    del df_temp

    cfg = {
        'circular_timestamp_encoding': False,
        'log_transform_square_feet': True,
        'log_transform_area_per_floor': True,
        'label_square_feet_outlier': True,
        'label_area_per_floor_outlier': True,
        'encode_wind_direction': False,
        'include_feels_like': True,
        'fill_na_with_zero': False,
        'add_lag_features': True,
        'lag_columns': ['air_temperature', 'dew_temperature', 'cloud_coverage'],
        'lag_windows': [6, 24],
    }
    [df] = build_features(df, cfg=cfg)

    df.reset_index(inplace=True, drop=True)
    building_ids = df["building_id"]
    timestamps = df["timestamp"]
    df.drop(columns=[
        "timestamp", "month", "wind_direction", "wind_speed", "building_id"
    ], inplace=True)

    # send the prepared features to the model service and collect predictions
    model_endpoint = "http://model:5001/predict"
    data = df.to_json()
    response = requests.get(model_endpoint, json=data).json()

    predictions = pd.DataFrame({
        "reading": response["prediction"],
        "building_id": building_ids,
        "meter": df["meter"],
        "timestamp": timestamps,
        "air_temperature": df["air_temperature"]
    })
    return predictions