Example #1
def main(targets):

    if 'test-project' in targets:
        cfg = load_params(TEST_PARAMS)
        prep_dir(**cfg)
        get_data(**cfg)
        build_features(**cfg)
        reduce_apis()
        node2vec_main(**cfg)
        word2vec_main(**cfg)
        metapath2vec_main(**cfg)
        return

    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
    elif 'data-test' in targets:
        cfg = load_params(TEST_PARAMS)
    else:
        return

    prep_dir(**cfg)

    if 'clean' in targets:
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        return

    if 'ingest' in targets:
        get_data(**cfg)

    if 'process' in targets:
        build_features(**cfg)

    if 'reduce' in targets:
        reduce_apis()

    if 'node2vec' in targets:
        node2vec_main(**cfg)

    if 'word2vec' in targets:
        word2vec_main(**cfg)

    if 'model' in targets:
        hindroid.run(**cfg)

    return
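These target-dispatching main functions are normally driven from a run.py entry point that passes command-line arguments through as build targets. A minimal sketch of that wiring, assuming the function above lives in run.py and that DATA_PARAMS and TEST_PARAMS point at existing parameter files (the concrete paths below are placeholders, not taken from the example):

import sys

# Placeholder config paths; the real project defines its own.
DATA_PARAMS = 'config/data-params.json'
TEST_PARAMS = 'config/test-params.json'

if __name__ == '__main__':
    # e.g. `python run.py data ingest process`
    targets = sys.argv[1:]
    main(targets)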
Example #2
    def parameterized_test(self, model, mode):
        # given:
        data_dir = "test-data"
        interim_dir = self.test_dir + "/interim"
        processed_dir = self.test_dir + "/processed"
        model_dir = self.test_dir + "/model"
        model_path = model_dir + ("" if mode == "full" else "_" +
                                  mode) + "/0001.txt"
        submission_dir = self.test_dir + "/submissions"
        submission_path = submission_dir + "/submission.csv"

        # data preparation
        # when:
        make_dataset(data_dir, interim_dir)

        # then:
        self.assertTrue(os.path.exists(interim_dir + "/test_data.pkl"))

        # feature engineering
        # when:
        build_features(data_dir, processed_dir)

        # then:
        self.assertTrue(os.path.exists(processed_dir + "/test_data.pkl"))

        # model training
        # when:
        train_model(model, mode, processed_dir, model_dir)

        # then:
        self.assertTrue(os.path.exists(model_path))

        # model prediction
        # when:
        predict_model(processed_dir, model, model_path, submission_path)

        # then:
        self.assertTrue(os.path.exists(submission_path))
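A parameterized helper like this is usually driven from a concrete test method that loops over model/mode combinations. A minimal sketch of one way to call it with unittest's subTest, assuming the method above belongs to the same TestCase; the model names and modes are hypothetical:

import itertools
import unittest


class PipelineTest(unittest.TestCase):
    test_dir = "test-output"

    # parameterized_test(self, model, mode) as shown above would be defined here

    def test_pipeline_variants(self):
        # Hypothetical model/mode values; substitute the project's own.
        for model, mode in itertools.product(["lgbm"], ["full", "quick"]):
            with self.subTest(model=model, mode=mode):
                self.parameterized_test(model, mode)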
Example #3
def main(targets):
    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
    elif 'data-test' in targets:
        cfg = load_params(TEST_PARAMS)
    elif 'test-project' in targets:
        cfg = load_params(TEST_PARAMS)
        prep_dir(**cfg)
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        prep_dir(**cfg)
        get_data(**cfg)
        build_features(**cfg)
        hindroid.run(**cfg)
        return
    else:
        return

    prep_dir(**cfg)

    # make the clean target
    if 'clean' in targets:
        clean_raw(**cfg)
        clean_features(**cfg)
        clean_processed(**cfg)
        return

    # make the ingest target
    if 'ingest' in targets:
        get_data(**cfg)

    if 'process' in targets:
        build_features(**cfg)

    if 'model' in targets:
        hindroid.run(**cfg)

    return
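load_params is used throughout these run.py examples but never shown. A minimal sketch of what such a helper commonly looks like, assuming the parameter files are JSON; the real project may read YAML or another format:

import json


def load_params(fp):
    """Read a JSON parameter file and return its contents as a dict."""
    with open(fp) as fh:
        return json.load(fh)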
Example #4
    def train(self, stock_df, label='label', test_percent=0.2):

        # data cleansing
        clean_stock_df = stock_df.dropna(inplace=False)

        # deriving features and outcome from the data to be trained upon
        self.x = build_features(clean_stock_df, label)
        self.y = outcome(clean_stock_df, label)

        # split data into training and test data set
        x_train, x_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=test_percent)

        # fit the chosen classifier on the training set
        self.classifier.fit(x_train, y_train)

        # evaluate the accuracy of the fitted classifier on the test set
        print(f'Accuracy: {self.classifier.score(x_test, y_test)}')
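The train method assumes a surrounding class that stores a scikit-learn style classifier in self.classifier. A minimal sketch of what that wrapper might look like; the class name and default classifier are illustrative, not taken from the example:

from sklearn.ensemble import RandomForestClassifier


class StockModel:
    """Hypothetical wrapper; train() as defined above would be a method here."""

    def __init__(self, classifier=None):
        # Any estimator exposing fit() and score() works here.
        self.classifier = classifier or RandomForestClassifier()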
Example #5
def builder():
    """
    Will be called to create all the models, after processing the dataset and building the features, which will then be accessed during usage of api.py
    """

    print("making dataset...")
    logger.info("making dataset...")
    df = make_dataset()

    print("build features...")
    logger.info("build features...")
    processed_data_with_features = build_features(df, True)

    print("train model...")
    logger.info("train model...")
    execute_models(processed_data_with_features)

    print("done")
    logger.info("done")
Example #6
def predict_model(url):
    """
    Retuns a list containing the result of the prediction
    The list shows the url, model, precition score, and confidence score.

    Args:
        url: A string

    Retuns:
        A list with 4 keys: url, model, prediction_score, confidence_score
    """

    df = ['' + url]
    df = pd.DataFrame(df, columns=['url'])

    dataframe = build_features(df, False)
    domain = dataframe['domain'][0]
    features = dataframe.to_dict(orient="records")[0]
    dataframe = model_utils.load_data(dataframe)

    # compute the prediction and confidence scores of the random forest model
    random_forest_pred_score = rf.predict(dataframe).tolist()[0]
    random_forest_confidence_score = round(
        rf.predict_proba(dataframe)[0][random_forest_pred_score], 2)

    result = {
        'url': url,
        'domain': domain,
        'model': 'Random Forest',
        'prediction_score': random_forest_pred_score,
        'confidence_score': random_forest_confidence_score,
        'features': features
    }
    print(result)

    metric_logger(url, result)

    return result
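A predictor shaped like this is typically exposed behind a small web endpoint. A usage sketch assuming Flask; the route, parameter name, and serving layer are hypothetical and not part of the example:

from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route("/predict", methods=["GET"])
def predict_endpoint():
    # e.g. GET /predict?url=http://example.com
    url = request.args.get("url", "")
    return jsonify(predict_model(url))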
Example #7
def train_and_predict():
    """Train models and make predictions using data prepared by _build_features_"""
    ROOT_DIR = Path(__file__).resolve().parent.parent.parent
    DATA_DIR = ROOT_DIR.joinpath("data/")

    submission = pd.read_csv(f"{DATA_DIR}/raw/sample_submission.csv").pipe(
        reduce_mem_usage)
    if "data.csv" not in os.listdir(f"{DATA_DIR}/processed/"):
        print("Creating dataset...")
        data = build_features()
    else:
        print("Loading dataset...")
        data = pd.read_csv(f"{DATA_DIR}/processed/data.csv")
    print("Dataset loaded...")

    print("Building cross-validator...")
    day_col = "d"
    dt_col = "date"
    cv = prepare_cross_validation(5, int(365 * 1.5), DAYS_PRED, day_col)

    print("Preparing training and test data...")
    X_train, y_train, X_test, id_date = prepare_train_test_data(data, day_col)
    del data
    gc.collect()

    print("Train models...")
    models = train_models(day_col, X_train, y_train, cv)
    del X_train, y_train
    gc.collect()

    print("Make predictions...")
    preds = make_predictions(X_test, models, cv)

    print("Create submission...")
    make_submission(id_date.assign(demand=preds), submission, DATA_DIR)

    return
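reduce_mem_usage is piped over the submission frame but not defined here. It is a common Kaggle-style helper that downcasts numeric columns to save memory; a minimal sketch of that idea (the project's own version may differ):

import numpy as np
import pandas as pd


def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numeric columns to the smallest dtype that still holds them."""
    for col in df.select_dtypes(include=[np.number]).columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        else:
            df[col] = pd.to_numeric(df[col], downcast="float")
    return df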
Example #8
    def start(self):
        clean_data.clean()
        build_features.build_features()
        cv_scores = train_model.train()
        self.cv_scores = cv_scores
Example #9
def dataset_with_features(preprocessed_data):
    df_with_features = build_features(preprocessed_data, True)
    return df_with_features
Example #10
def predict_energy_consumption(buildings):
    """
    Predicts energy consumption with a provided list of buildings.
    The model is being served as a rest endpoint.
    :param buildings: List of buildings for which the prediction should be done.
    :return: Data frame with the predicted readings.
    """
    forecasts = [
        forecast_for_building(building)
        for i, building in buildings.iterrows()
    ]
    df = pd.concat(forecasts)
    df.drop(columns="id", inplace=True)
    df = buildings.merge(df, left_on="id", right_on="building_id")
    df["meter"] = 0
    df["floor_count"] = df["floorcount"]
    df["air_temperature"] = df["temp"]
    df["relative_humidity"] = df["humidity"]
    df["dew_temperature"] = df["air_temperature"] - (
        (100 - df["relative_humidity"]) / 5)
    df["precip_depth_1_hr"] = np.nan
    df["timestamp"] = pd.to_datetime(df["date"])
    df["wind_direction"] = df["deg"]
    df["wind_speed"] = df["speed"]

    df.drop(columns=[
        "id", "name", "floorcount", "latitude", "longitude", "user_id", "temp",
        "feels_like", "temp_min", "temp_max", "pressure", "sea_level",
        "grnd_level", "humidity", "temp_kf", "main", "description", "icon",
        "speed", "deg", "date"
    ],
            inplace=True)

    df_temp = df.copy(deep=True)
    for i in range(1, 4):
        df_temp["meter"] += 1
        df = pd.concat([df, df_temp])
    del df_temp

    cfg = {
        'circular_timestamp_encoding': False,
        'log_transform_square_feet': True,
        'log_transform_area_per_floor': True,
        'label_square_feet_outlier': True,
        'label_area_per_floor_outlier': True,
        'encode_wind_direction': False,
        'include_feels_like': True,
        'fill_na_with_zero': False,
        'add_lag_features': True,
        'lag_columns':
        ['air_temperature', 'dew_temperature', 'cloud_coverage'],
        'lag_windows': [6, 24],
    }
    [df] = build_features(df, cfg=cfg)

    df.reset_index(inplace=True, drop=True)
    building_ids = df["building_id"]
    timestamps = df["timestamp"]
    df.drop(columns=[
        "timestamp", "month", "wind_direction", "wind_speed", "building_id"
    ],
            inplace=True)

    model_endpoint = "http://model:5001/predict"
    data = df.to_json()
    response = requests.get(model_endpoint, json=data).json()

    predictions = pd.DataFrame({
        "reading": response["prediction"],
        "building_id": building_ids,
        "meter": df["meter"],
        "timestamp": timestamps,
        "air_temperature": df["air_temperature"]
    })
    return predictions
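The prediction step assumes a companion model service at http://model:5001/predict that receives the frame serialized with to_json() and answers with a "prediction" list. A minimal sketch of such an endpoint, assuming Flask and a pre-trained model loaded with joblib; the names and paths are illustrative, not taken from the project:

import io

import pandas as pd
from flask import Flask, jsonify, request
from joblib import load

app = Flask(__name__)
model = load("model.joblib")  # hypothetical path to the trained estimator


@app.route("/predict", methods=["GET"])
def predict():
    # The client sends df.to_json() as the request's JSON payload.
    df = pd.read_json(io.StringIO(request.get_json()))
    return jsonify({"prediction": model.predict(df).tolist()})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)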