Exemplo n.º 1
0
def main(argv):
    """Run the saved trip-duration pipeline on the prediction features.

    Creates (or reuses) a local SparkSession, loads the fitted pipeline
    from ``model/trip_duration_min``, chains it with the project's
    ``DropColumns`` and ``SaveToParquet`` stages, applies the combined
    pipeline to the data at ``data/processed/<phase name>/features`` and
    shows the first two rows of the result.

    The SparkSession is always stopped before returning, including when
    loading or transforming raises.

    Args:
        argv: Command-line arguments; not used by this function.
    """
    spark = (
        SparkSession.builder
        .master("local[*]")
        .config("spark.driver.memory", "4g")
        .config("spark.executor.memory", "1g")
        .getOrCreate()
    )

    # Everything after session creation runs under try/finally so a failure
    # while loading the model or transforming does not leave the session
    # running.
    try:
        phase = Phase.predict

        data_to_predict_path = f'data/processed/{phase.name}/features'
        model_path = 'model/trip_duration_min'
        predicted_data_path = 'data/reporting/trip_durations'

        model = PipelineModel.load(model_path)

        # NOTE(review): DropColumns, SaveToParquet and ParquetDataFrame are
        # defined elsewhere in the project; presumably they drop the listed
        # columns, write the frame to predicted_data_path, and read the
        # input parquet respectively -- confirm against their definitions.
        predicted_df = PipelineModel([
            model,
            DropColumns(
                inputCols=['features', 'pickup_cell_6_idx', 'dropoff_cell_6_idx']),
            SaveToParquet(predicted_data_path),
        ]).transform(ParquetDataFrame(data_to_predict_path, spark))

        predicted_df.show(2)
    finally:
        spark.stop()