Example #1
    def __init__(self, args, args2):
        """
        Spark version for normalizing data, including minmax and zscore
        @param args: dict
        columns: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.transformedDF = None
        self.parameterDF = None
        self.inputUrl1 = args["input"][0]["value"]
        try:
            self.inputUrl2 = args["input"][1]["value"]
        except IndexError:
            self.inputUrl2 = ""
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]
        try:
            self.columns = args["param"]["columns"]
        except KeyError:
            self.columns = None
        self.param = args["param"]
        self.logger.info("initializing SparkSession")

        self.spark = utils.init_spark()
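Only the constructor of this normalization class is shown above. A minimal sketch of what the scaling step could look like, assuming Spark ML scalers and a hypothetical self.param["method"] switch between "minmax" and "zscore" (neither detail is confirmed by the snippet):
    def transform(self):
        # Hypothetical sketch, not part of the original class.
        from pyspark.ml import Pipeline
        from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
        assembler = VectorAssembler(inputCols=self.columns, outputCol="features_vec")
        if self.param.get("method") == "minmax":
            scaler = MinMaxScaler(inputCol="features_vec", outputCol="scaled_vec")
        else:  # zscore
            scaler = StandardScaler(inputCol="features_vec", outputCol="scaled_vec",
                                    withMean=True, withStd=True)
        pipeline = Pipeline(stages=[assembler, scaler])
        self.transformedDF = pipeline.fit(self.originalDF).transform(self.originalDF)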
Example #2
def generate_match_accident_road_of_one_month(year, month):
    filepath = workdir + f'data/match_accident-road_{year}-{month}.parquet'
    if isdir(filepath):  # Skip if already done
        return
    print(f'Generating {year}-{month}')
    spark = init_spark()
    road_df = get_road_df(spark, use_cache=True)
    accident_df = preprocess_accidents(get_accident_df(spark))

    start_day_str = f'{year}-{month:02}-01'
    if month == 12:
        # December wraps into January of the following year; month is set
        # to 0 so that month + 1 below yields 01.
        end_year = year + 1
        month = 0
    else:
        end_year = year
    end_day_str = f'{end_year}-{(month + 1):02}-01'

    start_day = datetime.datetime.fromisoformat(start_day_str)
    end_day = datetime.datetime.fromisoformat(end_day_str)
    accident_df = (accident_df.filter((col('date') >= start_day)
                                      & (col('date') < end_day)))

    match_accident_road = \
        match_accidents_with_roads(spark, road_df, accident_df,
                                   use_cache=False)

    match_accident_road.write.parquet(filepath)
    spark.stop()  # Force garbage collection and empty temp dir
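A possible driver for this function (not part of the original snippet) is a simple loop over the months of interest; the year below is only an example:
# Hypothetical usage: build one Parquet file per month of a given year.
for month in range(1, 13):
    generate_match_accident_road_of_one_month(2017, month)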
Example #3
    def getIn(self):
        # Load the Spark pipeline model
        self.logger.debug("using PySpark")
        from pyspark.ml import PipelineModel
        self.logger.info("initialize SparkSession")
        self.spark = utils.init_spark()

        self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2, self.spark)
        self.model = PipelineModel.load(self.inputUrl1)
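The loaded PipelineModel is presumably applied to the Hive input in a later step; a minimal sketch, with the method name assumed rather than taken from the original class:
    def transform(self):
        # Apply the loaded pipeline to the input DataFrame.
        self.transformedDF = self.model.transform(self.originalDF)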
Example #4
    def __init__(self, args, args2):
        """
        Spark version for reading data from HDFS and then writing it to Hive
        @param args: dict
        inputUrl: String
        outputUrl: String
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.outputUrl1 = args["output"][0]["value"]

        self.type = args["param"]["type"]
        self.inputUrl1 = args["param"]["path"]
        self.DF = None

        self.logger.info("initializing SparkSession")
        self.spark = utils.init_spark()
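The read/write logic itself is not shown. A sketch under two assumptions: self.type names a Spark datasource format (e.g. "csv" or "parquet") and self.outputUrl1 is a Hive table name:
    def execute(self):
        # Hypothetical body: load the HDFS path with the requested format,
        # then persist the result as a Hive table.
        self.DF = (self.spark.read.format(self.type)
                   .option("header", "true")
                   .load(self.inputUrl1))
        self.DF.write.mode("overwrite").saveAsTable(self.outputUrl1)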
Example #5
    def __init__(self, args, args2):
        """
        Spark version for initializing RandomForest binary classifier
        @param args: dict
        n_estimators: int
        criterion: string, one of "gini" or "entropy"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.logger.info("initializing spark")
        self.spark = utils.init_spark()
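The docstring lists scikit-learn style parameters. If they are mapped onto pyspark.ml.classification.RandomForestClassifier, a fit step could look roughly like this (the mapping is assumed, not taken from the original; min_samples_split and min_samples_leaf have no direct Spark equivalent):
    def fit(self, trainDF):
        # Hypothetical mapping of sklearn-style params onto Spark ML.
        from pyspark.ml.classification import RandomForestClassifier
        rf = RandomForestClassifier(
            numTrees=int(self.param["n_estimators"]),
            impurity=self.param["criterion"],   # "gini" or "entropy"
            maxDepth=int(self.param["max_depth"]),
            featuresCol="features", labelCol="label")
        self.model = rf.fit(trainDF)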
Example #6
    def __init__(self, args, args2):
        """
        Standalone version for type transformation
        @param args: dict
        toDoubleColumns: list
        defaultDoubleValue: double
        toIntColumns: list
        defaultIntValue: int
        toCategoricalColumns: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.transformedDF = None
        self.paramDF = None

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = self.outputUrl1 + "toCategorical"
        self.param = args["param"]
        try:
            self.toDoubleColumns = self.param["toDouble"]
        except KeyError:
            self.toDoubleColumns = None
        try:
            self.defaultDoubleValue = self.param["defaultDoubleValue"]
        except KeyError:
            self.defaultDoubleValue = 0.0
        try:
            self.toIntColumns = self.param["toInt"]
        except KeyError:
            self.toIntColumns = None
        try:
            self.defaultIntValue = self.param["defaultIntValue"]
        except KeyError:
            self.defaultIntValue = 0
        try:
            self.toCategoricalColumns = self.param["toCategoricalColumns"]
        except KeyError:
            self.toCategoricalColumns = None

        self.mode = self.param["mode"]

        self.logger.info("initializing SparkSession")
        self.spark = utils.init_spark()
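The casting itself is not shown. One way to express it with Spark SQL casts, filling the configured defaults where a cast produces null (a sketch; the method name is assumed):
    def _cast(self, df):
        from pyspark.sql.functions import col
        if self.toDoubleColumns:
            for c in self.toDoubleColumns:
                df = df.withColumn(c, col(c).cast("double"))
            df = df.fillna(self.defaultDoubleValue, subset=self.toDoubleColumns)
        if self.toIntColumns:
            for c in self.toIntColumns:
                df = df.withColumn(c, col(c).cast("int"))
            df = df.fillna(self.defaultIntValue, subset=self.toIntColumns)
        return df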
Example #7
    def __init__(self, args, args2):
        """
        Spark version for training clustering model
        @param args: dict
        featureCols: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.originalDF = None
        self.transformDF = None
        self.inputUrl1 = args["input"][0]["value"]
        self.inputUrl2 = args["input"][1]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]
        self.featureCols = args["param"]["features"]
        self.model = None
        self.pipelineModel = None
        self.logger.debug("initializing SparkSession")
        self.spark = utils.init_spark()
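A training step consistent with these fields could assemble self.featureCols into a vector and fit a Spark ML clusterer; a sketch assuming KMeans and an arbitrary k:
    def train(self):
        # Hypothetical sketch: assemble features, fit KMeans, keep the
        # fitted pipeline and its output.
        from pyspark.ml import Pipeline
        from pyspark.ml.clustering import KMeans
        from pyspark.ml.feature import VectorAssembler
        assembler = VectorAssembler(inputCols=self.featureCols, outputCol="features")
        kmeans = KMeans(featuresCol="features", k=3)  # k=3 is an assumed default
        self.pipelineModel = Pipeline(stages=[assembler, kmeans]).fit(self.originalDF)
        self.transformDF = self.pipelineModel.transform(self.originalDF)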
Example #8
    def __init__(self, args, args2):
        """
        Spark version for evaluating binary classifier
        @param args: dict
        featureCols: list
        labelCol: string
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.inputUrl2 = args["input"][1]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.featureCols = args["param"]["features"]

        self.originalDF = None
        self.transformDF = None
        self.model = None
        self.result = None

        self.logger.debug("initializing SparkSession")
        self.spark = utils.init_spark()
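The evaluation step itself is not included; with Spark ML it would typically score the model's predictions with BinaryClassificationEvaluator (a sketch, with method and column names assumed):
    def evaluate(self):
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        predictions = self.model.transform(self.originalDF)
        evaluator = BinaryClassificationEvaluator(labelCol="label",
                                                  metricName="areaUnderROC")
        self.result = evaluator.evaluate(predictions)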
Example #9
    def __init__(self, args, args2):
        """
        Standalone version for initializing KMeans clustering
        @param args: dict
        K: int
        init: string, one of "k-means++" or "random"
        n_init: int
        max_iter: int
        tol: float
        """
        # init logging
        self.logger = logging.getLogger(self.__class__.__name__)

        # init parameters
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.logger.info("initializing SparkSession")
        # init SparkSession
        self.spark = utils.init_spark()
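The parameters are scikit-learn style, yet the class creates a SparkSession; if training happens with pyspark.ml.clustering.KMeans, only some of them map directly. A sketch of one possible mapping (assumed, not from the original):
    def fit(self, trainDF):
        # Hypothetical mapping: K -> k, max_iter -> maxIter, tol -> tol;
        # "init" and "n_init" have no direct Spark ML equivalent.
        from pyspark.ml.clustering import KMeans
        kmeans = KMeans(k=int(self.param["K"]),
                        maxIter=int(self.param["max_iter"]),
                        tol=float(self.param["tol"]),
                        featuresCol="features")
        self.model = kmeans.fit(trainDF)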
Example #10
    def __init__(self, args, args2):
        """
        Spark version for conducting PCA on input dataset
        @param args: dict
        inputUrl: String
        outputUrl: String
        columns: list
        k: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.transformDF = None
        self.pipelineModel = None

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]

        self.columns = args["param"]["columns"]
        self.k = args["param"]["k"]

        self.logger.info("initializing SparkSession")
        self.spark = utils.init_spark()
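The PCA step is not shown; with Spark ML it would assemble self.columns and fit pyspark.ml.feature.PCA with k components (a sketch, with method and column names assumed):
    def transform(self):
        from pyspark.ml import Pipeline
        from pyspark.ml.feature import VectorAssembler, PCA
        assembler = VectorAssembler(inputCols=self.columns, outputCol="features")
        pca = PCA(k=int(self.k), inputCol="features", outputCol="pca_features")
        self.pipelineModel = Pipeline(stages=[assembler, pca]).fit(self.originalDF)
        self.transformDF = self.pipelineModel.transform(self.originalDF)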
Example #11
    def __init__(self, args, args2):
        """
        Standalone version for splitting data, either byRatio or byThreshold
        @param args: dict
        splitBy: string, one of byRatio or byThreshold
        ratio: double
        thresholdColumn: string
        threshold: double
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.DF1 = None
        self.DF2 = None

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]

        self.param = args["param"]

        self.logger.info("initializing SparkSession")

        self.spark = utils.init_spark()
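The split itself is not shown. A sketch of the two modes described in the docstring, assuming the param keys are named exactly as documented:
    def split(self):
        from pyspark.sql.functions import col
        if self.param["splitBy"] == "byRatio":
            ratio = float(self.param["ratio"])
            self.DF1, self.DF2 = self.originalDF.randomSplit([ratio, 1.0 - ratio])
        else:  # byThreshold
            c = self.param["thresholdColumn"]
            t = float(self.param["threshold"])
            self.DF1 = self.originalDF.filter(col(c) < t)
            self.DF2 = self.originalDF.filter(col(c) >= t)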
Example #12
#!/usr/bin/env python
import accident_prediction_montreal
from pyspark.sql.functions import udf, min, max, col
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from preprocess import get_negative_samples, get_positive_samples
from utils import init_spark
from preprocess import get_dataset_df
from export_results import *

result_dir = create_result_dir('base')
spark = init_spark()
neg_samples = get_negative_samples(spark).sample(0.5)
pos_samples = get_positive_samples(spark)

imbalance_ratio = (neg_samples.count() / pos_samples.count())

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

# UDF extracting the accident-count component (index 7) of the assembled
# feature vector.
get_accidents_count = udf(lambda v: float(v[7]), FloatType())


def fit(train_set):
    accidents_count = train_set.select(
        get_accidents_count('features').alias('accidents_count'), 'label')
    accidents_count_to_proba = []
    for i in range(377):
        accidents_count_higher = \
                accidents_count.filter(col('accidents_count') >= i)
        proba = (accidents_count_higher.filter(col('label') == 1.0).count() /