Example #1
# Imports used below; workdir and the get_*/add_* helper functions are
# assumed to be defined elsewhere in the surrounding module.
from os.path import isdir

from pyspark.sql.functions import monotonically_increasing_id, year
def get_negative_samples(spark,
                         use_cache=True,
                         save_to=None,
                         road_limit=None,
                         year_limit=None,
                         year_ratio=None,
                         weather_df=None,
                         sample_ratio=None,
                         accident_df=None):
    """
    Note to self: 539 293 road, 43 848 generated dates,
    nb dates for 1 year : 8760

    year_limit: int or tuple of int
    """
    # Serve the cached result unless caching is off or a custom destination is set.
    cache_path = workdir + 'data/negative-samples.parquet'
    if isdir(cache_path) and use_cache and save_to is None:
        return spark.read.parquet(cache_path)
    if save_to is not None:
        cache_path = workdir + save_to
        if isdir(cache_path):
            raise ValueError(f"Directory {save_to} already exists")

    road_df = get_road_df(spark, use_cache)
    road_features_df = \
        get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
    road_df = road_features_df.select('street_id')
    dates_df = generate_dates_df(spark, year_limit, year_ratio)

    if road_limit is not None:
        road_df = road_df.limit(road_limit)

    # Candidate negatives: every combination of generated date and road.
    negative_samples = dates_df.crossJoin(road_df)

    if sample_ratio is not None:
        negative_samples = negative_samples.sample(sample_ratio)

    negative_samples = \
        negative_samples.withColumn('sample_id',
                                    monotonically_increasing_id())
    if accident_df is None:
        accident_df = get_accident_df(spark)
    accident_df = preprocess_accidents(accident_df)
    if year_limit is not None:
        # year_limit may be an int or a tuple (see docstring); Column.isin
        # needs a list, so normalize it before filtering.
        years = [year_limit] if isinstance(year_limit, int) else list(year_limit)
        accident_df = accident_df.filter(year('date').isin(years))

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)

    # Attach road features, then weather, date, and solar features.
    negative_samples = negative_samples.join(road_features_df, 'street_id')
    negative_sample_weather = \
        get_weather_information(negative_samples, weather_df)
    negative_samples = \
        negative_samples.join(negative_sample_weather, 'sample_id')
    negative_samples = add_date_features(negative_samples)
    negative_samples = add_solar_features(negative_samples)

    negative_samples = negative_samples.persist()

    if use_cache:
        negative_samples.write.parquet(cache_path)
    return negative_samples
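
A minimal call sketch for get_negative_samples follows; the SparkSession setup and all parameter values are illustrative assumptions, not values taken from the example above.

# Hypothetical usage; session config and parameter values are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('accident-samples').getOrCreate()

# Generate negatives for two years, keep roughly 10% of the date/road
# cross join, and write the result to a fresh parquet directory.
negative_samples = get_negative_samples(spark,
                                        year_limit=(2016, 2017),
                                        sample_ratio=0.1,
                                        save_to='data/negatives-2016-2017.parquet')
negative_samples.show(5)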
Example #2
# Imports used below; workdir and the get_*/add_*/match_* helpers are
# assumed to be defined elsewhere in the surrounding module.
from os.path import isdir

from pyspark.sql.functions import year
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not (year_limit is None or isinstance(year_limit, list)):
        raise ValueError('year_limit must be an int, a tuple, a list or None.')

    cache_path = workdir + 'data/positive-samples.parquet'
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    if road_df is None:
        road_df = get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)
    road_features_df = \
        (get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
         .drop('loc_lat', 'loc_long'))

    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    accident_df = accident_df.withColumnRenamed('accident_id', 'sample_id')
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, 'sample_id')
                        .withColumnRenamed('sample_id', 'accident_id')
                        .join(match_acc_road, 'accident_id')
                        .join(road_features_df, 'street_id')
                        .withColumnRenamed('accident_id', 'sample_id'))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
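
The same pattern applies to the positive samples; a hypothetical call, with parameter values chosen only for illustration.

# Hypothetical usage; parameter values are assumptions.
positive_samples = get_positive_samples(spark,
                                        year_limit=(2016, 2017),
                                        limit=1000)  # cap accident rows for a quick run
positive_samples.select('sample_id', 'street_id').show(5)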
Example #3
# Imports used below; workdir and the get_*/add_*/match_* helpers are
# assumed to be defined elsewhere in the surrounding module.
from os.path import isdir

from pyspark.sql.functions import year
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not (year_limit is None or isinstance(year_limit, list)):
        raise ValueError("year_limit must be an int, a tuple, a list or None.")

    cache_path = workdir + "data/positive-samples.parquet"
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    if road_df is None:
        road_df = get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year("date").isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)
    road_features_df = (get_road_features_df(spark,
                                             road_df=road_df,
                                             use_cache=use_cache)
                        .drop("loc_lat", "loc_long"))
    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    accident_df = accident_df.withColumnRenamed("accident_id", "sample_id")
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, "sample_id")
                        .withColumnRenamed("sample_id", "accident_id")
                        .join(match_acc_road, "accident_id")
                        .join(road_features_df, "street_id")
                        .withColumnRenamed("accident_id", "sample_id"))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)
    positive_samples = positive_samples.persist()

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
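
Downstream, the two sample sets are presumably stacked into one labeled training set. A sketch under the assumption that both DataFrames end up with compatible column names, which these examples alone do not guarantee.

from pyspark.sql.functions import lit

# Hypothetical downstream step: label each set and union them by name.
labeled = (positive_samples.withColumn("label", lit(1.0))
           .unionByName(negative_samples.withColumn("label", lit(0.0))))
labeled.groupBy("label").count().show()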