Example no. 1
def analyze(spark: SparkSession,
            file='/ships_data/PortSubset-small.csv',
            output_file='ais_data.parquet'):
    if not output_file:  # No path, use tmp
        output_file = tempfile.mkdtemp() + '/ais_data.parquet'
    prt_high("""
        Running csv ingestion
        ##################################
        Parameters
         - Input file:  {}
         - Output file: {}
        ##################################
        """.format(file, output_file))

    ships_table = spark.read.format('com.databricks.spark.csv')\
        .options(header='true', inferschema='false', delimiter=';').load(file)
    ships_table = clean_data(ships_table)
    ships_table = cast_data(ships_table)
    ships_table = ships_table.na.drop()
    ships_table = ensure_columns_type(ships_table)

    # New features

    ships_table = ships_table.withColumn('ais_navstatus', (when(
        col("sog") < 1, 'HOT').when(col("sog") <= 5, 'MAN').otherwise('CRU')))

    ships_table.write.mode('overwrite').parquet(output_file)

    # If we have mlflow, log the result
    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifacts(output_file, "ais_data.parquet")

    return
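
A minimal local invocation sketch for this job, assuming the module-level helpers used above (prt_high, clean_data, cast_data, ensure_columns_type) are importable alongside analyze; the session settings and paths are illustrative, not taken from the original project:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master('local[*]')
         .appName('csv_ingestion')
         .getOrCreate())
# Hypothetical paths; the defaults above point at /ships_data.
analyze(spark,
        file='/ships_data/PortSubset-small.csv',
        output_file='/tmp/ais_data.parquet')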
Example no. 2
def main(args):
    prt_info("Going to run Spark Job")
    # Change the working directory to this file's directory
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)

    # environment = {
    #     'PYSPARK_JOB_ARGS': ' '.join(args.job_args) if args.job_args else ''
    # }

    job_args = dict()
    if args.job_args:
        job_args_tuples = [arg_str.split('=') for arg_str in args.job_args]
        prt_info('job_args_tuples: %s' % job_args_tuples)
        job_args = {a[0]: a[1] for a in job_args_tuples}

    prt_info('\nRunning job %s...\njob args are %s\n'
             % (args.job_name, str(job_args)))

    # Start Spark: configure the builder first, then create the session
    builder = SparkSession.builder\
        .appName(args.job_name)\
        .config("spark.jars", args.extra_jars)

    if args.hdfs:
        builder = builder.config("spark.hadoop.fs.defaultFS", args.hdfs)

    spark = builder.getOrCreate()

    # Set timezone to UTC
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    prt_warn("-> For some reason we get the spark SQLContext deprecated\
             warning, but we're already using SparkSession. Ignore.")
    prt_info("Spark context started. Going to import jobs.")
    prt_info("Setting log level to {}".format(args.log_level))
    spark.sparkContext.setLogLevel(args.log_level)

    # Import Module
    try:
        job_module = importlib.import_module('jobs.%s' % args.job_name)
    except Exception as e:
        prt_err(str(e))
        prt_err("Error, couldn't load module %s" % args.job_name)
        sys.exit(1)

    # Execute Module
    start = time.time()
    job_module.analyze(spark, **job_args)
    end = time.time()
    prt_high("\nExecution of job %s took %s seconds"
             % (args.job_name, end-start))
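
For reference, a small sketch of what the key=value parsing above produces before it is splatted into job_module.analyze (the values here are illustrative):

job_args_strings = ['input_file=emissions.parquet', 'output_file=emissions.csv']
job_args_tuples = [arg_str.split('=') for arg_str in job_args_strings]
job_args = {a[0]: a[1] for a in job_args_tuples}
# job_args == {'input_file': 'emissions.parquet', 'output_file': 'emissions.csv'}
# job_module.analyze(spark, **job_args) receives them as keyword arguments,
# which is why the job functions in the other examples use keyword defaults.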
Example no. 3
def log_emission_summary(emis):
    # Add ME and AE
    emis = emis\
            .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae'))\
            .withColumn('nox', col('nox_me') + col('nox_ae'))\
            .withColumn('sox', col('sox_me') + col('sox_ae'))\
            .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log total
    all = emis.agg(
                sum(col('trans_p')).alias('total_trans_p'),
                sum(col('trans_p_me')).alias('total_trans_p_me'),
                sum(col('trans_p_ae')).alias('total_trans_p_ae'),
                sum(col('nox')).alias('total_nox'),
                sum(col('sox')).alias('total_sox'),
                sum(col('co2')).alias('total_co2'),
                sum(col('nox_me')).alias('total_nox_me'),
                sum(col('sox_me')).alias('total_sox_me'),
                sum(col('co2_me')).alias('total_co2_me'),
                sum(col('nox_ae')).alias('total_nox_ae'),
                sum(col('sox_ae')).alias('total_sox_ae'),
                sum(col('co2_ae')).alias('total_co2_ae')).toPandas()
    mlflow.log_metrics(all.iloc[0].to_dict())

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis\
        .withColumn('day', dayofyear(col('time')))\
        .withColumn('week', weekofyear(col('time')))\
        .withColumn('month', month(col('time')))\
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')

    prt_high("Generated summary. Logging it.")
    # Log everything
    log_dataframe_metric(day_df, 'day_day')
    log_dataframe_metric(week_df, 'week_week')
    log_dataframe_metric(month_df, 'month_month')
    log_dataframe_metric(dayofweek_df, 'dayofweek_dayofweek')
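
group_emis and log_dataframe_metric are not shown in these examples. Below is a hypothetical sketch of what a group_emis-style helper could look like, inferred only from how it is called here (the '<key>_<key>' column naming matches the x values passed to plot_summary in Example no. 10); the real helper may aggregate more columns:

from pyspark.sql import functions as F

def group_emis_sketch(emis, key):
    # Sum the combined emission columns per time feature and rename the key
    # column to '<key>_<key>' (e.g. 'day' -> 'day_day'). Hypothetical helper.
    renamed = '{}_{}'.format(key, key)
    return (emis.groupBy(key)
                .agg(F.sum('trans_p').alias('trans_p'),
                     F.sum('nox').alias('nox'),
                     F.sum('sox').alias('sox'),
                     F.sum('co2').alias('co2'))
                .withColumnRenamed(key, renamed)
                .orderBy(renamed))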
Example no. 4
def analyze(spark: SparkSession, input_file='emissions.parquet', hdfs_path='hdfs://', plot_path='../output'):
    prt_high(
            """
            Running Summarize Emissions
            ##################################
            Parameters
             - Input file: {}
             - Output HDFS path: {}
             - Output plot path: {}
            ##################################
            """.format(input_file, hdfs_path, plot_path)
            )

    # import os
    # os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"

    emis = spark.read.parquet(input_file)
    #log_emission_summary(emis)
    log_emission_summary_csv(emis, hdfs_path, plot_path)

    return
Example no. 5
def analyze(spark: SparkSession, file='/ships_data/IHSData.txt',
            output_file='ihs_metadata.parquet'):
    if not output_file:  # No path, use tmp
        output_file = tempfile.mkdtemp() + '/ihs_metadata.parquet'
    prt_high(
        """
        Running metadata ingestion
        ##################################
        Parameters
         - Input file:  {}
         - Output file: {}
        ##################################
        """.format(file, output_file))

    ihs_table = spark.read.csv(file, header=True, schema=schema, sep=',',
                               nullValue='NA')
    ihs_table = clean_data(ihs_table)
    ihs_table.write.mode('overwrite').parquet(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifacts(output_file, "ihs_data.parquet")

    return
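
The `schema` passed to spark.read.csv above is defined at module level and not shown. A hypothetical sketch of its shape, using only field names that appear in the column selections of Example no. 9; the real schema has more columns and possibly different types:

from pyspark.sql.types import (StructType, StructField, LongType, StringType,
                               DoubleType)

schema_sketch = StructType([
    StructField('imo',          LongType(),   True),
    StructField('type',         StringType(), True),
    StructField('hermes_type',  StringType(), True),
    StructField('me_rpm',       LongType(),   True),
    StructField('ae_rpm',       LongType(),   True),
    StructField('inst_pow_me',  DoubleType(), True),
    StructField('inst_pow_ae',  DoubleType(), True),
    StructField('design_speed', DoubleType(), True),
])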
Example no. 6
def analyze(spark: SparkSession,
            input_file='emissions.parquet',
            output_file='emissions.csv'):
    prt_high("""
            Running export to CSV
            ##################################
            Parameters
             - Input file: {}
             - Output file: {}
            ##################################
            """.format(input_file, output_file))

    df = spark.read.parquet(input_file)

    df.orderBy('time', 'imo')\
        .coalesce(1)\
        .write.csv(output_file, header=True, mode='overwrite')
    # Coalesce(1) so that we write one big csv.

    # Pandas version
    # pdf = df.orderBy('time', 'imo').toPandas()
    # pdf.to_csv(output_file)

    return
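
Note that df.write.csv(output_file, ...) writes a directory of part files; coalesce(1) only guarantees a single part inside it. A small sketch for picking up that one part file afterwards, assuming a local filesystem (an HDFS destination would need a different listing call):

import glob

def find_single_csv(output_dir):
    # With coalesce(1) there should be exactly one part-*.csv in the directory.
    parts = sorted(glob.glob(output_dir + '/part-*.csv'))
    return parts[0] if parts else None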
Example no. 7
def analyze(spark: SparkSession, input_file='/ships_data/IHSData.txt',
            hermes_file='/ships_data/day_summary.csv', model='',
            output_file='hermes_comparison.csv'):
    if not output_file:  # No path, use tmp
        output_file = tempfile.mkdtemp() + '/hermes_comparison.csv'
    prt_high(
        """
        Running comparison with HERMES
        ##################################
        Parameters
         - Input file:  {}
         - HERMES file:  {}
         - Model: {}
         - Output file: {}
        ##################################
        """.format(input_file, hermes_file, model, output_file))

    emis = spark.read.parquet(input_file)
    summ = emis_summary(emis)
    hermes = process_hermes(hermes_file)
    diff = emis_diff(summ, hermes, model)
    diff.to_csv(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifact(output_file, "hermes_comparison.csv")
        log_diff_by_type(diff)

        pols = ["NOx", "SOx", "CO2"]
        models = ["hermes", model]
        piv = pivot_diff_df(diff, models, pols)
        types = diff.type.unique()
        pol_barplot_by_type(piv, types, pols)
        mlflow.log_artifact("barplot")

    return
Example no. 8
def analyze(spark: SparkSession,
            input_file='rasters.parquet',
            output_file='/home/rasters.hdf5'):
    prt_high("""
            ##################################
            Parameters
             - Input file: {}
             - Output file: {}
            ##################################
            """.format(input_file, output_file))

    df = spark.read.parquet(input_file)

    # Metadata processing
    metadf = spark.read.parquet(input_file + ".meta").collect()[0]
    num_vars = metadf['num_vars']
    num_cols = metadf['num_cols']
    num_rows = metadf['num_rows']

    # 'last_amp_v', 'last_cos', 'last_sin')\
    by_time = df.select('hour', 'cell', 'sample_count', 'nox_me', 'nox_ae')\
        .rdd.map((lambda row: (row['hour'], row))).groupByKey()\
        .sortByKey(ascending=True).cache()

    n_rasters = by_time.count()

    filename = output_file
    f = h5py.File(filename, 'w', libver='latest')
    shape = (n_rasters, num_rows, num_cols, num_vars)
    dset = f.create_dataset('dataset',
                            shape,
                            dtype='f',
                            compression="gzip",
                            chunks=(1, num_rows, num_cols, num_vars))

    for i, raster_tuple in enumerate(by_time.toLocalIterator()):
        dset[i] = to_raster(raster_tuple[1], num_rows, num_cols, num_vars)

    # Per-cell aggregates: reduce over the time axis (axis 0)
    ds_min = np.min(dset, axis=(0))
    prt_info('min:', ds_min)
    ds_max = np.max(dset, axis=(0))
    prt_info('max:', ds_max)
    ds_mean = np.mean(dset, axis=(0))
    prt_info('mean:', ds_mean)
    ds_std = np.std(dset, axis=(0))
    prt_info('std:', ds_std)

    f.create_dataset('min', data=ds_min)
    f.create_dataset('max', data=ds_max)
    f.create_dataset('mean', data=ds_mean)
    f.create_dataset('std', data=ds_std)

    prt_info("Cell metadata shape:")
    prt_info(f['min'].shape)
    prt_info(f['max'].shape)
    prt_info(f['mean'].shape)
    prt_info(f['std'].shape)

    # Global aggregates
    ds_min = np.min(ds_min, axis=(0, 1))
    # We get min and max from the previous result!
    prt_info('min:', ds_min)

    ds_max = np.max(ds_max, axis=(0, 1))
    prt_info('max:', ds_max)

    ds_mean = np.mean(dset, axis=(0, 1, 2))
    prt_info('mean:', ds_mean)

    ds_std = np.std(dset, axis=(0, 1, 2))
    prt_info('std:', ds_std)

    dset.attrs['min'] = ds_min
    dset.attrs['max'] = ds_max
    dset.attrs['mean'] = ds_mean
    dset.attrs['std'] = ds_std
    f.close()
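
A short sketch of reading the file written above and standardizing one raster with the per-dataset statistics stored as attributes; the path matches the default output_file and the epsilon guard is illustrative:

import h5py

with h5py.File('/home/rasters.hdf5', 'r') as f:
    dset = f['dataset']                      # (n_rasters, rows, cols, vars)
    first_raster = dset[0]
    standardized = (first_raster - dset.attrs['mean']) \
        / (dset.attrs['std'] + 1e-9)         # epsilon avoids division by zero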
Example no. 9
def analyze(spark: SparkSession, input_data='ships_data.parquet',
            input_metadata='ships_metadata.parquet',
            output_file='emissions.parquet',
            step=60, interpolation_lim=15*60, unit="kg",
            sfoc="NAEI", model="STEAM", ae_on_lim=24*60*60):
    if not output_file:  # No path, use tmp
        output_file = tempfile.mkdtemp() + '/emissions.parquet'
    prt_high("""
    Running compute emissions
    ##################################
    Parameters
     - Input data: {}
     - Input metadata (IHS): {}
     - Output file: {}
     - Interpolation limit: {} (s)
     - Interpolation step: {} (s)
     - Aux. Eng. at berth limit: {} (s)
     - Unit: {}/{}s
     - SFOC: {}
     - Emission Model: {}
    ##################################
    """.format(input_data, input_metadata, output_file, interpolation_lim,
               step, ae_on_lim, unit, step, sfoc, model))
    # Rename stage
    if 'mlflow' in sys.modules:
        mlflow.set_tag(
            "mlflow.runName", "compute_emissions_{}_{}".format(model, sfoc))

    # Cast parameters
    interpolation_lim = int(interpolation_lim)
    ae_on_lim = int(ae_on_lim)
    step = int(step)

    if unit == "kg":
        prt_info("Setting unit to kilograms")
        unit = 1000
    else:
        prt_info("Setting unit to grams")
        unit = 1

    df = spark.read.parquet(input_data)
    ihs_df = spark.read.parquet(input_metadata)

    # IHS processing

    # Filter desired SFOC
    if sfoc == "NAEI":
        print("Using NAEI SFOC estimation")
        ihs_df = ihs_df.withColumnRenamed("naei_sfoc_me", "sfoc_me")\
            .withColumnRenamed("naei_sfoc_ae", "sfoc_ae")
    else:
        # Other is STEAM by default
        print("Using STEAM SFOC estimation")
        ihs_df = ihs_df.withColumnRenamed("steam_sfocbase_me", "sfoc_me")\
            .withColumnRenamed("steam_sfocbase_ae", "sfoc_ae")

    # Variable filtering and function preparation
    if model == "STEAM2":
        transientPowerMEFunc = udf(transient_power_me_steam2, FloatType())
        transientPowerAEFunc = udf(transient_power_ae_steam2, FloatType())

        ihs_df = ihs_df.select('imo',
                               'l',
                               'b',
                               't',
                               'qpc',
                               'wet_surf_k',
                               'wet_surf_a3',
                               'cr_nofn',
                               'n_screw',
                               'n_cabin',
                               'n_ref_teu',
                               'design_draft',
                               'waterline',
                               'type',
                               'hermes_type',
                               'me_rpm',
                               'ae_rpm',
                               'inst_pow_me',
                               'inst_pow_ae',
                               'design_speed',
                               'sfoc_me',
                               'sfoc_ae'
                               ).cache()
        # Which model is used? If there are any nulls in the selected attrs
        # we use STEAM (except for inst_pow)
        ihs_df = count_nulls_steam2(ihs_df)\
            .withColumn("model", (col("nulls") == 0).cast('integer')+1)

    else:   # Default is STEAM
        transientPowerMEFunc = udf(transient_power_me_steam, FloatType())
        transientPowerAEFunc = udf(transient_power_ae_steam, FloatType())

        ihs_df = ihs_df.select('imo',
                               'type',
                               'hermes_type',
                               'me_rpm',
                               'ae_rpm',
                               'inst_pow_me',
                               'inst_pow_ae',
                               'design_speed',
                               'sfoc_me',
                               'sfoc_ae'
                               ).cache()

    # Dataset joining and SFOC selection
    joined = df.select('nombre', 'imo', 'sog', 'latitude', 'longitude',
                       'time')\
               .join(ihs_df, ['imo'], 'inner')

    # Interpolation
    grouped = joined.rdd.groupBy(lambda record: record['imo'])
    interpolated = grouped.flatMap(
        lambda d: transform_grouped(d, step, interpolation_lim))
    # new_df = interpolated.toDF()

    if model == "STEAM2":
        interp_schema = StructType([
            StructField('imo',          LongType(),     True),
            StructField('nombre',       StringType(),   True),
            StructField('sog',          DoubleType(),   True),
            StructField('latitude',     DoubleType(),   True),
            StructField('longitude',    DoubleType(),   True),
            StructField('time',         LongType(),     True),
            StructField('l',            DoubleType(),   True),
            StructField('b',            DoubleType(),   True),
            StructField('t',            DoubleType(),   True),
            StructField('qpc',          DoubleType(),   True),
            StructField('wet_surf_k',   DoubleType(),   True),
            StructField('wet_surf_a3',  DoubleType(),   True),
            StructField('cr_nofn',      DoubleType(),   True),
            StructField('n_screw',      LongType(),     True),
            StructField('n_cabin',      LongType(),     True),
            StructField('n_ref_teu',    LongType(),     True),
            StructField('design_draft', BooleanType(),  True),
            StructField('waterline',    DoubleType(),   True),
            StructField('type',         StringType(),   True),
            StructField('hermes_type',  StringType(),   True),
            StructField('me_rpm',       LongType(),     True),
            StructField('ae_rpm',       LongType(),     True),
            StructField('inst_pow_me',  DoubleType(),   True),
            StructField('inst_pow_ae',  DoubleType(),   True),
            StructField('design_speed', DoubleType(),   True),
            StructField('sfoc_me',      LongType(),     True),
            StructField('sfoc_ae',      LongType(),     True),
            StructField('nulls',        LongType(),     True),
            StructField('model',        LongType(),     True),
            StructField('last_move',    LongType(),     True),
            StructField('d_lat',        DoubleType(),   True),
            StructField('d_lon',        DoubleType(),   True),
            StructField('amp_v',        DoubleType(),   True)
        ])
    else:
        # Steam 1
        interp_schema = StructType([
            StructField('imo',          LongType(),     True),
            StructField('nombre',       StringType(),   True),
            StructField('sog',          DoubleType(),   True),
            StructField('latitude',     DoubleType(),   True),
            StructField('longitude',    DoubleType(),   True),
            StructField('time',         LongType(),     True),
            StructField('type',         StringType(),   True),
            StructField('hermes_type',  StringType(),   True),
            StructField('me_rpm',       LongType(),     True),
            StructField('ae_rpm',       LongType(),     True),
            StructField('inst_pow_me',  DoubleType(),   True),
            StructField('inst_pow_ae',  DoubleType(),   True),
            StructField('design_speed', DoubleType(),   True),
            StructField('sfoc_me',      LongType(),     True),
            StructField('sfoc_ae',      LongType(),     True),
            StructField('last_move',    LongType(),     True),
            StructField('d_lat',        DoubleType(),   True),
            StructField('d_lon',        DoubleType(),   True),
            StructField('amp_v',        DoubleType(),   True)
        ])

    new_df = spark.createDataFrame(data=interpolated, schema=interp_schema)

    # Setting the schema for the new data
    # new_df = change_column_type(new_df, 'time', IntegerType(), True)
    # new_df = change_column_type(new_df, 'latitude', FloatType(), True)
    # new_df = change_column_type(new_df, 'longitude', FloatType(), True)
    # new_df = change_column_type(new_df, 'sog', FloatType(), True)
    # new_df = change_column_type(new_df, 'imo', IntegerType(), True)
    # new_df = change_column_type(new_df, 'd_lat', FloatType(), True)
    # new_df = change_column_type(new_df, 'd_lon', FloatType(), True)
    # new_df = change_column_type(new_df, 'amp_v', FloatType(), True)

    if model == "STEAM2":
        # Transient power calculation
        new_df = new_df.withColumn(
            'trans_p_me', transientPowerMEFunc(
                new_df['model'], new_df['sog'], new_df['design_speed'],
                new_df['inst_pow_me'], new_df['l'], new_df['b'], new_df['t'],
                new_df['qpc'], new_df['wet_surf_k'], new_df['wet_surf_a3'],
                new_df['cr_nofn'], new_df['n_screw'], new_df['design_draft'],
                new_df['waterline'])
        )
        new_df = new_df.withColumn(
            'trans_p_ae', transientPowerAEFunc(
                new_df['sog'], new_df['type'], new_df['inst_pow_ae'],
                new_df['n_cabin'], new_df['n_ref_teu'])
        )
    else:
        # Transient power calculation
        new_df = new_df.withColumn(
            'trans_p_me', transientPowerMEFunc(
                new_df['sog'], new_df['design_speed'], new_df['inst_pow_me'])
        )
        new_df = new_df.withColumn(
            'trans_p_ae', transientPowerAEFunc(
                new_df['sog'], new_df['type'], new_df['inst_pow_ae'])
        )

    # Deactivate AE if the ship has been at berth more than 24h
    if ae_on_lim > 0:
        new_df = new_df.withColumn(
                "trans_p_ae",
                when(col('last_move') < ae_on_lim, col("trans_p_ae"))
                .otherwise(0))

    calcSOxEmissionFactorFunc = udf(calcSOxEmissionFactor, FloatType())
    calcCO2EmissionFactorFunc = udf(calcCO2EmissionFactor, FloatType())
    calcNOxEmissionFactorFunc = udf(calcNOxEmissionFactor, FloatType())

    # TODO: Maybe this shouldn't be a UDF
    estimateEmissionFunc = udf(
            lambda fact, pow: estimateEmission(fact, pow, step, unit),
            FloatType())

    # Emission factor calculation
    # TODO: Move this to R script
    new_df = new_df.withColumn(
            'sox_fact_me', calcSOxEmissionFactorFunc(new_df['sfoc_me']))
    new_df = new_df.withColumn(
            'sox_fact_ae', calcSOxEmissionFactorFunc(new_df['sfoc_ae']))
    new_df = new_df.withColumn(
            'co2_fact_me', calcCO2EmissionFactorFunc(new_df['sfoc_me']))
    new_df = new_df.withColumn(
            'co2_fact_ae', calcCO2EmissionFactorFunc(new_df['sfoc_ae']))
    new_df = new_df.withColumn(
            'nox_fact_me', calcNOxEmissionFactorFunc(new_df['me_rpm']))
    new_df = new_df.withColumn(
            'nox_fact_ae', calcNOxEmissionFactorFunc(new_df['ae_rpm']))

    # Emission calculation
    new_df = new_df.withColumn(
            'sox_me', estimateEmissionFunc(
                new_df['sox_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
            'sox_ae', estimateEmissionFunc(
                new_df['sox_fact_ae'], new_df['trans_p_ae']))
    new_df = new_df.withColumn(
            'co2_me', estimateEmissionFunc(
                new_df['co2_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
            'co2_ae', estimateEmissionFunc(
                new_df['co2_fact_ae'], new_df['trans_p_ae']))
    new_df = new_df.withColumn(
            'nox_me', estimateEmissionFunc(
                new_df['nox_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
            'nox_ae', estimateEmissionFunc(
                new_df['nox_fact_ae'], new_df['trans_p_ae']))

    new_df = ensure_columns_type(new_df)
    new_df.write.mode('overwrite').parquet(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        prt_high("- emissions.parquet")
        mlflow.log_artifacts(output_file, "emissions.parquet")
        prt_high("- emissions summary")
        log_emission_summary(new_df)

    return
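
estimateEmission is defined elsewhere; below is a hypothetical sketch of the per-sample estimate it likely performs, assuming emission factors in g/kWh and transient power in kW over a step of `step` seconds, scaled by `unit` (1000 for kg, 1 for g). The real function may handle units or null values differently:

def estimate_emission_sketch(factor, power, step, unit):
    # Hypothetical: factor [g/kWh] * power [kW] * duration [h] / unit scaling.
    if factor is None or power is None:
        return None
    return float(factor * power * (step / 3600.0) / unit)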
Example no. 10
def log_emission_summary_csv(emis, hdfs_path, plot_path):
    # Add ME and AE
    emis = emis\
            .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae'))\
            .withColumn('nox', col('nox_me') + col('nox_ae'))\
            .withColumn('sox', col('sox_me') + col('sox_ae'))\
            .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log total
    all = emis.agg(
                sum(col('trans_p')).alias('total_trans_p'),
                sum(col('trans_p_me')).alias('total_trans_p_me'),
                sum(col('trans_p_ae')).alias('total_trans_p_ae'),
                sum(col('nox')).alias('total_nox'),
                sum(col('sox')).alias('total_sox'),
                sum(col('co2')).alias('total_co2'),
                sum(col('nox_me')).alias('total_nox_me'),
                sum(col('sox_me')).alias('total_sox_me'),
                sum(col('co2_me')).alias('total_co2_me'),
                sum(col('nox_ae')).alias('total_nox_ae'),
                sum(col('sox_ae')).alias('total_sox_ae'),
                sum(col('co2_ae')).alias('total_co2_ae'))

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis\
        .withColumn('day', dayofyear(col('time')))\
        .withColumn('week', weekofyear(col('time')))\
        .withColumn('month', month(col('time')))\
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')

    prt_high("Generated summary. Logging it.")

    def save_csv(df, path):
        df\
            .coalesce(1)\
            .write.csv(path, header=True, mode='overwrite')

    # Save CSVs to HDFS; coalesce(1) ensures each CSV is written as one part file
    save_csv(all, hdfs_path+"/emis.csv")
    save_csv(day_df, hdfs_path+"/day.csv")
    save_csv(week_df, hdfs_path+"/week.csv")
    save_csv(month_df, hdfs_path+"/month.csv")
    save_csv(dayofweek_df, hdfs_path+"/dayofweek.csv")

    # Make directories
    mkdir_if_not_exist(plot_path)
    mkdir_if_not_exist(plot_path+"/plot")
    mkdir_if_not_exist(plot_path+"/plot/day")
    mkdir_if_not_exist(plot_path+"/plot/week")
    mkdir_if_not_exist(plot_path+"/plot/month")
    mkdir_if_not_exist(plot_path+"/plot/dayofweek")

    # Save plots in the container
    plot_summary(day_df.toPandas(), plot_path+"/plot/day", x="day_day")
    plot_summary(week_df.toPandas(), plot_path+"/plot/week", x="week_week")
    plot_summary(month_df.toPandas(), plot_path+"/plot/month", x="month_month")
    plot_summary(
            dayofweek_df.toPandas(), plot_path+"/plot/dayofweek",
            x="dayofweek_dayofweek")
Example no. 11
def analyze(spark: SparkSession,
            input_file='emissions.parquet',
            output_file='rasters.parquet',
            time_granularity=600,
            num_cols=100,
            cell_size=None,
            use_type=False):
    prt_high("""
            Running emission raster generation
            ##################################
            Parameters
             - Input file: {}
             - Output file: {}
             - Time granularity: {}
             - Cell size: {}
             - Number of raster columns: {}
             - Rasters by type: {}
            ##################################

            """.format(input_file, output_file, time_granularity, cell_size,
                       num_cols, use_type))

    cell_size_meters = None
    # Process parameters
    if cell_size is not None:
        prt_high("Info: Cell size set in parameters, using it")
        if cell_size[-1] == 'm':  # Meters
            cell_size_meters = int(cell_size[:-1])
            cell_size = meters_to_deg(cell_size_meters)
            # All except the last char
        else:
            cell_size = float(cell_size)  # degrees are usually fractional
    else:
        num_cols = int(num_cols)
    time_granularity = int(time_granularity)

    # TODO: FILL NUM_VARS automatically
    prt_warn("WARNING: Number of variables is manually set to 10")
    num_vars = 10

    df = spark.read.parquet(input_file)

    min_max_lat_lon = df.agg(F.min(df.latitude), F.max(df.latitude),
                             F.min(df.longitude), F.max(df.longitude))
    min_max_time = df.agg(F.min(df.time), F.max(df.time))

    min_max_row = min_max_lat_lon.first()
    min_lat = min_max_row[0]
    max_lat = min_max_row[1]
    min_lon = min_max_row[2]
    max_lon = min_max_row[3]

    min_max_time = min_max_time.first()
    min_time = min_max_time[0]
    max_time = min_max_time[1]

    if cell_size is None:
        # Calculate cell dimension and number of rows using the number of
        # columns defined
        cell_size = (max_lon - min_lon) / num_cols
    else:
        num_cols = int(np.ceil((max_lon - min_lon) / cell_size))

    num_rows = int(np.ceil((max_lat - min_lat) / cell_size))
    prt_info("NUM COLS: " + str(num_cols))
    prt_info("NUM ROWS: " + str(num_rows))
    prt_info("CELL DIM: " + str(cell_size))

    # Create metadata file
    prt_info("Building metadata")
    meta = [(num_cols, num_rows, num_vars, min_lat, max_lat, min_lon, max_lon,
             cell_size, cell_size_meters, min_time, max_time, time_granularity)
            ]
    rdd = spark.sparkContext.parallelize(meta)
    metarow = rdd.map(lambda x: Row(num_cols=int(x[0]),
                                    num_rows=int(x[1]),
                                    num_vars=int(x[2]),
                                    min_lat=float(x[3]),
                                    max_lat=float(x[4]),
                                    min_lon=float(x[5]),
                                    max_lon=float(x[6]),
                                    cell_size=float(x[7]),
                                    cell_size_meters=float(x[8]),
                                    min_time=int(x[9]),
                                    max_time=int(x[10]),
                                    time_granularity=int(x[11])))
    metadf = spark.createDataFrame(metarow)

    # Create rasters
    prt_info("Building rasters")
    # TODO: Implement a way to define cell size
    to_cell = partial(lat_lon_to_cell, min_lon, min_lat, num_cols, num_rows,
                      cell_size)
    convertToCellFunc = udf(to_cell, IntegerType())

    to_hours = partial(to_time_resolution, time_granularity)
    convertToHours = udf(to_hours, IntegerType())

    df_cell = df.withColumn('cell',
                            convertToCellFunc(df['longitude'], df['latitude']))
    df_cell = df_cell.withColumn('hour', convertToHours(df['time']))

    # use desc order and first to get the last value
    # https://stackoverflow.com/questions/43114445/how-to-use-first-and-last-function-in-pyspark
    w = Window().partitionBy('imo', 'hour').orderBy(df_cell.time.desc())

    # lower than any possible sin/cos value so max only gets valid values

    df_cell = df_cell.withColumn('last_time', first("time").over(w))
    # PLACEHOLDER = -9999.0
    # df_cell = df_cell.withColumn('last_amp_v',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['amp_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_cos',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['cos_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_sin',
    #                             when(df_cell['last_time'] == df_cell['time'],
    #                                  df_cell['sin_v']).otherwise(PLACEHOLDER))

    # potential bug: max(last_amp_v), max('last_cos'), max('last_sin'). Each
    # may be from different ships if they happen to be in the same raster cell
    if use_type:
        raster = df_cell.groupBy("cell", "hour", "type")
    else:
        raster = df_cell.groupBy("cell", "hour")

    raster = raster.agg(
        F.sum("sox_me").alias("sox_me"),
        F.sum("sox_ae").alias("sox_ae"),
        F.sum('co2_me').alias('co2_me'),
        F.sum("co2_ae").alias("co2_ae"),
        F.sum("nox_me").alias("nox_me"),
        F.sum('nox_ae').alias('nox_ae'),
        # F.max('last_amp_v').alias('last_amp_v'),
        # F.max('last_cos').alias('last_cos'),
        # F.max('last_sin').alias('last_sin'),
        F.count('*').alias('sample_count'))

    # Lower limit for these attributes
    # raster = raster.withColumn(
    #    'last_amp_v', when(raster['last_amp_v'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_amp_v']))
    # raster = raster.withColumn(
    #    'last_cos', when(raster['last_cos'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_cos']))
    # raster = raster.withColumn(
    #    'last_sin', when(raster['last_sin'] <= (PLACEHOLDER + 1), 0)
    #    .otherwise(raster['last_sin']))

    raster = change_column_type(raster,
                                'sample_count',
                                IntegerType(),
                                force=True)
    raster = change_column_type(raster, 'sox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'sox_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_me', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_ae', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_amp_v', FloatType(),
    #                             force=True)
    # raster = change_column_type(raster, 'last_cos', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_sin', FloatType(), force=True)

    raster = ensure_columns_type(raster)

    # Write rasters
    raster.write.mode('overwrite').parquet(output_file)

    # Write metadata
    metadf.write.mode('overwrite').parquet(output_file + '.meta')
    return
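
lat_lon_to_cell and to_time_resolution are not shown. Hypothetical sketches of what the UDFs above could wrap, matching the partial() argument order used in this example: a row-major cell index on a regular grid anchored at the bounding-box minimum, and a time bucketing by granularity. The real implementations may differ, for instance in edge handling:

def lat_lon_to_cell_sketch(min_lon, min_lat, num_cols, num_rows, cell_size,
                           lon, lat):
    # Clamp to the last column/row so points on the max edge stay in range.
    col_idx = min(int((lon - min_lon) / cell_size), num_cols - 1)
    row_idx = min(int((lat - min_lat) / cell_size), num_rows - 1)
    return row_idx * num_cols + col_idx

def to_time_resolution_sketch(time_granularity, timestamp):
    # Bucket a UNIX timestamp into bins of time_granularity seconds.
    return int(timestamp // time_granularity)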
Example no. 12
def analyze(spark: SparkSession,
            input_data='emissions.parquet',
            db='ais',
            table='emis',
            host='localhost',
            port=5431,
            user='******',
            passwd='pass',
            table_type='ais',
            time_col='time',
            lon='longitude',
            lat='latitude',
            idx_fields="(imo, type)",
            ihs_table="ihs"):
    prt_high("""
            Running export to PostgreSQL
            ##################################
            Parameters
             - Input file: {}
             - User: {}
             - Password: Censored :P
             - Database: {}
             - Host: {}
             - Table: {} (type: {})
             - Port: {}
             - Time column: {}
             - Latitude: {}
             - Longitude: {}
             - Idx Fields: {}
             - IHS table: {}
            ##################################
            """.format(input_data, user, db, host, table, table_type, port,
                       time_col, lat, lon, str(idx_fields), ihs_table))
    # Rename stage
    if 'mlflow' in sys.modules:
        mlflow.set_tag("mlflow.runName", "export_postgis_{}".format(table))

    # Connections
    mode = "overwrite"
    url = "jdbc:postgresql://{}:{}/{}".format(host, port, db)
    properties = {
        "user": user,
        "password": passwd,
        "driver": "org.postgresql.Driver",
    }

    prt_info("Processing parquet file")
    emis = spark.read.parquet(input_data)

    conn = psycopg2.connect(
        "dbname='{}' user='******' host='{}' password='******' port={}".format(
            db, user, host, passwd, port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)  # Allow CREATE INDEX

    # Remove materialized view if exists
    if table_type == "ais" or table_type == "ihs":
        clean_table_and_derived(conn, table)

    # Write table
    if table_type != "ihs":
        emis = emis.withColumn(time_col,
                               col(time_col).cast(dataType=t.TimestampType()))
    prt_info("Exporting to JDBC")
    emis.write.jdbc(url=url, table=table, mode=mode, properties=properties)

    # Create indices
    conn = psycopg2.connect(
        "dbname='{}' user='******' host='{}' password='******' port={}".format(
            db, user, host, passwd, port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)  # Allow CREATE INDEX

    if table_type != "ihs":
        add_indices(conn, table, idx_fields)
        add_geometry_field(conn, table, lon, lat)
        add_in_port_attr(conn, table)
        if table_type == "ais":
            create_ais_info_view(conn, table, ihs_table)

    conn.close()

    return
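
add_indices, add_geometry_field, add_in_port_attr and the view helpers are not shown. A hypothetical sketch of an add_indices-style helper, run on the autocommit connection created above; identifiers are formatted directly into the SQL, which is only acceptable here because they come from trusted job parameters:

def add_indices_sketch(conn, table, idx_fields="(imo, type)"):
    # Create one B-tree index per field listed in idx_fields, e.g. "(imo, type)".
    fields = [f.strip() for f in idx_fields.strip("()").split(",")]
    with conn.cursor() as cur:
        for field in fields:
            cur.execute(
                'CREATE INDEX IF NOT EXISTS idx_{t}_{f} ON {t} ({f})'
                .format(t=table, f=field))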
Example no. 13
def analyze(spark: SparkSession,
            input_file='rasters.parquet',
            output_folder='/tmp/img',
            units='kg'):
    prt_high("""
            Running image generation.")
            ##################################")
            Parameters")
             - Input file: {}
             - Output folder: {}
            """.format(input_file, output_folder))

    df = spark.read.parquet(input_file)
    meta = spark.read.parquet(input_file + '.meta').toPandas()

    pol_vars = ["sox_me", "sox_ae", "co2_me", "co2_ae", "nox_me", "nox_ae"]
    pol_vars.extend(['hour', 'cell'])

    n_cols = meta.num_cols[0]
    n_rows = meta.num_rows[0]

    # Define the transformation of data (Bounding box)
    transform = from_origin(meta.min_lon[0], meta.max_lat[0],
                            meta.cell_size[0], meta.cell_size[0])

    # Produce a GeoTIFF and PDF per pollutant (Including ME, AE and
    # joint(ME+AE))
    timestamps = df.select('hour').distinct().collect()
    for t in timestamps:
        # Generate the rasters for this timestep
        timestamp = t['hour']
        data = df.select(pol_vars).filter(df.hour == t['hour']).toPandas()
        r = pandas_to_raster(data, pol_vars, n_rows, n_cols)
        if units == 'kg':
            r = r / (1000 * meta.cell_size_meters[0]**2)
        prt_info("Calculated raster sum: ", r.sum(axis=(0, 1, 2)))
        # joint raster ME+AE
        # TODO: r_me_ae could be allocated once, outside this loop
        pol_vars_me_ae = ['sox_', 'co2_', 'nox_']
        shp = list(r.shape)
        shp[2] = shp[2] // 2
        shp = tuple(shp)
        r_me_ae = np.zeros(shp, dtype=np.float32)

        for i in range(0, len(pol_vars_me_ae)):
            r_me_ae[:, :, i] = r[:, :, i * 2] + r[:, :, i * 2 + 1]

        # Save the rasters to GeoTIFF
        file_path = output_folder + '/' + str(timestamp) + '/'
        try:
            os.makedirs(file_path)
        except OSError as e:
            prt_warn(str(e))
            prt_warn("Warning: folder exists" + file_path)
        for p in range(0, len(pol_vars)):
            raster_path = file_path + pol_vars[p]
            create_band_tiff(r, transform, p, raster_path + '.tif')

        for i in range(0, len(pol_vars_me_ae)):
            raster_path = file_path + pol_vars_me_ae[i]
            create_band_tiff(r_me_ae, transform, i, raster_path + '.tif')

    return
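
create_band_tiff is not shown; since the bounding-box transform above presumably comes from rasterio's from_origin, a hypothetical rasterio-based sketch of such a writer (the CRS is an assumption, EPSG:4326 for lat/lon degrees):

import rasterio

def create_band_tiff_sketch(raster, transform, band_idx, path, crs='EPSG:4326'):
    # Write one band of a (rows, cols, bands) array as a single-band GeoTIFF.
    band = raster[:, :, band_idx]
    with rasterio.open(path, 'w', driver='GTiff',
                       height=band.shape[0], width=band.shape[1],
                       count=1, dtype=band.dtype, crs=crs,
                       transform=transform) as dst:
        dst.write(band, 1)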
Example no. 14
if __name__ == "__main__":
    try:
        # The opening lines of this snippet are reconstructed (the original
        # starts mid help-string); the '--job' flag name is an assumption,
        # but the dest names match how main() uses them.
        parser = argparse.ArgumentParser(description="Run a PySpark job")
        parser.add_argument(
            '--job', type=str, required=True, dest='job_name',
            help="The name of the job module to run (under jobs/).")
        parser.add_argument(
            '--job-args', nargs='*', dest='job_args',
            help="Extra key=value arguments to pass to the "
                 "job (example: --job-args template=manual-email1 foo=bar)")
        parser.add_argument(
            '--log', type=str, dest='log_level', default='WARN',
            help="Level of Spark logging (default = WARN).")
        parser.add_argument(
            '--extra-jars', type=str, dest='extra_jars', default='',
            help="Extra java jars to be added")
        parser.add_argument(
            '--hdfs', type=str, dest='hdfs', default='',
            help="HDFS endpoint")


        args = parser.parse_args()
        prt_info("Called with arguments: %s" % args)

        # Run main
        if 'mlflow' in sys.modules:
            prt_high("- Running with MLFlow")
            mlflow.start_run()  # Setting run_name in start run doesn't work
            mlflow.set_tag("mlflow.runName", args.job_name)
        main(args)
        if 'mlflow' in sys.modules:
            prt_high("MLFlow: Shutting down run.")
            mlflow.end_run()
    except KeyboardInterrupt:
        print('Interrupted :(')
        try:
            sys.exit(130)
        except SystemExit:
            os._exit(130)