# sc = SparkContext("local[*]", "Api_Spark_DF")

import pyspark.sql.types as ty


# function to save to RDBMS
def save_to_rdbms(tb_df, table, user, password, driver_format, url):
    tb_df.write \
        .format(driver_format) \
        .mode('overwrite') \
        .option('url', url) \
        .option('user', user) \
        .option('password', password) \
        .option('dbtable', table) \
        .save()
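
# A minimal usage sketch (hypothetical DataFrame, table and connection details;
# the matching JDBC driver jar must be on the Spark classpath for the write to work):
# save_to_rdbms(
#     tb_df=stocks_df,
#     table="stocks",
#     user="spark_user",
#     password="secret",
#     driver_format="jdbc",
#     url="jdbc:postgresql://localhost:5432/market",
# )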

Schema = ty.StructType() \
         .add("date", ty.FloatType()) \
         .add("close", ty.FloatType())

# Schema = ty.StructType() \
#          .add("Date", ty.MapType(ty.StringType(), ty.StructType() \
#          .add("open", ty.FloatType()) \
#          .add("high", ty.FloatType()) \
#          .add("low", ty.FloatType()) \
#          .add("close", ty.FloatType()) \
#          .add("volume", ty.IntegerType())))

# data_schema = [StructField('open',FloatType(),True),
#  StructField('high',FloatType(),True),
#  StructField('low',FloatType(),True),
#  StructField('close',FloatType(),True),
#  StructField('volume',IntegerType(),True)]
Example no. 2

# Imports assumed by this snippet (the surrounding module also defines TARGET_DIR
# and the TaskState/Task/Resource/ResourceState/Workflow/Workload helpers):
import json
import os
import re
from functools import reduce

import findspark
import mmh3
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

def parse(path_to_dir):
    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    # Convert times which are in microseconds and do not fit in a long to milliseconds
    convert_micro_to_milliseconds = F.udf(lambda x: x / 1000)
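    # Note: F.udf without an explicit returnType defaults to StringType, which is
    # why the converted columns are cast back to LongType after the UDF is applied.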

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\n Start parsing TaskState\n ######")
        task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
            os.path.join(path_to_dir, 'task_usage', '*.csv'))
        # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
        #     'fake_task_usage.csv')
        oldColumns = task_usage_df.schema.names
        newColumns = ["ts_start",
                      "ts_end",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "cpu_rate",
                      "memory_consumption",
                      "assigned_memory_usage",
                      "unmapped_page_cache",
                      "total_page_cache",
                      "max_memory_usage",
                      "mean_disk_io_time",
                      "mean_local_disk_space_usage",
                      "max_cpu_rate",
                      "max_disk_io_time",
                      "cycles_per_instruction",
                      "memory_accesses_per_instruction",
                      "sample_portion",
                      "aggregation_type",
                      "sampled_cpu_usage", ]

        task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                               range(len(oldColumns)), task_usage_df)

        # Drop columns with overly low-level details
        task_usage_df = task_usage_df.drop('memory_accesses_per_instruction')
        task_usage_df = task_usage_df.drop('cycles_per_instruction')
        task_usage_df = task_usage_df.drop('unmapped_page_cache')
        task_usage_df = task_usage_df.drop('total_page_cache')

        # Convert the timestamps from micro to milliseconds and cast them to long.
        task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start')))
        task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end')))
        task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType()))

        # Some fields have weird symbols in them, clean those.
        truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub(r'[^0-9.eE\-+]', '', str(x)) if x is not None else x)
        task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id')))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate')))

        # Now that the columns have been sanitized, cast them to the right type
        task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType()))

        task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\n Done parsing TaskState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):

        if 'task_usage_df' not in locals():
            task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path()))

        print("######\n Start parsing Tasks\n ######")
        task_df = spark.read.format('com.databricks.spark.csv').options(inferschema="true", mode="FAILFAST",
                                                                        parserLib="univocity").load(
            os.path.join(path_to_dir, 'task_events', '*.csv'))

        oldColumns = task_df.schema.names
        newColumns = ["ts_submit",
                      "missing_info",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "event_type",
                      "user_id",
                      "scheduler",
                      "nfrs",
                      "resources_requested",
                      "memory_requested",
                      "disk_space_request",
                      "machine_restrictions", ]

        task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                         range(len(oldColumns)), task_df)

        task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit')))
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType()))

        # Filter tasks that never reached completion
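        # In the Google cluster trace, event_type 0 = SUBMIT, 1 = SCHEDULE and 4 = FINISH,
        # so a task is kept only when all three events are present for its (workflow_id, id).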
        task_df.createOrReplaceTempView("task_table")
        task_df = spark.sql("""WITH filtered_tasks AS (
        SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id
            FROM task_table t1
            WHERE t1.event_type IN(0, 1, 4)
            group by t1.workflow_id, t1.id
            having count(distinct event_type) = 3
        )
    SELECT t.*
    FROM task_table t INNER JOIN filtered_tasks f
    ON t.id = f.id AND t.workflow_id = f.workflow_id""")

        task_aggregation_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("type", T.StringType(), True),
            T.StructField("ts_submit", T.LongType(), True),
            T.StructField("submission_site", T.LongType(), True),
            T.StructField("runtime", T.LongType(), True),
            T.StructField("resource_type", T.StringType(), True),
            T.StructField("resource_amount_requested", T.DoubleType(), True),
            T.StructField("parents", T.ArrayType(T.LongType()), True),
            T.StructField("children", T.ArrayType(T.LongType()), True),
            T.StructField("user_id", T.LongType(), True),
            T.StructField("group_id", T.LongType(), True),
            T.StructField("nfrs", T.StringType(), True),
            T.StructField("wait_time", T.LongType(), True),
            T.StructField("params", T.StringType(), True),
            T.StructField("memory_requested", T.DoubleType(), True),
            T.StructField("network_io_time", T.DoubleType(), True),
            T.StructField("disk_space_requested", T.DoubleType(), True),
            T.StructField("energy_consumption", T.DoubleType(), True),
            T.StructField("resource_used", T.StringType(), True),
        ])

        # Compute based on the event type
        @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_aggregated_task_usage_metrics(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True)
            task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True)
            task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True)

            if None in [task_start_time, task_submit_time, task_finish_time]:
                return None

            task_resource_request = df['resources_requested'].max(skipna=True)
            task_memory_request = df['memory_requested'].max(skipna=True)
            task_priority = df['nfrs'].max(skipna=True)
            task_disk_space_requested = df['disk_space_request'].max(skipna=True)

            task_machine_id_list = df.resource_id.unique()

            task_waittime = int(task_start_time) - int(task_submit_time)
            task_runtime = int(task_finish_time) - int(task_start_time)

            def default(o):
                if isinstance(o, np.int64):
                    return int(o)

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "type": "",  # Unknown
                "ts_submit": task_submit_time,
                "submission_site": -1,  # Unknown
                "runtime": task_runtime,
                "resource_type": "core",  # Fields are called CPU, but they are core count (see Google documentation)
                "resource_amount_requested": task_resource_request,
                "parents": [],
                "children": [],
                "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0],
                "group_id": -1,
                "nfrs": json.dumps({"priority": task_priority}, default=default),
                "wait_time": task_waittime,
                "params": "{}",
                "memory_requested": task_memory_request,
                "network_io_time": -1,  # Unknown
                "disk_space_requested": task_disk_space_requested,
                "energy_consumption": -1,  # Unknown
                "resource_used": json.dumps(task_machine_id_list, default=default),
            }

            return pd.DataFrame(data_dict, index=[0])

        task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics)
        task_df.explain(True)

        # Now add disk IO time - This cannot be done in the previous Pandas UDF function as
        # accessing another dataframe in the apply function is not allowed
        disk_io_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("disk_io_time", T.DoubleType(), True),
        ])

        @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_disk_io_time(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000
            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "disk_io_time": disk_io_time
            }

            return pd.DataFrame(data_dict, index=[0])

        disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy(
            ["workflow_id", "id"]).apply(compute_disk_io_time)
        disk_io_df.explain(True)

        join_condition = (task_df.workflow_id == disk_io_df.workflow_id) & (task_df.id == disk_io_df.id)
        task_df = task_df.join(disk_io_df, ["workflow_id", "id"])

        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\n Done parsing Tasks\n ######")
    else:
        task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource\n ######")
        # Parse the machine information in the traces; these should match the resource_ids in task_usage
        resources_structtype = T.StructType([  # Using StringTypes as we drop those columns
            T.StructField("time", T.StringType(), False),
            T.StructField("id", T.LongType(), False),
            T.StructField("attribute_name", T.StringType(), False),
            T.StructField("attribute_value", T.StringType(), False),
            T.StructField("attribute_deleted", T.StringType(), False),
        ])

        resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv'))

        resource_df = resource_df.select(["id"])  # Only keep the ID, the rest we do not need.

        # Since the information in the traces is completely opaque, we use the educated guess from Amvrosiadis et al.
        # in their ATC 2018 article.
        resource_df = resource_df.withColumn('type', F.lit("core"))
        resource_df = resource_df.withColumn('num_resources', F.lit(8))
        resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona"))
        resource_df = resource_df.withColumn('memory', F.lit(-1))
        resource_df = resource_df.withColumn('disk_space', F.lit(-1))
        resource_df = resource_df.withColumn('network', F.lit(-1))
        resource_df = resource_df.withColumn('os', F.lit(""))
        resource_df = resource_df.withColumn('details', F.lit("{}"))

        # Write the resource_df to the specified location
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Resource\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())):
        print("######\n Start parsing ResourceState\n ######")
        resource_events_structtype = T.StructType([
            T.StructField("timestamp", T.DecimalType(20, 0), False),
            T.StructField("machine_id", T.LongType(), False),
            T.StructField("event_type", T.IntegerType(), False),
            T.StructField("platform_id", T.StringType(), False),
            T.StructField("available_resources", T.FloatType(), False),
            T.StructField("available_memory", T.FloatType(), False),
        ])

        resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv'))

        resource_event_df = resource_event_df.withColumn('timestamp', convert_micro_to_milliseconds(F.col('timestamp')))
        resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType()))

        resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1))

        # Write the resource_df to the specified location
        resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite",
                                        compression="snappy")
        print("######\n Done parsing ResourceState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\n Start parsing Workflows\n ######")
        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1  # We do not know the task dependencies, so -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = ""
            total_resources = df['resource_amount_requested'].sum()  # TODO or assigned?
            total_memory_usage = df['memory_requested'].sum()  # TODO or consumption, or assigned?
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id, "ts_submit": ts_submit, 'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks, 'nfrs': nfrs, 'scheduler': scheduler,
                'total_resources': total_resources, 'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage, 'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        # Create and write the workflow dataframe
        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)

        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Workflows\n ######")

    print("######\n Start parsing Workload\n ######")
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df,
                                                                 domain="Industrial",
                                                                 start_date="2011-05-01",
                                                                 end_date="2011-05-30",
                                                                 authors=["Google"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")
Example no. 3
REG = 0.1

lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG)

tfidf.show(5)

lrModel = lr.fit(tfidf.limit(5000))

res_train = lrModel.transform(tfidf)

res_train.select("id", "toxic", "probability", "prediction").show(20)

res_train.show(5)

extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
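# For Spark ML's binary logistic regression, "probability" is a two-element vector
# [P(label=0), P(label=1)]; extract_prob keeps only the positive-class probability.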

(res_train.withColumn("proba", extract_prob("probability")).select(
    "proba", "prediction").show())

test_tokens = tokenizer.transform(test)
test_tf = hashingTF.transform(test_tokens)
test_tfidf = idfModel.transform(test_tf)

test_res = test.select('id')
test_res.head()

test_probs = []
for col in out_cols:
    print(col)
    lr = LogisticRegression(featuresCol="features", labelCol=col, regParam=REG)
Example no. 4
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DF with what we need: x, y, and id%10, which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
    )
    #with_bins.show(); return

    # Aggregate by the bin number.
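    # A sketch of what the aggregation step could look like (assumed continuation,
    # not the original script's code):
    # grouped = with_bins.groupBy('bin').agg(
    #     functions.avg('x').alias('avg_x'),
    #     functions.avg('y').alias('avg_y'),
    # )
    # grouped.write.csv(out_directory, mode='overwrite')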
Example no. 5
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session

    df_functional_alltypes = s.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'), )], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {
            'a': [[2, 4], [3, 5]]
        })],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame([({
        (1, 3): [[2, 4], [3, 5]]
    }, )], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.nan] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b':
            np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key':
            list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example no. 6
def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
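
# Illustrative mappings, read off the branches above (a sketch, not exhaustive):
# as_spark_type(int)              -> types.LongType()
# as_spark_type(np.float32)       -> types.FloatType()
# as_spark_type("float64")        -> types.DoubleType()
# as_spark_type(typing.List[int]) -> types.ArrayType(types.LongType())
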
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types
spark = SparkSession.builder.appName('weather prediction').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert spark.version >= '2.3' # make sure we have Spark 2.3+

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.DoubleType()),
])

def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=tmax_schema)
    data.registerTempTable('yesterday')
    #wthr_query = """SELECT  dayofyear(date) as dayofyr, latitude, longitude, elevation,tmax  FROM __THIS__"""
    wthr_query = """SELECT dayofyear(today.date) as dayofyr,today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax as yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
Example no. 8
def getPreprocessingDataframe(df, nlp, version='dev'):
    if version == 'dev':
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())
        df2 = df2.limit(500)
    else:
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())

    # Convert the timestamp to the right format
    timeStampPreCleaning = udf(
        lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""),
        StringType())
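    # e.g. "Mar 03" becomes "Mar 03 2020" and "Mar 03, 2019" becomes "Mar 03 2019",
    # so the value matches the "MMM dd yyyy" pattern parsed further below.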
    df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

    # String to DateType
    df3 = df2.withColumn("TimeStampDateType",
                         F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

    # Drop rows whose timestamp column is null or not in the standard format.
    df3 = df3.filter(df3.Text.isNotNull())
    df3 = df3.filter(df3.TimeStampDateType.isNotNull())
    df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
    df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
    df3 = df3.withColumn("Qurter", F.quarter(df3.TimeStampDateType))
    # Fill nulls with "0" and convert the count strings to integers.
    cols = ["Comments", "Likes", "Retweets"]

    df3 = df3.fillna("0", subset=cols)

    # apply the transform_number udf
    transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
    df3 = df3.withColumn("Comments", transformNumber("Comments"))
    df3 = df3.withColumn("Likes", transformNumber("Likes"))
    df3 = df3.withColumn("Retweets", transformNumber("Retweets"))
    ### check
    logNormal = udf(lambda x: int(round(np.log2(x + 1))) + 1, T.IntegerType())
    df3 = df3.withColumn("Likes_log", logNormal("Likes"))
    df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
    # df3 = df3.filter(df3.Likes_log.isNotNull())
    df3 = df3.filter(df3.Retweets_log.isNotNull())

    extractKeywordFromQueries = udf(lambda x: extractkeyword(x))
    df3 = df3.filter(df3.Page_URL.isNotNull())
    df3 = df3.withColumn("Keyword", extractKeywordFromQueries("Page_URL"))
    df3 = df3.filter(df3.Keyword.isNotNull())

    keywordToCategory2 = udf(lambda x: getCategory2(x), StringType())
    df3 = df3.withColumn("Category2", keywordToCategory2("Keyword"))

    # NER Model
    # could be empty list,
    nerExtraction = udf(lambda z: ner_extraction(z, nlp),
                        T.ArrayType(StringType()))

    df3 = df3.withColumn("All_phrases", nerExtraction("Text"))
    df3 = df3.filter(df3.All_phrases.isNotNull())

    checkEmpty = udf(lambda x: checkempty(x), T.IntegerType())

    df3 = df3.withColumn('CheckEmpty', checkEmpty('All_phrases'))
    df3 = df3.filter(df3.CheckEmpty.isNotNull())

    df3 = df3.filter(df3.CheckEmpty != int(1))

    sentiment = VaderSentiment()
    vader_sentiment = udf(sentiment.score, T.FloatType())
    df3 = df3.withColumn("Sentiment", vader_sentiment('Text'))

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1),
                                     T.ArrayType(StringType()))

    df3 = df3.withColumn(
        "Weighted_phrases",
        weighted_phrases_calculate("Retweets_log", "All_phrases"))

    # cols = ['Sentiment','All_phrases','Retweets_log','Weighted_phrases','Year','Month','Keyword']
    cols = ["Weighted_phrases", "Year", "Month", "Keyword", "Category2"]

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1),
                                     T.FloatType())

    # get the weighted sentiments for each tweets.
    df3 = df3.withColumn("Weighted_Sentiment",
                         weighted_phrases_calculate("Likes_log", "Sentiment"))
    return df3
Example no. 9
def cast_columns(df, cols):
    for col in cols:
        df = df.withColumn(col, F.coalesce(df[col].cast(T.FloatType()), F.lit(0.0)))
    return df
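
# Usage sketch (hypothetical DataFrame and column names): each listed column is cast
# to float, and values that do not survive the cast are replaced with 0.0.
# df = cast_columns(df, ["price", "quantity"])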
Example no. 10
from pyspark.sql import SparkSession, types

movie_schema = types.StructType([
    types.StructField('imdb_id', types.StringType(), True),
    types.StructField('title', types.StringType(), True),
    types.StructField('year', types.StringType(), True),
    types.StructField('genre', types.StringType(), True),
    types.StructField('country', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('imdb_score', types.FloatType(), True),
    types.StructField('meta_score', types.IntegerType(), True),
    types.StructField('votes', types.IntegerType(), True),
    types.StructField('director', types.StringType(), True),
    types.StructField('stars', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
    types.StructField('image', types.StringType(), True),
    types.StructField('runtimemins', types.IntegerType(), True)
    ])

tvshow_schema = types.StructType([
    types.StructField('imdb_id', types.StringType(), True),
    types.StructField('title', types.StringType(), True),
    types.StructField('genre', types.StringType(), True),
    types.StructField('country', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('imdb_score', types.FloatType(), True),
    types.StructField('meta_score', types.IntegerType(), True),
    types.StructField('votes', types.IntegerType(), True),
    types.StructField('director', types.StringType(), True),
    types.StructField('stars', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
Example no. 11
udf_morphy = functions.udf(py_morphy,
                           returnType=types.ArrayType(types.StringType()))


def sentiment_score(text):
    list_text = text.split('.')
    s = SentimentIntensityAnalyzer()
    list_scores = []
    for sentence in list_text:
        list_scores.append(s.polarity_scores(sentence)['compound'])
    return list_scores


udf_sentiment_score = functions.udf(sentiment_score,
                                    returnType=types.ArrayType(
                                        types.FloatType()))
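
# sentiment_score splits a text on '.' and returns one VADER compound score
# (a float in [-1, 1]) per sentence; udf_sentiment_score exposes this to Spark
# as an array<float> column.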


def main(topic):
    # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic)\
        .option('failOnDataLoss', 'false')\
        .option('auto.offset.reset', 'earliest')\
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words',
                                                    ',')).select('text')
Example no. 12
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],
                    raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__,
            list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0],
            raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType(
        ) if prefer_timestamp_ntz else types.TimestampType()

    # DayTimeIntervalType
    elif tpe in (datetime.timedelta, np.timedelta64, "timedelta64[ns]"):
        return types.DayTimeIntervalType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
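
# Illustrative calls for this newer signature (a sketch):
# as_spark_type(datetime.datetime)                            -> types.TimestampType()
# as_spark_type(datetime.datetime, prefer_timestamp_ntz=True) -> types.TimestampNTZType()
# as_spark_type(datetime.timedelta)                           -> types.DayTimeIntervalType()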
Example no. 13
import sys
from pyspark.sql import SparkSession, functions as f, types
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.appName('amenity data cleaning').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+
''' RUN: spark-submit code/amenities_data_cleaning.py data/amenities-vancouver.json.gz cleaned-data-amenities '''

schema = types.StructType([
    types.StructField('amenity', types.StringType()),
    types.StructField('lat', types.FloatType()),
    types.StructField('lon', types.FloatType()),
    types.StructField('name', types.StringType()),
    types.StructField('tags', types.StringType()),
    types.StructField('timestamp', types.TimestampType()),
])


def main(inp, outp):
    data = spark.read.json(inp, schema=schema)

    #data.select('amenity').distinct().sort('amenity').show()

    # Interesting amenities
    amnt = [
        'Observation Platform', 'arts_centre', 'atm', 'atm;bank', 'bank',
        'bar', 'bbq', 'bicycle_rental', 'biergarten', 'bistro', 'boat_rental',
        'bureau_de_change', 'bus_station', 'cafe', 'car_rental', 'car_sharing',
    types.StructField('UOM_ID', types.StringType(), True),
    types.StructField('SCALAR_FACTOR', types.StringType(), True),
    types.StructField('SCALAR_ID', types.StringType(), True),
    types.StructField('VECTOR', types.StringType(), True),
    types.StructField('COORDINATE', types.StringType(), True),
    types.StructField('VALUE', types.StringType(), True),
    types.StructField('STATUS', types.StringType(), True),
    types.StructField('SYMBOL', types.StringType(), True),
    types.StructField('TERMINATE', types.StringType(), True),
    types.StructField('DECIMALS', types.StringType(), True),
])

labource_charact_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('Employment', types.FloatType(), True),
    types.StructField('Employment_rate', types.FloatType(), True),
    types.StructField('Full_time_employment', types.FloatType(), True),
    types.StructField('Labour_force', types.FloatType(), True),
    types.StructField('Part_time_employment', types.FloatType(), True),
    types.StructField('Participation_rate', types.FloatType(), True),
    types.StructField('Population', types.FloatType(), True),
    types.StructField('Unemployment', types.FloatType(), True),
    types.StructField('Unemployment_rate', types.FloatType(), True),
])


# dtype={"REF_DATE": str, "GEO": str, "DGUID":str , "Labour force characteristics":str, "Sex":str, "Age group":str, \
#"Statistics":str, "Data type":str, "UOM":str, "UOM_ID":int, "SCALAR_FACTOR":str, "SCALAR_ID":int, "VECTOR":str, "COORDINATE":str, "VALUE":str, "STATUS":str, \
#"SYMBOL":str, "TERMINATE":str, "DECIMALS":int}
def download_extract_zip(url):
Example no. 15
def main(base_path):

    spark = (SparkSession.builder.config(
        "spark.default.parallelism",
        1).config("spark.jars.packages",
                  "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0").appName(
                      APP_NAME).getOrCreate())

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer

    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler

    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import (
        RandomForestClassifier,
        RandomForestClassificationModel,
    )

    random_forest_model_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".
        format(base_path))
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #

    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }

    #
    # Process Prediction Requests from Kafka
    #
    message_df = (spark.readStream.format("kafka").option(
        "kafka.bootstrap.servers",
        BROKERS).option("subscribe",
                        PREDICTION_TOPIC).option("startingOffsets",
                                                 "earliest").load())

    # Schema of the incoming prediction request JSON messages
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"),
                    schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        "Route",
        F.concat(prediction_requests_df.Origin, F.lit("-"),
                 prediction_requests_df.Dest),
    )

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = (predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability"))

    # Store the results to MongoDB
    class MongoWriter:
        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient("mongo")
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(
                as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            self.mongo_client.close()

            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
                pred_json = {
                    "pred": pred.tolist(),
                    "target": target.tolist(),
                    "dist_error": dist_error,
                    'timeAtServer': timeAtServer,
                    'aircraft': aircraft
                }

                producer.send(topic=kafka_config['topics'][1], value=pred_json)

if isinstance(model_params, list):

    schema_fields = types.StructType([
        types.StructField(
            'timeAtServer',
            types.StructType([types.StructField("0", types.FloatType())])),
        types.StructField(
            'aircraft',
            types.StructType([types.StructField('0', types.IntegerType())]))
    ])

    for field in norm_params['input_features'] + norm_params['target']:
        if 'latitude' in field or 'longitude' in field or 'height_' in field:
            schema_fields.add(
                types.StructField(
                    field,
                    types.StructType(
                        [types.StructField('0', types.DoubleType())])))
        elif 'Altitude' in field or 'diff_' in field:
            schema_fields.add(
                types.StructField(
Example no. 17
import pyspark.sql.types as T
import pyspark.sql.functions as F
from operator import itemgetter as ig
import requests
import json
from alphareader import AlphaReader
import pyarrow as pa

mapper = {
    "long": T.LongType(),
    "string": T.StringType(),
    "int": T.IntegerType(),
    "boolean": T.BooleanType(),
    "double": T.DoubleType(),
    "float": T.FloatType(),
    "timestamp-millis": T.TimestampType()
}
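
# For example, an Avro primitive type name such as "int" resolves through this
# mapper to T.IntegerType(), which get_field below wraps into a T.StructField.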

def get_registry(url, entity='user_en', version='latest'):
    '''http://server:port/api/v1/schemaregistry/schemas/{entity}/versions/{version}'''
    return json.loads(
        requests
        .get(url.format(entity, version))
        .json().get("schemaText")
    )

def get_field(name, data_type, nullable):
    if isinstance(data_type, str):
        return T.StructField(name, ig(data_type)(mapper), bool(nullable))
    try:
        return get_field(name, ig(1)(data_type), bool(nullable))
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# writeLegacyFormat makes Spark's decimal type compatible with Hive's decimal type.
spark = SparkSession.builder\
.config("spark.sql.parquet.writeLegacyFormat",True)\
.enableHiveSupport().getOrCreate()

df = spark.read.csv('hdfs://hive-namenode:8020/user/sqoop/restaurant_detail/part-m-00000', header=False)
rename = {
    '_c0' : 'id',
    '_c1' : 'restaurant_name',
    '_c2' : 'category',
    '_c3' : 'estimated_cooking_time',
    '_c4' : 'latitude',
    '_c5' : 'longitude',
}
df = df.toDF(*[rename[c] for c in df.columns])
df = df.withColumn('estimated_cooking_time', F.col('estimated_cooking_time').cast(T.FloatType()))
df = df.withColumn('latitude', F.col('latitude').cast(T.DecimalType(11,8)))
df = df.withColumn('longitude', F.col('longitude').cast(T.DecimalType(11,8)))
df = df.withColumn('dt', F.lit("latest"))
df.write.parquet('hdfs://hive-namenode:8020/user/spark/transformed_restaurant_detail', partitionBy='dt', mode='overwrite')
Example no. 19
def preprocess_kmer_file(cancer_kmers, cancer_sample, drop_cols,
                         expression_fields, jct_col, index_name, libsize_c,
                         cross_junction):
    ''' Preprocess cancer samples
    - Make kmers unique
    - Filter kmers on junction status
    - Normalize
    Parameters:
    ----------
    cancer_kmers: cancer kmer matrix
    cancer_sample: associated cancer ID
    drop_cols: columns to be dropped
    expression_fields: list of segment and junction expression column names
    jct_col: junction status column name
    index_name: kmer column name
    libsize_c: libsize matrix for cancer samples
    cross_junction: information used to filter on junction status. None (both, no filtering), True (junction), False (non-junction)
    Returns
    --------
    cancer_kmers: preprocessed cancer kmer matrix
    '''
    def collapse_values(value):
        return max([
            float(i) if i != 'nan' else 0.0 for i in value.split('/')
        ])  # np.nanmax not supported
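    # e.g. collapse_values("2.0/nan/5.5") -> 5.5 (the 'nan' entry counts as 0.0)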

    # Filter on junction status
    if cross_junction == 1:
        cancer_kmers = cancer_kmers.filter("{} == True".format(jct_col))
    elif cross_junction == 0:
        cancer_kmers = cancer_kmers.filter("{} == False".format(jct_col))

    # Drop junction column
    for drop_col in drop_cols:
        cancer_kmers = cancer_kmers.drop(sf.col(drop_col))
    logging.info("Collapse kmer horizontal")

    # Remove the '/' in the expression data (kmer duplicates within a gene have the 'expression1/expression2' format)
    local_max = sf.udf(collapse_values, st.FloatType())
    for name_ in expression_fields:
        cancer_kmers = cancer_kmers.withColumn(name_, local_max(name_))

    # Make kmers unique (Take max expression)
    logging.info("Collapse kmer vertical")
    cancer_kmers = cancer_kmers.withColumn(
        jct_col,
        sf.col(jct_col).cast("boolean").cast("int"))
    exprs = [
        sf.max(sf.col(name_)).alias(name_)
        for name_ in cancer_kmers.schema.names if name_ != index_name
    ]
    cancer_kmers = cancer_kmers.groupBy(index_name).agg(*exprs)

    # Remove kmers unexpressed (both junction and segment expression null)
    cancer_kmers = cancer_kmers.withColumn(
        'allnull', sum(cancer_kmers[name_] for name_ in expression_fields))
    cancer_kmers = cancer_kmers.filter(sf.col("allnull") > 0.0)
    cancer_kmers = cancer_kmers.drop("allnull")

    # Normalize by library size
    if libsize_c is not None:
        for name_ in expression_fields:
            cancer_kmers = cancer_kmers.withColumn(
                name_,
                sf.round(
                    cancer_kmers[name_] /
                    libsize_c.loc[cancer_sample, "libsize_75percent"], 2))
    else:
        for name_ in expression_fields:
            cancer_kmers = cancer_kmers.withColumn(
                name_, sf.round(cancer_kmers[name_], 2))

    return cancer_kmers