# sc = SparkContext("Api_Spark_DF", "local[*]")

import pyspark.sql.types as ty  # alias implied by the usage below


# function to save to RDBMS
def save_to_rdbms(tb_df, table, user, password, driver_format, url):
    tb_df.write \
        .format(driver_format) \
        .mode('overwrite') \
        .option('url', url) \
        .option('user', user) \
        .option('password', password) \
        .option('dbtable', table) \
        .save()


Schema = ty.StructType() \
    .add("date", ty.FloatType()) \
    .add("close", ty.FloatType())

# Schema = ty.StructType() \
#     .add("Date", ty.MapType(ty.StringType(), ty.StructType() \
#         .add("open", ty.FloatType()) \
#         .add("high", ty.FloatType()) \
#         .add("low", ty.FloatType()) \
#         .add("close", ty.FloatType()) \
#         .add("volume", ty.IntegerType())))

# data_schema = [StructField('open', FloatType(), True),
#                StructField('high', FloatType(), True),
#                StructField('low', FloatType(), True),
#                StructField('close', FloatType(), True),
#                StructField('volume', IntegerType(), True)]
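# A minimal usage sketch for save_to_rdbms. The JDBC URL, credentials and table
# name below are illustrative assumptions, not values from the original code;
# the call only shows how each parameter maps onto the DataFrameWriter options.
#
# save_to_rdbms(
#     tb_df=stock_df,                                   # any Spark DataFrame
#     table='stock_prices',
#     user='spark_user',
#     password='secret',
#     driver_format='jdbc',
#     url='jdbc:postgresql://localhost:5432/market',    # hypothetical database
# )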
def parse(path_to_dir):
    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path_to_spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    # Convert times which are in microseconds and do not fit in a long to milliseconds
    convert_micro_to_milliseconds = F.udf(lambda x: x / 1000)

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\n Start parsing TaskState\n ######")
        task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
            os.path.join(path_to_dir, 'task_usage', '*.csv'))
        # task_usage_df = spark.read.format('com.databricks.spark.csv').options(mode="FAILFAST", inferschema="true").load(
        #     'fake_task_usage.csv')

        oldColumns = task_usage_df.schema.names
        newColumns = ["ts_start",
                      "ts_end",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "cpu_rate",
                      "memory_consumption",
                      "assigned_memory_usage",
                      "unmapped_page_cache",
                      "total_page_cache",
                      "max_memory_usage",
                      "mean_disk_io_time",
                      "mean_local_disk_space_usage",
                      "max_cpu_rate",
                      "max_disk_io_time",
                      "cycles_per_instruction",
                      "memory_accesses_per_instruction",
                      "sample_portion",
                      "aggregation_type",
                      "sampled_cpu_usage",
                      ]

        task_usage_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                               range(len(oldColumns)), task_usage_df)

        # Drop columns with too low level details
        task_usage_df = task_usage_df.drop('memory_accesses_per_instruction')
        task_usage_df = task_usage_df.drop('cycles_per_instruction')
        task_usage_df = task_usage_df.drop('unmapped_page_cache')
        task_usage_df = task_usage_df.drop('total_page_cache')

        # Convert the timestamps from micro to milliseconds and cast them to long.
        task_usage_df = task_usage_df.withColumn('ts_start', convert_micro_to_milliseconds(F.col('ts_start')))
        task_usage_df = task_usage_df.withColumn('ts_start', F.col('ts_start').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('ts_end', convert_micro_to_milliseconds(F.col('ts_end')))
        task_usage_df = task_usage_df.withColumn('ts_end', F.col('ts_end').cast(T.LongType()))

        # Some fields have weird symbols in them, clean those.
        truncate_at_lt_symbol_udf = F.udf(lambda x: re.sub(r'[^0-9.eE\-+]', '', str(x)) if x is not None else x)
        task_usage_df = task_usage_df.withColumn('workflow_id', truncate_at_lt_symbol_udf(F.col('workflow_id')))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', truncate_at_lt_symbol_udf(F.col('max_cpu_rate')))

        # Now that the columns have been sanitized, cast them to the right type
        task_usage_df = task_usage_df.withColumn('workflow_id', F.col('workflow_id').cast(T.LongType()))
        task_usage_df = task_usage_df.withColumn('max_cpu_rate', F.col('max_cpu_rate').cast(T.FloatType()))

        task_usage_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()), mode="overwrite",
                                    compression="snappy")
        print("######\n Done parsing TaskState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        if 'task_usage_df' not in locals():
            task_usage_df = spark.read.parquet(os.path.join(TARGET_DIR, TaskState.output_path()))

        print("######\n Start parsing Tasks\n ######")
        task_df = spark.read.format('com.databricks.spark.csv').options(inferschema="true", mode="FAILFAST",
                                                                        parserLib="univocity").load(
            os.path.join(path_to_dir, 'task_events', '*.csv'))

        oldColumns = task_df.schema.names
        newColumns = ["ts_submit",
                      "missing_info",
                      "workflow_id",
                      "id",
                      "resource_id",
                      "event_type",
                      "user_id",
                      "scheduler",
                      "nfrs",
                      "resources_requested",
                      "memory_requested",
                      "disk_space_request",
                      "machine_restrictions",
                      ]

        task_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]),
                         range(len(oldColumns)), task_df)

        task_df = task_df.withColumn('ts_submit', convert_micro_to_milliseconds(F.col('ts_submit')))
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit').cast(T.LongType()))

        # Filter tasks that never reached completion
        task_df.createOrReplaceTempView("task_table")
        task_df = spark.sql("""WITH filtered_tasks AS (
            SELECT DISTINCT t1.workflow_id AS workflow_id, t1.id AS id
            FROM task_table t1
            WHERE t1.event_type IN(0, 1, 4)
            group by t1.workflow_id, t1.id
            having count(distinct event_type) = 3
        )
        SELECT t.*
        FROM task_table t
        INNER JOIN filtered_tasks f ON t.id = f.id AND t.workflow_id = f.workflow_id""")

        task_aggregation_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("type", T.StringType(), True),
            T.StructField("ts_submit", T.LongType(), True),
            T.StructField("submission_site", T.LongType(), True),
            T.StructField("runtime", T.LongType(), True),
            T.StructField("resource_type", T.StringType(), True),
            T.StructField("resource_amount_requested", T.DoubleType(), True),
            T.StructField("parents", T.ArrayType(T.LongType()), True),
            T.StructField("children", T.ArrayType(T.LongType()), True),
            T.StructField("user_id", T.LongType(), True),
            T.StructField("group_id", T.LongType(), True),
            T.StructField("nfrs", T.StringType(), True),
            T.StructField("wait_time", T.LongType(), True),
            T.StructField("params", T.StringType(), True),
            T.StructField("memory_requested", T.DoubleType(), True),
            T.StructField("network_io_time", T.DoubleType(), True),
            T.StructField("disk_space_requested", T.DoubleType(), True),
            T.StructField("energy_consumption", T.DoubleType(), True),
            T.StructField("resource_used", T.StringType(), True),
        ])

        # Compute based on the event type
        @F.pandas_udf(returnType=task_aggregation_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_aggregated_task_usage_metrics(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            task_submit_time = df[df['event_type'] == 0]['ts_submit'].min(skipna=True)
            task_start_time = df[df['event_type'] == 1]['ts_submit'].min(skipna=True)
            task_finish_time = df[df['event_type'] == 4]['ts_submit'].max(skipna=True)

            if None in [task_start_time, task_submit_time, task_finish_time]:
                return None

            task_resource_request = df['resources_requested'].max(skipna=True)
            task_memory_request = df['memory_requested'].max(skipna=True)
            task_priority = df['nfrs'].max(skipna=True)
            task_disk_space_requested = df['disk_space_request'].max(skipna=True)

            task_machine_id_list = df.resource_id.unique()

            task_waittime = int(task_start_time) - int(task_submit_time)
            task_runtime = int(task_finish_time) - int(task_start_time)

            def default(o):
                if isinstance(o, np.int64):
                    return int(o)

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "type": "",  # Unknown
                "ts_submit": task_submit_time,
                "submission_site": -1,  # Unknown
                "runtime": task_runtime,
                "resource_type": "core",  # Fields are called CPU, but they are core count (see Google documentation)
                "resource_amount_requested": task_resource_request,
                "parents": [],
                "children": [],
                "user_id": mmh3.hash64(get_first_non_value_in_column("user_id"))[0],
                "group_id": -1,
                "nfrs": json.dumps({"priority": task_priority}, default=default),
                "wait_time": task_waittime,
                "params": "{}",
                "memory_requested": task_memory_request,
                "network_io_time": -1,  # Unknown
                "disk_space_requested": task_disk_space_requested,
                "energy_consumption": -1,  # Unknown
                "resource_used": json.dumps(task_machine_id_list, default=default),
            }

            return pd.DataFrame(data_dict, index=[0])

        task_df = task_df.groupBy(["workflow_id", "id"]).apply(compute_aggregated_task_usage_metrics)
        task_df.explain(True)

        # Now add disk IO time - This cannot be done in the previous Pandas UDF function as
        # accessing another dataframe in the apply function is not allowed
        disk_io_structtype = T.StructType([
            T.StructField("workflow_id", T.LongType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("disk_io_time", T.DoubleType(), True),
        ])

        @F.pandas_udf(returnType=disk_io_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_disk_io_time(df):
            def get_first_non_value_in_column(column_name):
                s = df[column_name]
                idx = s.first_valid_index()
                return s.loc[idx] if idx is not None else None

            task_workflow_id = get_first_non_value_in_column("workflow_id")
            task_id = get_first_non_value_in_column("id")

            disk_io_time = ((df['ts_end'] - df['ts_start']) * df['mean_disk_io_time']).sum(skipna=True) / 1000

            data_dict = {
                "workflow_id": task_workflow_id,
                "id": task_id,
                "disk_io_time": disk_io_time
            }

            return pd.DataFrame(data_dict, index=[0])

        disk_io_df = task_usage_df.select(['workflow_id', 'id', 'mean_disk_io_time', 'ts_end', 'ts_start']).groupBy(
            ["workflow_id", "id"]).apply(compute_disk_io_time)
        disk_io_df.explain(True)

        join_condition = (task_df.workflow_id == disk_io_df.workflow_id) & (task_df.id == disk_io_df.id)
        task_df = task_df.join(disk_io_df, ["workflow_id", "id"])

        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()), mode="overwrite", compression="snappy")
        print("######\n Done parsing Tasks\n ######")
    else:
        task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\n Start parsing Resource\n ######")
        # Parse the machine information in the traces, these should match with the resource_ids in task_usage
        resources_structtype = T.StructType([  # Using StringTypes as we drop those columns
            T.StructField("time", T.StringType(), False),
            T.StructField("id", T.LongType(), False),
            T.StructField("attribute_name", T.StringType(), False),
            T.StructField("attribute_value", T.StringType(), False),
            T.StructField("attribute_deleted", T.StringType(), False),
        ])

        resource_df = spark.read.format('com.databricks.spark.csv').schema(resources_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_attributes', '*.csv'))

        resource_df = resource_df.select(["id"])  # Only keep the ID, the rest we do not need.

        # Since the information in the traces is completely opaque, we use the educated guess from Amvrosiadis et al.
        # in their ATC 2018 article.
        resource_df = resource_df.withColumn('type', F.lit("core"))
        resource_df = resource_df.withColumn('num_resources', F.lit(8))
        resource_df = resource_df.withColumn('proc_model', F.lit("AMD Opteron Barcelona"))
        resource_df = resource_df.withColumn('memory', F.lit(-1))
        resource_df = resource_df.withColumn('disk_space', F.lit(-1))
        resource_df = resource_df.withColumn('network', F.lit(-1))
        resource_df = resource_df.withColumn('os', F.lit(""))
        resource_df = resource_df.withColumn('details', F.lit("{}"))

        # Write the resource_df to the specified location
        resource_df.write.parquet(os.path.join(TARGET_DIR, Resource.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Resource\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, ResourceState.output_path())):
        print("######\n Start parsing ResourceState\n ######")
        resource_events_structtype = T.StructType([
            T.StructField("timestamp", T.DecimalType(20, 0), False),
            T.StructField("machine_id", T.LongType(), False),
            T.StructField("event_type", T.IntegerType(), False),
            T.StructField("platform_id", T.StringType(), False),
            T.StructField("available_resources", T.FloatType(), False),
            T.StructField("available_memory", T.FloatType(), False),
        ])

        resource_event_df = spark.read.format('com.databricks.spark.csv').schema(resource_events_structtype).options(
            mode="FAILFAST").load(os.path.join(path_to_dir, 'machine_events', '*.csv'))

        resource_event_df = resource_event_df.withColumn('timestamp',
                                                         convert_micro_to_milliseconds(F.col('timestamp')))
        resource_event_df = resource_event_df.withColumn('timestamp', F.col('timestamp').cast(T.LongType()))

        resource_event_df = resource_event_df.withColumn('available_disk_space', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_disk_io_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('available_network_bandwidth', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_1_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_5_minute', F.lit(-1))
        resource_event_df = resource_event_df.withColumn('average_load_15_minute', F.lit(-1))

        # Write the resource_event_df to the specified location
        resource_event_df.write.parquet(os.path.join(TARGET_DIR, ResourceState.output_path()), mode="overwrite",
                                        compression="snappy")
        print("######\n Done parsing ResourceState\n ######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\n Start parsing Workflows\n ######")
        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1  # We do not know the task dependencies, so -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = ""
            total_resources = df['resource_amount_requested'].sum()  # TODO or assigned?
            total_memory_usage = df['memory_requested'].sum()  # TODO or consumption, or assigned?
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        # Create and write the workflow dataframe
        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)
        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()), mode="overwrite",
                                  compression="snappy")
        print("######\n Done parsing Workflows\n ######")

    print("######\n Start parsing Workload\n ######")
    json_dict = Workload.get_json_dict_from_spark_task_dataframe(task_df,
                                                                 domain="Industrial",
                                                                 start_date="2011-05-01",
                                                                 end_date="2011-05-30",
                                                                 authors=["Google"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
    print("######\n Done parsing Workload\n ######")
REG = 0.1
lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG)
tfidf.show(5)

lrModel = lr.fit(tfidf.limit(5000))
res_train = lrModel.transform(tfidf)
res_train.select("id", "toxic", "probability", "prediction").show(20)
res_train.show(5)

extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
(res_train.withColumn("proba", extract_prob("probability"))
    .select("proba", "prediction")
    .show())

test_tokens = tokenizer.transform(test)
test_tf = hashingTF.transform(test_tokens)
test_tfidf = idfModel.transform(test_tf)

test_res = test.select('id')
test_res.head()

test_probs = []
for col in out_cols:
    print(col)
    lr = LogisticRegression(featuresCol="features", labelCol=col, regParam=REG)
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DF with what we need: x, (soon y,) and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        # TODO: also the y values
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
    )
    #with_bins.show(); return

    # Aggregate by the bin number.
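    # One possible continuation of the aggregation step above (a sketch, not the
    # original code): average x and y per bin, then write the result out.
    #
    # grouped = with_bins.groupBy('bin').agg(
    #     functions.avg('x').alias('avg_x'),
    #     functions.avg('y').alias('avg_y'),
    # )
    # grouped.write.csv(out_directory, mode='overwrite')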
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')

    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session

    df_functional_alltypes = s.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType([
            pt.StructField('index', pt.IntegerType(), True),
            pt.StructField('Unnamed: 0', pt.IntegerType(), True),
            pt.StructField('id', pt.IntegerType(), True),
            # cast below, Spark can't read 0/1 as bool
            pt.StructField('bool_col', pt.ByteType(), True),
            pt.StructField('tinyint_col', pt.ByteType(), True),
            pt.StructField('smallint_col', pt.ShortType(), True),
            pt.StructField('int_col', pt.IntegerType(), True),
            pt.StructField('bigint_col', pt.LongType(), True),
            pt.StructField('float_col', pt.FloatType(), True),
            pt.StructField('double_col', pt.DoubleType(), True),
            pt.StructField('date_string_col', pt.StringType(), True),
            pt.StructField('string_col', pt.StringType(), True),
            pt.StructField('timestamp_col', pt.TimestampType(), True),
            pt.StructField('year', pt.IntegerType(), True),
            pt.StructField('month', pt.IntegerType(), True),
        ]),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):  # type: ignore
        element_type = as_spark_type(tpe.__args__[0], raise_error=raise_error)  # type: ignore
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
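# A few illustrative calls to as_spark_type (assuming this module's imports:
# `types` is pyspark.sql.types, `np` is numpy, and `List` comes from typing):
#
#   as_spark_type(int)                        # -> LongType()
#   as_spark_type(np.float32)                 # -> FloatType()
#   as_spark_type("datetime64[ns]")           # -> TimestampType()
#   as_spark_type(List[int])                  # -> ArrayType(LongType())
#   as_spark_type(object, raise_error=False)  # -> None instead of raising TypeError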
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('weather prediction').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.DoubleType()),
])


def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=tmax_schema)
    data.registerTempTable('yesterday')

    #wthr_query = """SELECT dayofyear(date) as dayofyr, latitude, longitude, elevation, tmax FROM __THIS__"""
    wthr_query = """SELECT dayofyear(today.date) as dayofyr, today.latitude, today.longitude, today.elevation,
                           today.tmax, yesterday.tmax as yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""

    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
def getPreprocessingDataframe(df, nlp, version='dev'):
    if version == 'dev':
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())
        df2 = df2.limit(500)
    else:
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())

    # convert timestamp to the right format
    timeStampPreCleaning = udf(
        lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""), StringType())
    df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

    # string to DateType
    df3 = df2.withColumn("TimeStampDateType", F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

    # drop rows whose timestamp column is not in the standard format.
    df3 = df3.filter(df3.Text.isNotNull())
    df3 = df3.filter(df3.TimeStampDateType.isNotNull())
    df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
    df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
    df3 = df3.withColumn("Qurter", F.quarter(df3.TimeStampDateType))

    # fill null with 0 and convert units to the right numbers.
    cols = ["Comments", "Likes", "Retweets"]
    df3 = df3.fillna("0", subset=cols)

    # apply the transform_number udf
    transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
    df3 = df3.withColumn("Comments", transformNumber("Comments"))
    df3 = df3.withColumn("Likes", transformNumber("Likes"))
    df3 = df3.withColumn("Retweets", transformNumber("Retweets"))

    # log-normalize the engagement counts
    logNormal = udf(lambda x: int(round(np.log2(x + 1))) + 1, T.IntegerType())
    df3 = df3.withColumn("Likes_log", logNormal("Likes"))
    df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
    # df3 = df3.filter(df3.Likes_log.isNotNull())
    df3 = df3.filter(df3.Retweets_log.isNotNull())

    extractKeywordFromQueries = udf(lambda x: extractkeyword(x))
    df3 = df3.filter(df3.Page_URL.isNotNull())
    df3 = df3.withColumn("Keyword", extractKeywordFromQueries("Page_URL"))
    df3 = df3.filter(df3.Keyword.isNotNull())

    keywordToCategory2 = udf(lambda x: getCategory2(x), StringType())
    df3 = df3.withColumn("Category2", keywordToCategory2("Keyword"))

    # NER model - the extracted phrase list could be empty
    nerExtraction = udf(lambda z: ner_extraction(z, nlp), T.ArrayType(StringType()))
    df3 = df3.withColumn("All_phrases", nerExtraction("Text"))
    df3 = df3.filter(df3.All_phrases.isNotNull())

    checkEmpty = udf(lambda x: checkempty(x), T.IntegerType())
    df3 = df3.withColumn('CheckEmpty', checkEmpty('All_phrases'))
    df3 = df3.filter(df3.CheckEmpty.isNotNull())
    df3 = df3.filter(df3.CheckEmpty != int(1))

    sentiment = VaderSentiment()
    vader_sentiment = udf(sentiment.score, T.FloatType())
    df3 = df3.withColumn("Sentiment", vader_sentiment('Text'))

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1), T.ArrayType(StringType()))
    df3 = df3.withColumn(
        "Weighted_phrases",
        weighted_phrases_calculate("Retweets_log", "All_phrases"))

    # cols = ['Sentiment', 'All_phrases', 'Retweets_log', 'Weighted_phrases', 'Year', 'Month', 'Keyword']
    cols = ["Weighted_phrases", "Year", "Month", "Keyword", "Category2"]

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1), T.FloatType())
    # get the weighted sentiment for each tweet.
    df3 = df3.withColumn("Weighted_Sentiment",
                         weighted_phrases_calculate("Likes_log", "Sentiment"))

    return df3
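# A hypothetical call to the preprocessing function above. The spaCy-style `nlp`
# object, the raw tweet CSV path and its column names (Timestamp, Text, Page_URL,
# Comments, Likes, Retweets) are assumptions inferred from the columns referenced
# in the function body, not values from the original code.
#
# nlp = spacy.load('en_core_web_sm')
# raw_df = spark.read.csv('tweets.csv', header=True)
# clean_df = getPreprocessingDataframe(raw_df, nlp, version='dev')  # samples 500 rows
# clean_df.select('Keyword', 'Sentiment', 'Weighted_phrases').show(5)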
def cast_columns(df, cols):
    for col in cols:
        df = df.withColumn(col, F.coalesce(df[col].cast(T.FloatType()), F.lit(0.0)))
    return df
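# A small, self-contained usage example for cast_columns. It assumes the helper
# above is in scope and that F/T are the usual pyspark.sql aliases its body
# requires; the sample data is purely illustrative.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [('a', '1.5', None), ('b', 'oops', '2.0')],
    ['key', 'price', 'discount'],
)
# Non-numeric strings and NULLs both end up as 0.0 after the cast + coalesce.
cast_columns(sample, ['price', 'discount']).show()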
from pyspark.sql import SparkSession, types

movie_schema = types.StructType([
    types.StructField('imdb_id', types.StringType(), True),
    types.StructField('title', types.StringType(), True),
    types.StructField('year', types.StringType(), True),
    types.StructField('genre', types.StringType(), True),
    types.StructField('country', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('imdb_score', types.FloatType(), True),
    types.StructField('meta_score', types.IntegerType(), True),
    types.StructField('votes', types.IntegerType(), True),
    types.StructField('director', types.StringType(), True),
    types.StructField('stars', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
    types.StructField('image', types.StringType(), True),
    types.StructField('runtimemins', types.IntegerType(), True)
])

tvshow_schema = types.StructType([
    types.StructField('imdb_id', types.StringType(), True),
    types.StructField('title', types.StringType(), True),
    types.StructField('genre', types.StringType(), True),
    types.StructField('country', types.StringType(), True),
    types.StructField('language', types.StringType(), True),
    types.StructField('imdb_score', types.FloatType(), True),
    types.StructField('meta_score', types.IntegerType(), True),
    types.StructField('votes', types.IntegerType(), True),
    types.StructField('director', types.StringType(), True),
    types.StructField('stars', types.StringType(), True),
    types.StructField('description', types.StringType(), True),
udf_morphy = functions.udf(py_morphy, returnType=types.ArrayType(types.StringType()))


def sentiment_score(text):
    list_text = text.split('.')
    s = SentimentIntensityAnalyzer()
    list_scores = []
    for sentence in list_text:
        list_scores.append(s.polarity_scores(sentence)['compound'])
    return list_scores


udf_sentiment_score = functions.udf(sentiment_score,
                                    returnType=types.ArrayType(types.FloatType()))


def main(topic):
    # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic) \
        .option('failOnDataLoss', 'false') \
        .option('auto.offset.reset', 'earliest') \
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words', ',')).select('text')
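# A short sketch of applying udf_sentiment_score outside the streaming job, on a
# static DataFrame. The SparkSession and sample sentence are illustrative
# assumptions, not part of the original pipeline.
#
# df = spark.createDataFrame([('Great product. Terrible support.',)], ['text'])
# df.withColumn('scores', udf_sentiment_score('text')).show(truncate=False)
# # 'scores' holds one VADER compound score per sentence, as computed in sentiment_score().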
def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
        if (
            hasattr(tpe, "__origin__")
            and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
            and hasattr(tpe, "__args__")
            and len(tpe.__args__) > 1  # type: ignore[union-attr]
        ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0], raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
        tpe.__origin__, list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0], raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType, or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType() if prefer_timestamp_ntz else types.TimestampType()
    # DayTimeIntervalType
    elif tpe in (datetime.timedelta, np.timedelta64, "timedelta64[ns]"):
        return types.DayTimeIntervalType()
    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str) and tpe == "category"):
        return types.LongType()
    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str) and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str) and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str) and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str) and tpe == "Int64"):
            return types.LongType()

    if extension_object_dtypes_available:
        # BooleanType
        if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str) and tpe == "boolean"):
            return types.BooleanType()
        # StringType
        elif isinstance(tpe, StringDtype) or (isinstance(tpe, str) and tpe == "string"):
            return types.StringType()

    if extension_float_dtypes_available:
        # FractionalType
        if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str) and tpe == "Float32"):
            return types.FloatType()
        elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and tpe == "Float64"):
            return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
import sys
from pyspark.sql import SparkSession, functions as f, types
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.appName('amenity data cleaning').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

'''
RUN:
spark-submit code/amenities_data_cleaning.py data/amenities-vancouver.json.gz cleaned-data-amenities
'''

schema = types.StructType([
    types.StructField('amenity', types.StringType()),
    types.StructField('lat', types.FloatType()),
    types.StructField('lon', types.FloatType()),
    types.StructField('name', types.StringType()),
    types.StructField('tags', types.StringType()),
    types.StructField('timestamp', types.TimestampType()),
])


def main(inp, outp):
    data = spark.read.json(inp, schema=schema)
    #data.select('amenity').distinct().sort('amenity').show()

    # Interesting amenities
    amnt = [
        'Observation Platform', 'arts_centre', 'atm', 'atm;bank', 'bank',
        'bar', 'bbq', 'bicycle_rental', 'biergarten', 'bistro', 'boat_rental',
        'bureau_de_change', 'bus_station', 'cafe', 'car_rental', 'car_sharing',
    types.StructField('UOM_ID', types.StringType(), True),
    types.StructField('SCALAR_FACTOR', types.StringType(), True),
    types.StructField('SCALAR_ID', types.StringType(), True),
    types.StructField('VECTOR', types.StringType(), True),
    types.StructField('COORDINATE', types.StringType(), True),
    types.StructField('VALUE', types.StringType(), True),
    types.StructField('STATUS', types.StringType(), True),
    types.StructField('SYMBOL', types.StringType(), True),
    types.StructField('TERMINATE', types.StringType(), True),
    types.StructField('DECIMALS', types.StringType(), True),
])

labource_charact_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('Employment', types.FloatType(), True),
    types.StructField('Employment_rate', types.FloatType(), True),
    types.StructField('Full_time_employment', types.FloatType(), True),
    types.StructField('Labour_force', types.FloatType(), True),
    types.StructField('Part_time_employment', types.FloatType(), True),
    types.StructField('Participation_rate', types.FloatType(), True),
    types.StructField('Population', types.FloatType(), True),
    types.StructField('Unemployment', types.FloatType(), True),
    types.StructField('Unemployment_rate', types.FloatType(), True),
])

# dtype={"REF_DATE": str, "GEO": str, "DGUID": str, "Labour force characteristics": str, "Sex": str, "Age group": str,
#        "Statistics": str, "Data type": str, "UOM": str, "UOM_ID": int, "SCALAR_FACTOR": str, "SCALAR_ID": int,
#        "VECTOR": str, "COORDINATE": str, "VALUE": str, "STATUS": str, "SYMBOL": str, "TERMINATE": str, "DECIMALS": int}


def download_extract_zip(url):
def main(base_path):
    spark = (SparkSession.builder
             .config("spark.default.parallelism", 1)
             .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
             .appName(APP_NAME)
             .getOrCreate())

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import (
        RandomForestClassifier,
        RandomForestClassificationModel,
    )
    random_forest_model_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(base_path))
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #
    # {
    #     "Carrier": "DL",
    #     "DayOfMonth": 25,
    #     "DayOfWeek": 4,
    #     "DayOfYear": 359,
    #     "DepDelay": 10.0,
    #     "Dest": "LAX",
    #     "Distance": 2475.0,
    #     "FlightDate": "2015-12-25",
    #     "FlightNum": null,
    #     "Origin": "JFK",
    #     "Timestamp": "2019-10-31T00:19:47.633280",
    #     "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }
    #

    #
    # Process Prediction Requests from Kafka
    #
    message_df = (spark.readStream.format("kafka")
                  .option("kafka.bootstrap.servers", BROKERS)
                  .option("subscribe", PREDICTION_TOPIC)
                  .option("startingOffsets", "earliest")
                  .load())

    # Create a DataFrame out of the one-hot encoded RDD
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"), schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        "Route",
        F.concat(prediction_requests_df.Origin, F.lit("-"), prediction_requests_df.Dest),
    )

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = (predictions.drop("indices").drop("values")
                         .drop("rawPrediction").drop("probability"))

    # Store the results to MongoDB
    class MongoWriter:

        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient("mongo")
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            self.mongo_client.close()
            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))
            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
        pred_json = {
            "pred": pred.tolist(),
            "target": target.tolist(),
            "dist_error": dist_error,
            'timeAtServer': timeAtServer,
            'aircraft': aircraft
        }
        producer.send(topic=kafka_config['topics'][1], value=pred_json)

    if isinstance(model_params, list):
        schema_fields = types.StructType([
            types.StructField(
                'timeAtServer',
                types.StructType([types.StructField("0", types.FloatType())])),
            types.StructField(
                'aircraft',
                types.StructType([types.StructField('0', types.IntegerType())]))
        ])
        for field in norm_params['input_features'] + norm_params['target']:
            if 'latitude' in field or 'longitude' in field or 'height_' in field:
                schema_fields.add(
                    types.StructField(
                        field,
                        types.StructType(
                            [types.StructField('0', types.DoubleType())])))
            elif 'Altitude' in field or 'diff_' in field:
                schema_fields.add(
                    types.StructField(
import pyspark.sql.types as T
import pyspark.sql.functions as F
from operator import itemgetter as ig
import requests
import json
from alphareader import AlphaReader
import pyarrow as pa

mapper = {
    "long": T.LongType(),
    "string": T.StringType(),
    "int": T.IntegerType(),
    "boolean": T.BooleanType(),
    "double": T.DoubleType(),
    "float": T.FloatType(),
    "timestamp-millis": T.TimestampType()
}


def get_registry(url, entity='user_en', version='latest'):
    '''http://server:port/api/v1/schemaregistry/schemas/{entity}/versions/{version}'''
    return json.loads(
        requests
        .get(url.format(entity, version))
        .json().get("schemaText")
    )


def get_field(name, data_type, nullable):
    if isinstance(data_type, str):
        return T.StructField(name, ig(data_type)(mapper), bool(nullable))
    try:
        return get_field(name, ig(1)(data_type), bool(nullable))
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# writeLegacyFormat makes a Spark decimal type work with a Hive decimal type.
spark = SparkSession.builder\
    .config("spark.sql.parquet.writeLegacyFormat", True)\
    .enableHiveSupport().getOrCreate()

df = spark.read.csv('hdfs://hive-namenode:8020/user/sqoop/restaurant_detail/part-m-00000', header=False)

rename = {
    '_c0': 'id',
    '_c1': 'restaurant_name',
    '_c2': 'category',
    '_c3': 'estimated_cooking_time',
    '_c4': 'latitude',
    '_c5': 'longitude',
}

df = df.toDF(*[rename[c] for c in df.columns])
df = df.withColumn('estimated_cooking_time', F.col('estimated_cooking_time').cast(T.FloatType()))
df = df.withColumn('latitude', F.col('latitude').cast(T.DecimalType(11, 8)))
df = df.withColumn('longitude', F.col('longitude').cast(T.DecimalType(11, 8)))
df = df.withColumn('dt', F.lit("latest"))

df.write.parquet('hdfs://hive-namenode:8020/user/spark/transformed_restaurant_detail', partitionBy='dt', mode='overwrite')
def preprocess_kmer_file(cancer_kmers, cancer_sample, drop_cols, expression_fields, jct_col,
                         index_name, libsize_c, cross_junction):
    '''
    Preprocess cancer samples
    - Make kmers unique
    - Filter kmers on junction status
    - Normalize

    Parameters:
    ----------
    cancer_kmers: cancer kmer matrix
    cancer_sample: associated cancer ID
    drop_cols: columns to be dropped
    expression_fields: list of segment and junction expression column names
    jct_col: junction status column name
    index_name: kmer column name
    libsize_c: libsize matrix for cancer samples
    cross_junction: how to filter on junction status.
        None (both, no filtering), True (junction), False (non junction)

    Returns
    -------
    cancer_kmers: cancer kmers matrix
    cancer_path_tmp: path of renamed temporary file
    jct_type: string indicating which junction filtering has been performed
    '''

    def collapse_values(value):
        return max([np.float(i) if i != 'nan' else 0.0
                    for i in value.split('/')])  # np.nanmax not supported

    # Filter on junction status
    if cross_junction == 1:
        cancer_kmers = cancer_kmers.filter("{} == True".format(jct_col))
    elif cross_junction == 0:
        cancer_kmers = cancer_kmers.filter("{} == False".format(jct_col))

    # Drop junction column
    for drop_col in drop_cols:
        cancer_kmers = cancer_kmers.drop(sf.col(drop_col))

    logging.info("Collapse kmer horizontal")
    # Remove the '/' in the expression data (kmer duplicates within a gene have an 'expression1/expression2' format)
    local_max = sf.udf(collapse_values, st.FloatType())
    for name_ in expression_fields:
        cancer_kmers = cancer_kmers.withColumn(name_, local_max(name_))

    # Make kmers unique (take max expression)
    logging.info("Collapse kmer vertical")
    cancer_kmers = cancer_kmers.withColumn(jct_col, sf.col(jct_col).cast("boolean").cast("int"))
    exprs = [sf.max(sf.col(name_)).alias(name_)
             for name_ in cancer_kmers.schema.names if name_ != index_name]
    cancer_kmers = cancer_kmers.groupBy(index_name).agg(*exprs)

    # Remove unexpressed kmers (both junction and segment expression null)
    cancer_kmers = cancer_kmers.withColumn(
        'allnull', sum(cancer_kmers[name_] for name_ in expression_fields))
    cancer_kmers = cancer_kmers.filter(sf.col("allnull") > 0.0)
    cancer_kmers = cancer_kmers.drop("allnull")

    # Normalize by library size
    if libsize_c is not None:
        for name_ in expression_fields:
            cancer_kmers = cancer_kmers.withColumn(
                name_,
                sf.round(cancer_kmers[name_] / libsize_c.loc[cancer_sample, "libsize_75percent"], 2))
    else:
        for name_ in expression_fields:
            cancer_kmers = cancer_kmers.withColumn(name_, sf.round(cancer_kmers[name_], 2))

    return cancer_kmers
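# A hypothetical call to preprocess_kmer_file. Column names, the sample ID and the
# libsize table layout below are illustrative assumptions; only the
# 'libsize_75percent' column is actually required by the normalization step above.
#
# cancer_kmers = spark.read.parquet('kmers_sample_A.parquet')
# libsize_c = pd.read_csv('libsize_cancer.tsv', sep='\t', index_col=0)
# out = preprocess_kmer_file(
#     cancer_kmers,
#     cancer_sample='sample_A',
#     drop_cols=['gene_id'],                               # example column to drop
#     expression_fields=['segment_expr', 'junction_expr'],
#     jct_col='is_cross_junction',
#     index_name='kmer',
#     libsize_c=libsize_c,
#     cross_junction=1,                                    # keep junction kmers only
# )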