def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    average_func = {'score': 'avg'}
    comments_average = comments.groupby(comments['subreddit']).agg(average_func)
    averages = comments_average.sort(comments['subreddit'], ascending=True)
    averages.write.csv(output, mode='overwrite')
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    averages = comments.groupby('subreddit').agg(functions.avg(comments['score']))
    averages.show()
    averages.write.csv(output, mode='overwrite')
    averages.explain()
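# Hedged sketch (not part of the original snippets): the main(inputs, output) functions in this
# collection assume module-level names such as `spark`, `types`, and `functions`, plus a driver
# that parses command-line arguments. A typical wrapper might look like the following; the app
# name and argument handling are assumptions for illustration.
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()

if __name__ == '__main__':
    inputs = sys.argv[1]
    output = sys.argv[2]
    main(inputs, output)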
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
        # types.StructField('year', types.IntegerType()),
        # types.StructField('month', types.IntegerType()),
    ])
    df = spark.read.json(inputs, schema=comments_schema)
    averages = df.groupBy(df['subreddit']).agg(
        functions.avg(df['score']).alias('average_score'))
    averages.write.csv(output, mode='overwrite')
    averages.explain()
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
    ])
    # read input file into a dataframe
    comments = spark.read.json(inputs, schema=comments_schema)
    # calculate average score for each subreddit
    subreddit_averages = comments.groupby('subreddit').avg('score')
    subreddit_averages.write.csv(output, mode='overwrite')
def getRedditDataFrameSchema(self):
    return tp.StructType([
        tp.StructField('show_title', tp.StringType(), True),
        tp.StructField('show_director', tp.StringType(), True),
        tp.StructField('submission_id', tp.StringType(), True),
        tp.StructField('source', tp.StringType(), True),
        tp.StructField('title', tp.StringType(), True),
        tp.StructField('description', tp.StringType(), True),
        tp.StructField('created_utc', tp.TimestampType(), True),
        tp.StructField('author', tp.StringType(), True),
        tp.StructField('score', tp.IntegerType(), True),
        tp.StructField('spoiler', tp.BooleanType(), True),
        tp.StructField('is_original_content', tp.BooleanType(), True),
        tp.StructField('distinguished', tp.StringType(), True),
        tp.StructField('link', tp.StringType(), True),
        tp.StructField('comments', tp.ArrayType(tp.StructType([
            tp.StructField('comment_id', tp.StringType(), True),
            tp.StructField('body', tp.StringType(), True),
            tp.StructField('created_utc', tp.TimestampType(), True),
            tp.StructField('score', tp.IntegerType(), True),
            tp.StructField('parent_id', tp.StringType(), True),
            tp.StructField('submission_id', tp.StringType(), True),
        ])), True),
    ])
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType()),
        types.StructField('author', types.StringType()),
        types.StructField('author_flair_css_class', types.StringType()),
        types.StructField('author_flair_text', types.StringType()),
        types.StructField('body', types.StringType()),
        types.StructField('controversiality', types.LongType()),
        types.StructField('created_utc', types.StringType()),
        types.StructField('distinguished', types.StringType()),
        types.StructField('downs', types.LongType()),
        types.StructField('edited', types.StringType()),
        types.StructField('gilded', types.LongType()),
        types.StructField('id', types.StringType()),
        types.StructField('link_id', types.StringType()),
        types.StructField('name', types.StringType()),
        types.StructField('parent_id', types.StringType()),
        types.StructField('retrieved_on', types.LongType()),
        types.StructField('score', types.LongType()),
        types.StructField('score_hidden', types.BooleanType()),
        types.StructField('subreddit', types.StringType()),
        types.StructField('subreddit_id', types.StringType()),
        types.StructField('ups', types.LongType()),
    ])
    # read the data from json
    commentsData = spark.read.json(inputs, schema=comments_schema)
    # group by subreddit and calculate the average score
    averages = commentsData.groupBy('subreddit').agg(functions.avg(commentsData['score']))
    # write the output as csv
    averages.write.csv(output, mode='overwrite')
    averages.explain()
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    find_avg = comments.groupBy(comments.subreddit.alias("Subreddit")).agg(
        avg(comments.score).alias("Average"))
    averages = find_avg.orderBy(asc("Subreddit")).coalesce(1)
    averages.write.csv(output, mode='overwrite')
def df_regex_make(wikiqtsv):
    # make wikiq tsv into a dataframe
    tsv2df = reader.csv(wikiqtsv,
                        sep="\t",
                        inferSchema=False,
                        header=True,
                        mode="PERMISSIVE",
                        quote="")
    #tsv2df = tsv2df.repartition(args.num_partitions)

    # basic structure
    struct = types.StructType().add("anon", types.StringType(), True)
    struct = struct.add("articleid", types.LongType(), True)
    struct = struct.add("date_time", types.TimestampType(), True)
    struct = struct.add("deleted", types.BooleanType(), True)
    struct = struct.add("editor", types.StringType(), True)
    struct = struct.add("editor_id", types.LongType(), True)
    struct = struct.add("minor", types.BooleanType(), True)
    struct = struct.add("namespace", types.LongType(), True)
    struct = struct.add("revert", types.BooleanType(), True)
    struct = struct.add("reverteds", types.StringType(), True)
    struct = struct.add("revid", types.LongType(), True)
    struct = struct.add("sha1", types.StringType(), True)
    struct = struct.add("text_chars", types.LongType(), True)
    struct = struct.add("title", types.StringType(), True)

    # structure the df to get the df with columns of metadata and regexes
    regex_one_df = df_structurize(tsv2df, struct)
    return regex_one_df
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    comments_avg = comments.groupBy('subreddit').avg('score')
    comments_avg.explain()
    comments_avg.write.json(output, mode='overwrite')
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    #inp = '/courses/732/reddit-1/'  # or other path on your computer
    # comments = spark.read.json(inputs)
    comments = spark.read.json(inputs, schema=comments_schema)
    averages = comments.groupby('subreddit').agg(functions.avg(comments['score']))
    averages.explain()
    averages.show()
    averages.write.csv(output, mode='overwrite')
def test_datatype(self):
    first = T.StructType([
        T.StructField('f1', T.BooleanType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f3', T.IntegerType()),
        T.StructField('f4', T.LongType()),
    ])
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.LongType()),
        T.StructField('f1', T.BooleanType()),
    ])
    SparklyTest().assertRowsEqual(first, second, ignore_order=True)
    with self.assertRaises(AssertionError):
        self.assertEqual(first, second)

    # change the (f4, T.LongType) entry
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.StringType()),
        T.StructField('f1', T.BooleanType()),
    ])
    with self.assertRaises(AssertionError):
        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
def schema_extra_missing_non_nullable_field() -> T.StructType:
    """Return a sample spark schema with an extra non-nullable field defined."""
    return (
        T.StructType([
            T.StructField("name", T.StringType(), True),
            T.StructField("empid", T.IntegerType(), True),
            T.StructField("happy", T.BooleanType(), True),
            T.StructField("extra", T.BooleanType(), False)])
    )
def _metadata_schema() -> t.StructType:
    return t.StructType([
        t.StructField('tenant_col', t.StringType(), False),
        t.StructField('user_col', t.StringType(), False),
        t.StructField('user_vec_col', t.StringType(), False),
        t.StructField('res_col', t.StringType(), False),
        t.StructField('res_vec_col', t.StringType(), False),
        t.StructField('output_col', t.StringType(), False),
        t.StructField('has_history_access_df', t.BooleanType(), False),
        t.StructField('has_user2component_mappings_df', t.BooleanType(), False),
        t.StructField('has_res2component_mappings_df', t.BooleanType(), False),
        t.StructField('has_user_feature_vector_mapping_df', t.BooleanType(), False),
        t.StructField('has_res_feature_vector_mapping_df', t.BooleanType(), False),
    ])
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):  # the empty-tuple test here could never match
        return types.ArrayType(types.StringType())
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order
    to avoid instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in a variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name, {
            np.int8: T.ByteType(),
            np.uint8: T.ShortType(),
            np.int16: T.ShortType(),
            np.uint16: T.IntegerType(),
            np.int32: T.IntegerType(),
            np.int64: T.LongType(),
            np.float32: T.FloatType(),
            np.float64: T.DoubleType(),
            np.string_: T.StringType(),
            np.str_: T.StringType(),
            np.unicode_: T.StringType(),
            np.bool_: T.BooleanType(),
        })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
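# Hedged usage sketch for _numpy_to_spark_mapping (the lookups below are illustrative, not from
# the original module): the cached dict is keyed by numpy scalar types, so a dtype is reduced to
# its scalar type before the lookup.
import numpy as np

mapping = _numpy_to_spark_mapping()
int_spark_type = mapping[np.dtype('int32').type]      # -> IntegerType()
float_spark_type = mapping[np.dtype('float64').type]  # -> DoubleType()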
def step_03_join(self):
    # TODO:
    # - Join all results of step_02 based on the group-by attributes.
    # - Rename each metric to "datasource: metric_name".
    # - For each combination of datasources, calculate a data-difference column.
    # - Calculate a test_result column if every related metric matches (only when 2 input sources are provided).
    group_by = self.config["group_by"]

    # Rename every metric with the prefix "source_metricname"
    for source, agg in self.agg.items():
        metric_cols = list(filter(lambda x: x not in group_by, agg.columns))
        self.agg[source] = reduce(
            lambda df, metric: df.withColumnRenamed(metric, source + "_" + metric),
            metric_cols, agg)

    # Join
    joined = reduce(lambda x, y: x.join(y, how="full", on=group_by),
                    self.agg.values())

    # Calculate differences if there are only two sources
    if len(self.agg) == 2:
        source1, source2 = tuple(self.config["data"].keys())
        source1_metrics = list(self.config["data"][source1]["metrics"].keys())
        source2_metrics = list(self.config["data"][source2]["metrics"].keys())

        # Look for the metrics present in both sources
        # (could be done in O(n); this is more readable)
        shared_metrics = sorted(set(source1_metrics) & set(source2_metrics))
        for metric in shared_metrics:
            try:
                joined = joined.withColumn(
                    "delta_" + metric,
                    F.abs(F.col(source1 + "_" + metric) - F.col(source2 + "_" + metric)))
            except:
                # Cannot calculate the difference, e.g. when the metric is a string
                pass

            # For float and double types, the acceptance threshold is 0.1 percent
            if dict(joined.dtypes)[source1 + "_" + metric] in ("float", "double") \
                    or dict(joined.dtypes)[source2 + "_" + metric] in ("float", "double"):

                def difference(number1, number2, error=1e-3):
                    return abs((number1 - number2) / number2) < error

                joined = joined.withColumn(
                    "match_" + metric,
                    F.udf(difference, T.BooleanType())(
                        F.col(source1 + "_" + metric),
                        F.col(source2 + "_" + metric)))
            else:
                joined = joined.withColumn(
                    "match_" + metric,
                    F.col(source1 + "_" + metric) == F.col(source2 + "_" + metric))

    self.joined = joined
    return joined
def get_spark_data_type(input_value):
    return {
        "str": T.StringType(),
        "int": T.LongType(),
        "bool": T.BooleanType(),
        "float": T.DoubleType(),
        "NoneType": T.NullType(),
    }[type(input_value).__name__]
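# Hedged usage sketch for get_spark_data_type: inferring a StructType from a sample record.
# The sample dict and the nullable=True flag are assumptions for illustration.
import pyspark.sql.types as T

sample = {"name": "alice", "age": 42, "active": True, "score": 0.5}
inferred_schema = T.StructType([
    T.StructField(key, get_spark_data_type(value), True)
    for key, value in sample.items()
])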
def schema_different_uncastable_data_field() -> T.StructType:
    """Return a sample spark schema with an uncastable change in a field's datatype."""
    return (
        T.StructType([
            T.StructField("name", T.IntegerType(), True),
            T.StructField("empid", T.StringType(), True),
            T.StructField("happy", T.BooleanType(), True)])
    )
def create_valid_schema() -> T.StructType:
    """Return a spark schema."""
    return (
        T.StructType([
            T.StructField("name", T.StringType(), True),
            T.StructField("empid", T.IntegerType(), True),
            T.StructField("happy", T.BooleanType(), True)])
    )
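# Hedged sketch of how fixtures like create_valid_schema and the variants above might be used in
# a schema-comparison test; the test name, DataFrame contents, and equality checks are assumptions.
def test_schema_mismatch_detected(spark):
    valid = create_valid_schema()
    df = spark.createDataFrame([("bob", 1, True)], schema=valid)
    assert df.schema == valid
    assert df.schema != schema_different_uncastable_data_field()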
def field_to_spark_field(name, python_type) -> spark_types.StructField:
    spark_type = {
        int: spark_types.IntegerType(),
        float: spark_types.DoubleType(),
        str: spark_types.StringType(),
        datetime: spark_types.TimestampType(),
        bool: spark_types.BooleanType(),
    }[python_type]
    return spark_types.StructField(name, spark_type)
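# Hedged usage sketch for field_to_spark_field: building a StructType from (name, python_type)
# pairs. The field list below is an assumption for illustration.
from datetime import datetime
import pyspark.sql.types as spark_types

fields = [("user_id", int), ("amount", float), ("created", datetime), ("active", bool)]
schema = spark_types.StructType([field_to_spark_field(name, tpe) for name, tpe in fields])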
def spark_data_flow():
    get_phone_label_udf = fun.udf(lambda x: 'Entity;Contact;Phone', tp.StringType())
    get_email_label_udf = fun.udf(lambda x: 'Entity;Contact;Email', tp.StringType())
    get_phone_type_udf = fun.udf(lambda x: 'PHONE', tp.StringType())
    get_email_type_udf = fun.udf(lambda x: 'EMAIL', tp.StringType())
    filter_comma_udf = fun.udf(filter_comma, tp.BooleanType())

    raw_nb_df = spark.sql("""
        SELECT bbd_qyxx_id, phone, email, year
        FROM dw.qyxx_annual_report_jbxx
        WHERE dt='{version}'
        """.format(version=XGXX_RELATION))

    tid_nb_df = raw_nb_df.where(
        "bbd_qyxx_id != 'null'").where(
        "phone != 'null'").where(
        "email != 'null'").where(
        raw_nb_df.bbd_qyxx_id.isNotNull()).where(
        raw_nb_df.phone.isNotNull()).where(
        raw_nb_df.email.isNotNull()).where(
        filter_comma_udf('bbd_qyxx_id')).where(
        filter_comma_udf('phone')).where(
        filter_comma_udf('email')).cache()

    prd_phone_node_df = tid_nb_df.select(
        tid_nb_df.phone.alias('bbd_contact_id:ID'),
        fun.unix_timestamp().alias('create_time:long'),
        fun.unix_timestamp().alias('update_time:long'),
        get_phone_label_udf('phone').alias(':LABEL')).distinct()

    prd_phone_edge_df = tid_nb_df.select(
        tid_nb_df.bbd_qyxx_id.alias(':START_ID'),
        tid_nb_df.phone.alias(':END_ID'),
        tid_nb_df.year.alias('year'),
        fun.unix_timestamp().alias('create_time:long'),
        get_phone_type_udf('phone').alias(':TYPE')).distinct()

    prd_email_node_df = tid_nb_df.select(
        tid_nb_df.email.alias('bbd_contact_id:ID'),
        fun.unix_timestamp().alias('create_time:long'),
        fun.unix_timestamp().alias('update_time:long'),
        get_email_label_udf('email').alias(':LABEL')).distinct()

    prd_email_edge_df = tid_nb_df.select(
        tid_nb_df.bbd_qyxx_id.alias(':START_ID'),
        tid_nb_df.email.alias(':END_ID'),
        tid_nb_df.year.alias('year'),
        fun.unix_timestamp().alias('create_time:long'),
        get_email_type_udf('email').alias(':TYPE')).distinct()

    return (prd_phone_node_df, prd_phone_edge_df,
            prd_email_node_df, prd_email_edge_df)
def find_type(x):
    if x.dtype in ['object', 'str']:
        return T.StringType()
    elif x.dtype == 'int':
        return T.IntegerType()
    elif x.dtype == 'float':
        return T.FloatType()
    elif x.dtype == 'bool':
        return T.BooleanType()
    raise TypeError('%s type is unknown' % (x.dtype))
def build_training_set(inputs: DataFrame) -> DataFrame:
    udf_country = fn.udf(Udfs.country, st.StringType())
    udf_currency = fn.udf(Udfs.currency, st.StringType())
    udf_is_valid_label = fn.udf(Udfs.is_valid_label, st.BooleanType())
    udf_filter = fn.udf(Udfs.filter_hours_days_goal, st.BooleanType())

    replace_values = {
        'days_campaign': -1,
        'hours_prepa': -1,
        'goal': -1,
        'country_clean': 'unknown',
        'currency_clean': 'unknown'
    }

    result = inputs.withColumn('goal', fn.col('goal').cast(st.DoubleType())) \
        .withColumn('deadline', fn.col('deadline').cast(st.IntegerType())) \
        .withColumn('state_changed_at', fn.col('state_changed_at').cast(st.IntegerType())) \
        .withColumn('created_at', fn.col('created_at').cast(st.IntegerType())) \
        .withColumn('launched_at', fn.col('launched_at').cast(st.IntegerType())) \
        .drop('disable_communication') \
        .drop('state_changed_at', 'backers_count') \
        .withColumn('country_clean', udf_country(fn.col('country'), fn.col('currency'))) \
        .withColumn('currency_clean', udf_currency(fn.col('currency'))) \
        .drop('country', 'currency') \
        .filter(udf_is_valid_label(fn.col('final_status'))) \
        .withColumn("deadline_clean", fn.to_date(fn.from_unixtime(fn.col('deadline')))) \
        .withColumn("created_at_clean", fn.to_date(fn.from_unixtime(fn.col('created_at')))) \
        .withColumn("launched_at_clean", fn.to_date(fn.from_unixtime(fn.col('launched_at')))) \
        .withColumn("days_campaign", fn.datediff(fn.col('deadline_clean'), fn.col('launched_at_clean'))) \
        .withColumn("hours_prepa", fn.round((fn.col('launched_at') - fn.col('created_at')) / 3600, 2)) \
        .filter(udf_filter(fn.col('hours_prepa'), fn.col('days_campaign'), fn.col('goal'))) \
        .drop('created_at', 'launched_at', 'deadline') \
        .withColumn("name", fn.lower(fn.col('name'))) \
        .withColumn("desc", fn.lower(fn.col('desc'))) \
        .withColumn("keywords", fn.lower(fn.col('keywords'))) \
        .withColumn("text", fn.concat_ws(" ", fn.col('name'), fn.col('desc'), fn.col('keywords'))) \
        .drop("name", "desc", "keywords") \
        .na.fill(replace_values)

    return result
def labeled_msg_schema():
    schema = types.StructType([
        types.StructField('optional_field', types.BooleanType()),
        types.StructField('required_field', types.DoubleType(), nullable=False),
        types.StructField('repeated_field',
                          types.ArrayType(types.IntegerType(), containsNull=False)),
        types.StructField('default_field', types.StringType()),
    ])
    return schema
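# Hedged usage sketch for labeled_msg_schema (the rows and the module-level `spark` session are
# assumptions): the schema mirrors a message with optional, required, repeated, and default fields.
rows = [(True, 1.5, [1, 2, 3], "x"), (None, 2.0, [], None)]
labeled_df = spark.createDataFrame(rows, schema=labeled_msg_schema())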
def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = "[a-zA-Z][a-zA-Z\s\-]*"
    df = df.withColumn(
        "country",
        F.lower(F.trim(F.regexp_extract("country", re_country, 0))),
    )

    # Clean column campus
    re_campus = "([a-zA-Z]+[_\ \-]?)+"
    df = df.withColumn(
        "campus",
        F.lower(F.trim(F.regexp_extract("campus", re_campus, 0))))

    # Clean column mobility
    re_mobility = "([a-zA-Z0-9]+[\ \-]?)+"
    df = df.withColumn(
        "mobility",
        F.lower(F.trim(F.regexp_extract("mobility", re_mobility, 0))))

    # Clean column contracts
    df = df.withColumn(
        "contracts",
        null_negative_int(df["contracts"].cast(T.IntegerType())))

    # Clean column alternative_choice
    re_alternative_choice = "([a-zA-Z]+[_\ \-]?)+"
    df = df.withColumn(
        "alternative_choice",
        F.lower(F.trim(F.regexp_extract("alternative_choice", re_alternative_choice, 0))),
    )

    # Clean column distance
    re_distance = "[0-9]+"
    df = df.withColumn(
        "distance",
        F.lower(F.trim(F.regexp_extract("distance", re_distance, 0))).cast(T.IntegerType()),
    )

    # Clean column pro_contract
    df = df.withColumn("pro_contract", df["pro_contract"].cast(T.BooleanType()))

    return df
def get_dtypes_spark(type):
    switcher = {
        'int32': st.IntegerType(),
        'int64': st.LongType(),
        'float32': st.FloatType(),
        'float64': st.DoubleType(),
        'date64': st.DateType(),  # TimestampType
        'str': st.StringType(),
        'boolean': st.BooleanType()
    }
    # Return the mapped Spark type, or the string "nothing" if the key is unknown
    func = switcher.get(type, "nothing")
    return func
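# Hedged usage sketch for get_dtypes_spark: mapping dtype names to a Spark schema. The
# column-to-dtype dict and the nullable=True flag are assumptions for illustration.
import pyspark.sql.types as st

column_dtypes = {"id": "int64", "price": "float64", "label": "str", "valid": "boolean"}
mapped_schema = st.StructType([
    st.StructField(name, get_dtypes_spark(dtype), True)
    for name, dtype in column_dtypes.items()
])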
def get_schema():
    return TableSchema(
        [
            t.StructField("LoanID", t.StringType(), True),
            t.StructField("Rating", t.StringType(), True),
            t.StructField("Country", t.StringType(), True),
            t.StructField("Defaulted", t.BooleanType(), False),
            t.StructField("Year", t.IntegerType(), True),
            t.StructField("Month", t.IntegerType(), True),
        ],
        primary_key="LoanID",
        partition_by=["Month"],
        tbl_properties={"Test": "test"},
    )
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
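# Hedged usage sketch for as_spark_type (the calls below are illustrative): both concrete types
# and typing generics resolve to Spark SQL types.
from typing import List

as_spark_type(int)        # -> LongType()
as_spark_type("double")   # -> DoubleType()
as_spark_type(List[str])  # -> ArrayType(StringType())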
def get_ppd_tables(
    spark: SparkSession,
    input_path,
    headers_path: str,
    output_path: Optional[str] = None,
) -> Dict[str, DataFrame]:
    """
    Process the Price Paid Dataset (PPD) by extracting fact and dimension tables
    and saving them to S3 in the CSV format.

    Args:
        spark: Current Spark session object.
        input_path: Path to the PPD dataset in the CSV format.
        headers_path: Path to the PPD headers TSV file.
        output_path: Path where the resulting CSV files are saved.

    Returns:
        A dictionary mapping table name to its dataframe.
    """
    df = read_ppd_table(spark, input_path, headers_path)

    property_types = {
        "D": "detached",
        "S": "semi-detached",
        "T": "terraced",
        "F": "flat",
        "O": "other",
    }
    property_types = spark.sparkContext.broadcast(property_types)

    df = (df.withColumn(
        "property_type",
        f.udf(lambda x: property_types.value[x], t.StringType())(f.col("property_type")),
    ).withColumn(
        "is_new",
        f.udf(lambda x: True if x == "Y" else False, t.BooleanType())(f.col("old_new")),
    ).withColumn(
        "duration",
        f.udf(lambda x: "freehold" if x == "F" else "leasehold", t.StringType())(f.col("duration")),
    ).withColumn("date", f.to_date(df["date_of_transfer"])))

    df = _normalise_address(df).select(
        [column for column in df.columns if column not in {"old_new"}]
        + ["is_new", "date", "property_address"])

    tables = {
        "property": write_property_table(df, output_path),
        "time": write_time_table(df, output_path),
        "sale": write_sale_table(df, output_path),
    }
    return tables