def test_struct_type(self):
    struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
    struct2 = StructType([
        StructField("f1", StringType(), True),
        StructField("f2", StringType(), True, None)
    ])
    self.assertEqual(struct1.fieldNames(), struct2.names)
    self.assertEqual(struct1, struct2)

    struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
    struct2 = StructType([StructField("f1", StringType(), True)])
    self.assertNotEqual(struct1.fieldNames(), struct2.names)
    self.assertNotEqual(struct1, struct2)

    struct1 = (StructType().add(StructField("f1", StringType(), True))
               .add(StructField("f2", StringType(), True, None)))
    struct2 = StructType([
        StructField("f1", StringType(), True),
        StructField("f2", StringType(), True, None)
    ])
    self.assertEqual(struct1.fieldNames(), struct2.names)
    self.assertEqual(struct1, struct2)

    struct1 = (StructType().add(StructField("f1", StringType(), True))
               .add(StructField("f2", StringType(), True, None)))
    struct2 = StructType([StructField("f1", StringType(), True)])
    self.assertNotEqual(struct1.fieldNames(), struct2.names)
    self.assertNotEqual(struct1, struct2)

    # Catch exception raised during improper construction
    self.assertRaises(ValueError, lambda: StructType().add("name"))

    struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
    for field in struct1:
        self.assertIsInstance(field, StructField)

    struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
    self.assertEqual(len(struct1), 2)

    struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
    self.assertIs(struct1["f1"], struct1.fields[0])
    self.assertIs(struct1[0], struct1.fields[0])
    self.assertEqual(struct1[0:1], StructType(struct1.fields[0:1]))
    self.assertRaises(KeyError, lambda: struct1["f9"])
    self.assertRaises(IndexError, lambda: struct1[9])
    self.assertRaises(TypeError, lambda: struct1[9.9])
def create(self, full_table_name: str, schema: StructType, primary_key: list,
           temp_source_table: str) -> str:
    conditions = []
    updates = []
    columns_to_update = set(schema.fieldNames()) - set(primary_key)

    for primary_key_column in primary_key:
        conditions.append(
            f"source.`{primary_key_column}` = target.`{primary_key_column}`"
        )

    for col in columns_to_update:
        updates.append(f"target.`{col}` = source.`{col}`")

    query = (f"MERGE INTO {full_table_name} AS target\n"
             f"USING {temp_source_table} AS source\n"
             f"ON {' AND '.join(conditions)}\n"
             f"{{matched_clause}}"
             f"WHEN NOT MATCHED THEN INSERT *\n")

    if len(updates) > 0:
        query = query.format(
            matched_clause=f"WHEN MATCHED THEN UPDATE SET {', '.join(updates)}\n")
    else:
        query = query.format(matched_clause="")

    return query
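# A minimal usage sketch, not part of the original source: `MergeQueryBuilder` is a
# hypothetical name for the class that owns `create`, and the table/column names are
# made up for illustration. It shows the shape of the MERGE statement produced when
# one non-key column exists.
from pyspark.sql.types import StructType, StructField, StringType

merge_builder = MergeQueryBuilder()  # hypothetical class name
events_schema = StructType([
    StructField("id", StringType(), False),
    StructField("value", StringType(), True),
])
print(merge_builder.create("catalog.db.events", events_schema, ["id"], "events_staging"))
# MERGE INTO catalog.db.events AS target
# USING events_staging AS source
# ON source.`id` = target.`id`
# WHEN MATCHED THEN UPDATE SET target.`value` = source.`value`
# WHEN NOT MATCHED THEN INSERT *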
        inferSchema=True,
        samplingRatio=0.01,
        ignoreTrailingWhiteSpace=True,
    )
    for column in raw.columns:
        new_col = column.lower()
        new_col = rename[new_col] if new_col in rename else new_col
        raw = raw.withColumnRenamed(column, new_col)
except AnalysisException as e:
    print('AnalysisException', e)
    continue

# raw.printSchema()
df = raw
for field in green_schema.fields:
    if field.name in df.columns:
        df = df.withColumn(field.name, col(field.name).astype(field.dataType))
    else:
        df = df.withColumn(field.name, lit(None).astype(field.dataType))

df = df.select(green_schema.fieldNames())\
    .withColumn('year', lit(year).astype(IntegerType()))\
    .withColumn('month', lit(month).astype(IntegerType()))
# df.printSchema()
# df.show(vertical=True, n=5)

df.write.parquet(
    path=destination_path.format(dataset=dataset_name),
    mode="append",
    partitionBy=["year", "month"],
    compression="snappy"
)

job.commit()
    ]), True),
    StructField("lang", StringType(), True),
    StructField("text", StringType(), True),
    StructField("created_at", StringType(), True)
])

tweetDF = spark.read.schema(tweetSchema).json(path)

display(tweetDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.functions import col

schema = tweetSchema.fieldNames()
schema.sort()
tweetCount = tweetDF.filter(col("id").isNotNull()).count()

dbTest("ET1-P-08-04-01", 'created_at', schema[0])
dbTest("ET1-P-08-04-02", 'id', schema[1])
dbTest("ET1-P-08-04-03", 1491, tweetCount)

assert schema[0] == 'created_at' and schema[1] == 'id'
assert tweetCount == 1491
print("Tests passed!")

# COMMAND ----------

# MAGIC %md
StructField("fpsh_cnt", IntegerType(), False), # Feature 39 StructField("bpsh_cnt", IntegerType(), False), # Feature 40 StructField("furg_cnt", IntegerType(), False), # Feature 41 StructField("burg_cnt", IntegerType(), False), # Feature 42 StructField("total_fhlen", IntegerType(), False), # Feature 43 StructField("total_bhlen", IntegerType(), False), # Feature 44 StructField("dscp", IntegerType(), False), # Feature 45 StructField("label", StringType(), False) # Class Label ]) # Load CSV data data = spark.read.csv(sys.argv[1], schema=schema) # Create vector assembler to produce a feature vector for each record for use in MLlib # First 45 csv fields are features, the 46th field is the label. Remove IPs from features. assembler = VectorAssembler(inputCols=[schema.fieldNames()[1]]+schema.fieldNames()[3:-1], outputCol="features") # Assemble feature vector in new dataframe assembledData = assembler.transform(data) # Create a label and feature indexers to speed up categorical columns for decision tree labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(assembledData) featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=20).fit(assembledData) # Create a DecisionTree model trainer dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = assembledData.randomSplit([0.8, 0.2]) # Chain indexers and model training in a Pipeline
def transform(self, func):
    """
    Apply function column-by-column to the GroupBy object.

    The function passed to `transform` must take a Series as its first
    argument and return a Series. The given function is executed for
    each series in each grouped data.

    While `transform` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg` or `transform`. Koalas offers a wide range of methods that
    will be much faster than using `transform` for their specific purposes,
    so try to use them before reaching for `transform`.

    .. note:: unlike pandas, it is required for ``func`` to specify return type hint.

    .. note:: the series within ``func`` is actually a pandas series. Therefore,
        any pandas APIs within this function are allowed.

    Parameters
    ----------
    func : callable
        A callable that takes a Series as its first argument, and
        returns a Series.

    Returns
    -------
    applied : DataFrame

    See Also
    --------
    aggregate : Apply aggregate function to the GroupBy object.
    Series.apply : Apply a function to a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'A': [0, 0, 1],
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])
    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``0`` and ``1``.
    Calling `transform` in various ways, we can get different grouping results:
    Below, the function passed to `transform` takes a Series as its argument and
    returns a Series. `transform` applies the function on each series in each grouped
    data, and combines them into a new DataFrame:

    >>> def convert_to_string(x) -> ks.Series[str]:
    ...     return x.apply("a string {}".format)
    >>> g.transform(convert_to_string)  # doctest: +NORMALIZE_WHITESPACE
                B           C
    0  a string 1  a string 4
    1  a string 2  a string 6
    2  a string 3  a string 5

    >>> def plus_max(x) -> ks.Series[np.int]:
    ...     return x + x.max()
    >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
       B   C
    0  3  10
    1  4  12
    2  6  10
    """
    # TODO: the code here is similar to GroupBy.apply; needs deduplication.
    if not isinstance(func, Callable):
        raise TypeError("%s object is not callable" % type(func))
    assert callable(func), "the first argument should be a callable function."

    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    if return_sig is None:
        raise ValueError("Given function must have return type hint; however, not found.")

    return_type = _infer_return_type(func).tpe
    input_groupnames = [s.name for s in self._groupkeys]
    data_columns = self._kdf._internal.data_columns
    return_schema = StructType([
        StructField(c, return_type) for c in data_columns if c not in input_groupnames
    ])

    index_columns = self._kdf._internal.index_columns
    index_names = self._kdf._internal.index_names
    data_columns = self._kdf._internal.data_columns

    def rename_output(pdf):
        # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
        #   within each pdf properly. We might have to deduplicate it.
        import pandas as pd

        if len(index_columns) > 0:
            append = False
            for index_field in index_columns:
                drop = index_field not in data_columns
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[data_columns]

        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]

        # pandas GroupBy.transform drops grouping columns.
        pdf = pdf.drop(columns=input_groupnames)
        pdf = pdf.transform(func)
        # Remap to the original names, positionally.
        pdf = pdf.rename(columns=dict(zip(pdf.columns, return_schema.fieldNames())))
        return pdf

    grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(rename_output)

    sdf = self._kdf._sdf
    input_groupkeys = [s._scol for s in self._groupkeys]
    sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=return_schema.fieldNames(),
                              index_map=[])  # index is lost.
    return DataFrame(internal)
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", brokers) \
    .option("subscribe", "direction_amap") \
    .option("startingOffsets", "earliest") \
    .load()

# Extract the value and the timestamp
df = df.selectExpr("CAST(value AS STRING)", "timestamp")

# Parse the value as JSON and expand the JSON fields into columns
etl_df = df.select(
    from_json("value", direction_schema).alias("parsed_value"),
    "timestamp")
for name in direction_schema.fieldNames():
    etl_df = etl_df.withColumn(name, col("parsed_value").getField(name))
etl_df = etl_df.drop("parsed_value", "url", "project", "spider", "server", "strategy")
etl_df = etl_df.withColumnRenamed("duration", "total_duration")

etl_df = etl_df.withColumn("selected_path_steps", explode("selected_path_steps"))
for name in selected_path_schema.fieldNames():
    etl_df = etl_df.withColumn(name, col("selected_path_steps").getField(name))
etl_df = etl_df.drop("selected_path_steps")

etl_df = etl_df.select("timestamp", "preset_route", "total_duration", "tmcs")
etl_df = etl_df.withColumn("tmcs", explode("tmcs"))
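# A possible sink for the stream above, not part of the original snippet: start the
# streaming query and print the flattened rows to the console for inspection. Any
# supported sink (Kafka, Delta, Parquet, ...) could be substituted here.
query = etl_df \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()
query.awaitTermination()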
StructField("user", StructType([StructField("id", IntegerType(), True)]), True), StructField("lang", StringType(), True), StructField("text", StringType(), True), StructField("created_at", StringType(), True) ]) tweetDF = (spark.read.schema(tweetSchema).json(path)) display(tweetDF) # COMMAND ---------- # TEST - Run this cell to test your solution from pyspark.sql.functions import col schema = tweetSchema.fieldNames() schema.sort() tweetCount = tweetDF.filter(col("id").isNotNull()).count() dbTest("ET1-P-08-04-01", 'created_at', schema[0]) dbTest("ET1-P-08-04-02", 'id', schema[1]) dbTest("ET1-P-08-04-03", 1491, tweetCount) assert schema[0] == 'created_at' and schema[1] == 'id' assert tweetCount == 1491 print("Tests passed!") # COMMAND ---------- # MAGIC %md
print(source_path.format(
    dataset=dataset_name,
    year=year,
    month=month
))

schema = schemas[year] if year in schemas else fhv_schema

raw = spark.read.csv(
    path=source_path.format(
        dataset=dataset_name,
        year=year,
        month=month
    ),
    schema=schema,
    header=True,
)
# raw.printSchema()

df = raw
for field in fhv_schema.fields:
    if field.name not in df.columns:
        df = df.withColumn(field.name, lit(None).astype(field.dataType))

df = df.select(fhv_schema.fieldNames())\
    .withColumn('year', lit(year).astype(IntegerType()))\
    .withColumn('month', lit(month).astype(IntegerType()))
# df.printSchema()
# df.show(vertical=True, n=5)

df.write.parquet(
    path=destination_path.format(dataset=dataset_name),
    mode="append",
    partitionBy=["year", "month"],
    compression="snappy"
)

job.commit()