Example No. 1
    def test_struct_type(self):
        struct1 = StructType().add("f1", StringType(),
                                   True).add("f2", StringType(), True, None)
        struct2 = StructType([
            StructField("f1", StringType(), True),
            StructField("f2", StringType(), True, None)
        ])
        self.assertEqual(struct1.fieldNames(), struct2.names)
        self.assertEqual(struct1, struct2)

        struct1 = StructType().add("f1", StringType(),
                                   True).add("f2", StringType(), True, None)
        struct2 = StructType([StructField("f1", StringType(), True)])
        self.assertNotEqual(struct1.fieldNames(), struct2.names)
        self.assertNotEqual(struct1, struct2)

        struct1 = (StructType().add(StructField("f1", StringType(), True)).add(
            StructField("f2", StringType(), True, None)))
        struct2 = StructType([
            StructField("f1", StringType(), True),
            StructField("f2", StringType(), True, None)
        ])
        self.assertEqual(struct1.fieldNames(), struct2.names)
        self.assertEqual(struct1, struct2)

        struct1 = (StructType().add(StructField("f1", StringType(), True)).add(
            StructField("f2", StringType(), True, None)))
        struct2 = StructType([StructField("f1", StringType(), True)])
        self.assertNotEqual(struct1.fieldNames(), struct2.names)
        self.assertNotEqual(struct1, struct2)

        # Catch exception raised during improper construction
        self.assertRaises(ValueError, lambda: StructType().add("name"))

        struct1 = StructType().add("f1", StringType(),
                                   True).add("f2", StringType(), True, None)
        for field in struct1:
            self.assertIsInstance(field, StructField)

        struct1 = StructType().add("f1", StringType(),
                                   True).add("f2", StringType(), True, None)
        self.assertEqual(len(struct1), 2)

        struct1 = StructType().add("f1", StringType(),
                                   True).add("f2", StringType(), True, None)
        self.assertIs(struct1["f1"], struct1.fields[0])
        self.assertIs(struct1[0], struct1.fields[0])
        self.assertEqual(struct1[0:1], StructType(struct1.fields[0:1]))
        self.assertRaises(KeyError, lambda: struct1["f9"])
        self.assertRaises(IndexError, lambda: struct1[9])
        self.assertRaises(TypeError, lambda: struct1[9.9])
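
A quick standalone sketch of the StructType API the test above exercises; it needs no SparkSession, since pyspark.sql.types is plain Python, and the field names f1/f2 are just placeholders:

from pyspark.sql.types import StringType, StructField, StructType

# Build a schema incrementally; add() returns the StructType, so calls chain.
schema = StructType().add("f1", StringType(), True).add("f2", StringType(), True)

print(schema.fieldNames())                            # ['f1', 'f2']
print(len(schema))                                    # 2
print(schema["f1"] is schema.fields[0])               # True: lookup by name
print(schema[0] is schema.fields[0])                  # True: lookup by position
print(schema[0:1] == StructType(schema.fields[0:1]))  # True: slicing yields a StructType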
Example No. 2
    def create(self, full_table_name: str, schema: StructType,
               primary_key: list, temp_source_table: str) -> str:
        conditions = []
        updates = []
        columns_to_update = set(schema.fieldNames()) - set(primary_key)

        for primary_key_column in primary_key:
            conditions.append(
                f"source.`{primary_key_column}` = target.`{primary_key_column}`"
            )

        for col in columns_to_update:
            updates.append(f"target.`{col}` = source.`{col}`")

        query = (f"MERGE INTO {full_table_name} AS target\n"
                 f"USING {temp_source_table} AS source\n"
                 f"ON {' AND '.join(conditions)}\n"
                 f"{{matched_clause}}"
                 f"WHEN NOT MATCHED THEN INSERT *\n")

        if len(updates) > 0:
            query = query.format(
                matched_clause=
                f"WHEN MATCHED THEN UPDATE SET {', '.join(updates)}\n")
        else:
            query = query.format(matched_clause="")

        return query
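
For illustration, the sketch below restates create() as a free function (the enclosing class is not shown in the snippet, so this is not the original method itself) and prints the MERGE statement it builds for a made-up table; every name in the example is hypothetical:

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

def build_merge_query(full_table_name, schema, primary_key, temp_source_table):
    # Same logic as create() above, restated without `self` for the sketch.
    conditions = [f"source.`{c}` = target.`{c}`" for c in primary_key]
    updates = [f"target.`{c}` = source.`{c}`"
               for c in set(schema.fieldNames()) - set(primary_key)]
    query = (f"MERGE INTO {full_table_name} AS target\n"
             f"USING {temp_source_table} AS source\n"
             f"ON {' AND '.join(conditions)}\n"
             f"{{matched_clause}}"
             f"WHEN NOT MATCHED THEN INSERT *\n")
    matched = (f"WHEN MATCHED THEN UPDATE SET {', '.join(updates)}\n"
               if updates else "")
    return query.format(matched_clause=matched)

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])
print(build_merge_query("analytics.users", schema, ["id"], "users_staging"))
# MERGE INTO analytics.users AS target
# USING users_staging AS source
# ON source.`id` = target.`id`
# WHEN MATCHED THEN UPDATE SET target.`name` = source.`name`
# WHEN NOT MATCHED THEN INSERT *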
Example No. 3
            inferSchema=True, samplingRatio=0.01,
            ignoreTrailingWhiteSpace=True,
        )
        for column in raw.columns:
            new_col = column.lower()
            new_col = rename[new_col] if new_col in rename else new_col
            raw = raw.withColumnRenamed(column, new_col)
    except AnalysisException as e:
        print('AnalysisException', e)
        continue
    # raw.printSchema()
    df = raw
    for field in green_schema.fields:
        if field.name in df.columns:
            df = df.withColumn(field.name, col(field.name).astype(field.dataType))
        else:
            df = df.withColumn(field.name, lit(None).astype(field.dataType))
    df = df.select(green_schema.fieldNames())\
        .withColumn('year', lit(year).astype(IntegerType()))\
        .withColumn('month', lit(month).astype(IntegerType()))
    # df.printSchema()
    # df.show(vertical=True, n=5)
    df.write.parquet(
        path=destination_path.format(dataset=dataset_name),
        mode="append",
        partitionBy=["year", "month"],
        compression="snappy"
    )

job.commit()
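
The loop over green_schema.fields above (and the near-identical one in Example No. 9 below) follows a common pattern: align an input frame to a target schema by casting the columns that exist, adding the missing ones as typed nulls, and projecting in schema order via fieldNames(). A minimal self-contained sketch of that pattern, assuming a local SparkSession and a made-up target schema (astype in the snippet above is an alias for cast):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").getOrCreate()

target_schema = StructType([
    StructField("vendor_id", IntegerType(), True),
    StructField("trip_type", IntegerType(), True),
    StructField("note", StringType(), True),
])

def align_to_schema(df, schema):
    # Cast existing columns to the target type; add missing ones as typed nulls.
    for field in schema.fields:
        if field.name in df.columns:
            df = df.withColumn(field.name, col(field.name).cast(field.dataType))
        else:
            df = df.withColumn(field.name, lit(None).cast(field.dataType))
    # fieldNames() gives the target column order for the final projection.
    return df.select(schema.fieldNames())

raw = spark.createDataFrame([("1", "2")], ["vendor_id", "trip_type"])
align_to_schema(raw, target_schema).printSchema()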
Example No. 4
  ]), True),  
  StructField("lang", StringType(), True),
  StructField("text", StringType(), True),
  StructField("created_at", StringType(), True)
])

tweetDF = spark.read.schema(tweetSchema).json(path)

display(tweetDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.functions import col

schema = tweetSchema.fieldNames()
schema.sort()
tweetCount = tweetDF.filter(col("id").isNotNull()).count()

dbTest("ET1-P-08-04-01", 'created_at', schema[0])
dbTest("ET1-P-08-04-02", 'id', schema[1])
dbTest("ET1-P-08-04-03", 1491, tweetCount)

assert schema[0] == 'created_at' and schema[1] == 'id'
assert tweetCount == 1491

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
Example No. 5
        StructField("fpsh_cnt", IntegerType(), False),          # Feature 39
        StructField("bpsh_cnt", IntegerType(), False),          # Feature 40
        StructField("furg_cnt", IntegerType(), False),          # Feature 41
        StructField("burg_cnt", IntegerType(), False),          # Feature 42
        StructField("total_fhlen", IntegerType(), False),       # Feature 43
        StructField("total_bhlen", IntegerType(), False),       # Feature 44
        StructField("dscp", IntegerType(), False),              # Feature 45
        StructField("label", StringType(), False)               # Class Label
    ])

    # Load CSV data
    data = spark.read.csv(sys.argv[1], schema=schema)

    # Create vector assembler to produce a feature vector for each record for use in MLlib
    # First 45 csv fields are features, the 46th field is the label. Remove IPs from features.
    assembler = VectorAssembler(inputCols=[schema.fieldNames()[1]]+schema.fieldNames()[3:-1], outputCol="features")

    # Assemble feature vector in new dataframe
    assembledData = assembler.transform(data)
    
    # Create a label and feature indexers to speed up categorical columns for decision tree
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(assembledData)
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=20).fit(assembledData)

    # Create a DecisionTree model trainer
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

    # Split the data into training and test sets (20% held out for testing)
    (trainingData, testData) = assembledData.randomSplit([0.8, 0.2])
    
    # Chain indexers and model training in a Pipeline
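
The snippet is cut off after this comment. As a hedged sketch of the usual continuation (reusing labelIndexer, featureIndexer, dt, trainingData and testData from above; the stage order and the final show() are assumptions, not part of the original):

from pyspark.ml import Pipeline

# Chain the indexers and the decision tree trainer into one pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Fit on the training split and score the held-out split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)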
Example No. 6
    def transform(self, func):
        """
        Apply function column-by-column to the GroupBy object.

        The function passed to `transform` must take a Series as its first
        argument and return a Series. The given function is executed for
        each series in each grouped data.

        While `transform` is a very flexible method, its downside is that
        using it can be quite a bit slower than using more specific methods
        like `agg`. Koalas offers a wide range of methods that will be much
        faster than using `transform` for their specific purposes, so try to
        use them before reaching for `transform`.

        .. note:: unlike pandas, it is required for ``func`` to specify a return type hint.

        .. note:: the series within ``func`` is actually a pandas series. Therefore,
            any pandas API within this function is allowed.

        Parameters
        ----------
        func : callable
            A callable that takes a Series as its first argument, and
            returns a Series.

        Returns
        -------
        applied : DataFrame

        See Also
        --------
        aggregate : Apply aggregate function to the GroupBy object.
        Series.apply : Apply a function to a Series.

        Examples
        --------

        >>> df = ks.DataFrame({'A': [0, 0, 1],
        ...                    'B': [1, 2, 3],
        ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])

        >>> g = df.groupby('A')

        Notice that ``g`` has two groups, ``0`` and ``1``.
        Calling `transform` in various ways, we can get different results.
        The functions passed to `transform` below take a Series as their
        argument and return a Series; `transform` applies the function to each
        series in each grouped data and combines them into a new DataFrame:

        >>> def convert_to_string(x) -> ks.Series[str]:
        ...    return x.apply("a string {}".format)
        >>> g.transform(convert_to_string)  # doctest: +NORMALIZE_WHITESPACE
                    B           C
        0  a string 1  a string 4
        1  a string 2  a string 6
        2  a string 3  a string 5

        >>> def plus_max(x) -> ks.Series[np.int]:
        ...    return x + x.max()
        >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
           B   C
        0  3  10
        1  4  12
        2  6  10
        """
        # TODO: codes here are similar with GroupBy.apply. Needs to deduplicate.
        if not isinstance(func, Callable):
            raise TypeError("%s object is not callable" % type(func))

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        if return_sig is None:
            raise ValueError(
                "Given function must have return type hint; however, not found."
            )

        return_type = _infer_return_type(func).tpe
        input_groupnames = [s.name for s in self._groupkeys]
        data_columns = self._kdf._internal.data_columns
        return_schema = StructType([
            StructField(c, return_type) for c in data_columns
            if c not in input_groupnames
        ])

        index_columns = self._kdf._internal.index_columns
        index_names = self._kdf._internal.index_names
        data_columns = self._kdf._internal.data_columns

        def rename_output(pdf):
            # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
            #   within each pdf properly. we might have to deduplicate it.
            import pandas as pd

            if len(index_columns) > 0:
                append = False
                for index_field in index_columns:
                    drop = index_field not in data_columns
                    pdf = pdf.set_index(index_field, drop=drop, append=append)
                    append = True
                pdf = pdf[data_columns]

            if len(index_names) > 0:
                if isinstance(pdf.index, pd.MultiIndex):
                    pdf.index.names = index_names
                else:
                    pdf.index.name = index_names[0]

            # pandas GroupBy.transform drops grouping columns.
            pdf = pdf.drop(columns=input_groupnames)
            pdf = pdf.transform(func)
            # Remaps to the original name, positionally.
            pdf = pdf.rename(
                columns=dict(zip(pdf.columns, return_schema.fieldNames())))
            return pdf

        grouped_map_func = pandas_udf(return_schema,
                                      PandasUDFType.GROUPED_MAP)(rename_output)

        sdf = self._kdf._sdf
        input_groupkeys = [s._scol for s in self._groupkeys]
        sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=return_schema.fieldNames(),
                                  index_map=[])  # index is lost.
        return DataFrame(internal)
Example No. 7
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", brokers) \
    .option("subscribe", "direction_amap") \
    .option("startingOffsets", "earliest") \
    .load()

# Extract the value and the timestamp
df = df.selectExpr("CAST(value AS STRING)", "timestamp")

# Parse the value string as JSON
# Expand the parsed JSON fields into columns
etl_df = df.select(
    from_json("value", direction_schema).alias("parsed_value"), "timestamp")

for name in direction_schema.fieldNames():
    etl_df = etl_df.withColumn(name, col("parsed_value").getField(name))

etl_df = etl_df.drop("parsed_value", "url", "project", "spider", "server",
                     "strategy")
etl_df = etl_df.withColumnRenamed("duration", "total_duration")

etl_df = etl_df.withColumn("selected_path_steps",
                           explode("selected_path_steps"))

for name in selected_path_schema.fieldNames():
    etl_df = etl_df.withColumn(name, col("selected_path_steps").getField(name))

etl_df = etl_df.drop("selected_path_steps")
etl_df = etl_df.select("timestamp", "preset_route", "total_duration", "tmcs")
etl_df = etl_df.withColumn("tmcs", explode("tmcs"))
    StructField("user", StructType([StructField("id", IntegerType(), True)]),
                True),
    StructField("lang", StringType(), True),
    StructField("text", StringType(), True),
    StructField("created_at", StringType(), True)
])
tweetDF = (spark.read.schema(tweetSchema).json(path))

display(tweetDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.functions import col

schema = tweetSchema.fieldNames()
schema.sort()
tweetCount = tweetDF.filter(col("id").isNotNull()).count()

dbTest("ET1-P-08-04-01", 'created_at', schema[0])
dbTest("ET1-P-08-04-02", 'id', schema[1])
dbTest("ET1-P-08-04-03", 1491, tweetCount)

assert schema[0] == 'created_at' and schema[1] == 'id'
assert tweetCount == 1491

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
Example No. 9
    print(source_path.format(
        dataset=dataset_name,
        year=year, month=month
    ))
    schema = schemas[year] if year in schemas else fhv_schema
    raw = spark.read.csv(
        path=source_path.format(
            dataset=dataset_name,
            year=year, month=month
        ),
        schema=schema, header=True,
    )
    # raw.printSchema()
    df = raw
    for field in fhv_schema.fields:
        if field.name not in df.columns:
            df = df.withColumn(field.name, lit(None).astype(field.dataType))
    df = df.select(fhv_schema.fieldNames())\
        .withColumn('year', lit(year).astype(IntegerType()))\
        .withColumn('month', lit(month).astype(IntegerType()))
    # df.printSchema()
    # df.show(vertical=True, n=5)
    df.write.parquet(
        path=destination_path.format(dataset=dataset_name),
        mode="append",
        partitionBy=["year", "month"],
        compression="snappy"
    )

job.commit()