def check_fact_table_integrity(fact: SparkDataFrame,
                               dim_demographics: SparkDataFrame,
                               dim_airports: SparkDataFrame,
                               dim_countries: SparkDataFrame) -> bool:
    """Check the integrity of models based on fact table.

    :param fact: the data frame of fact table, immigration.
    :param dim_demographics: the data frame of dimension demographics table
    :param dim_airports: the data frame of dimension airports table
    :param dim_countries: the data frame of dimension of countries table
    :return: True if all the integrity check pass otherwise False
    """

    integrity_demographics = fact.select(col('i94addr')).distinct()\
                                 .join(dim_demographics,
                                       fact['i94addr'] == dim_demographics['state_code'], 'left_anti').count() == 0

    integrity_airports = fact.select(col('i94port')).distinct()\
                             .join(dim_airports,
                                   fact['i94port'] == dim_airports['local_code'], 'left_anti').count() == 0

    integrity_countries = fact.select(col('i94cit')).distinct().\
                              join(dim_countries, fact['i94cit'] == dim_countries['code'], 'left_anti').count() == 0

    return integrity_demographics and integrity_airports and integrity_countries
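A minimal smoke test for the check above, assuming a local SparkSession; the tiny frames and their values are invented and only mirror the join keys used in the function (i94addr/state_code, i94port/local_code, i94cit/code).

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy frames whose only purpose is to exercise the three left_anti joins.
fact = spark.createDataFrame([("CA", "LAX", 101)], ["i94addr", "i94port", "i94cit"])
dim_demographics = spark.createDataFrame([("CA",)], ["state_code"])
dim_airports = spark.createDataFrame([("LAX",)], ["local_code"])
dim_countries = spark.createDataFrame([(101,)], ["code"])

# Every fact key is present in its dimension, so the check should return True.
assert check_fact_table_integrity(fact, dim_demographics, dim_airports, dim_countries)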
Example #2
def loadIntoRaw(database, table, source: DataFrame):
    # RAW is a bit special since it does not have a fixed schema.
    # When writing, we instruct it to use the same schema as the source dataset.
    # The ensureParent parameter tells the data source to create the RAW table if it does not already exist.
    destination = cdfRaw(database, table,
                         ensureParent=True).schema(source.schema).load()
    destination.createOrReplaceTempView("destinationRawTable")
    source.select(*destination.columns).write.insertInto("destinationRawTable")
Example #3
    def get_categoricals_multiplier(self,
                                    df: DataFrame,
                                    col_list: list = [],
                                    ignore_cols: list = [],
                                    approx_distinct=100,
                                    rsd=0.05):
        """
        Gets a dictionary of col names and the distinct values in the column.
        :param df:
        :param col_list: Subset list of columns to use as categoricals; if null, all columns will be checked for
        approx_distinct values and considered categoricals
        :param ignore_cols: when not selecting a subset of columns using col_list, ignore columns is a list of
        columns that will be skipped when searching for categoricals with approx_distinct columns.
        :param approx_distinct: log a warning message if the approx number of distinct values is greater than this threshold.
        :param rsd:
        :return:
        """
        # TODO - Add logging of findings
        filter_vals = []
        filter_cols = list(col_list)  # copy so the mutable default argument is not mutated

        if len(col_list) == 0:
            for (dcol, dtype) in df.drop(*ignore_cols).dtypes:
                if dtype == 'string':
                    if self._get_approx_distinct_count_for_col(
                            df, dcol, _rsd=rsd) <= approx_distinct:
                        # LOG print("{} has approx {} distincts".format(dcol, cnt))
                        # LOG print("appending {}".format(dcol))
                        filter_vals.append(df.select(col(dcol)) \
                                           .filter((col(dcol).isNotNull()) &
                                                   (~col(dcol).isin("", "Y", "N"))) \
                                           .distinct().rdd.map(lambda row: str(row[0])).collect())
                        filter_cols.append(dcol)
            # ?? TODO - What about the rest of the potential categorical types (i.e. bools/ints/floats/etc)
            return feature_factory.feature.Multiplier.create_from_cats(
                filter_cols, filter_vals)
        else:
            for dcol in col_list:
                if self._get_approx_distinct_count_for_col(
                        df, dcol) > approx_distinct:
                    print("WARN! {} has more than {} distinct values".format(
                        dcol, approx_distinct))
                filter_vals.append(df.select(col(dcol)) \
                                   .filter((col(dcol).isNotNull()) &
                                           (~col(dcol).isin("", "Y", "N"))) \
                                   .distinct().rdd.map(lambda row: str(row[0])).collect())
            return feature_factory.feature.Multiplier._create_from_cats(
                filter_cols, filter_vals)
Example #4
    def _struct_df(self, df: DataFrame) -> DataFrame:
        """Struct the output dataframe generated by the reader.

        Under the default "value" column coming from Kafka there are the custom
        fields created by some producer. This function will struct the dataframe as
        to get all desired fields from "value" and insert all Kafka default columns,
        including "value", under "kafka_metadata" column. It is important to notice
        that the declared value_schema suffer from the same effects described in
        explode_json_column method in pre_processing module.

        Args:
            df: direct dataframe output from from KafkaReader.

        Returns:
            Structured dataframe with kafka value fields as columns.
                All other default fields from Kafka will be stored under
                "kafka_metadata" column.

        """
        df = df.withColumn("kafka_metadata", struct(*self.KAFKA_COLUMNS))
        df = explode_json_column(df,
                                 column="value",
                                 json_schema=self.value_schema)
        return df.select([field.name
                          for field in self.value_schema] + ["kafka_metadata"])
Example #5
 def df_add_cols_literal(self, df: DataFrame, cols: list):
     existing_cols = df.columns
     missing_cols = [c for c in cols if c not in existing_cols]
     missing_cols = list(set(missing_cols))  # remove duplicates
     missing_cols_expr = [lit(None).alias(c) for c in missing_cols]
     all_expr = ["*"] + missing_cols_expr
     return df.select(all_expr)
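A hypothetical call, assuming helper is an instance of the class that defines df_add_cols_literal and spark is an active SparkSession; the frame and column names are invented.

df = spark.createDataFrame([(1, "x")], ["a", "b"])
out = helper.df_add_cols_literal(df, ["a", "b", "c"])
out.printSchema()  # keeps a and b, adds c as a NULL literal column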
Example #6
    def json_flatten(self, df: DataFrame):
        schema = df.schema
        flatten_df = df.select(self.on_flatten(schema))

        flatten_df_agg = self.concat_similar_columns(df=flatten_df)

        return flatten_df_agg
Example #7
 def remove_illegal_chars(self, dataframe: DataFrame, source_column: str,
                          target_column: str):
     # translate() replaces each character in the matching string with the
     # corresponding character in the replacement string
     df2 = dataframe.select(
         col('id'),
         translate(col(source_column), "".join(self.chars),
                   self.replacement).alias(target_column))
     return df2.select('id', target_column)
Example #8
 def _unpack_struct(self, df: DataFrame, col_name):
     # Promote each field of the struct column to a top-level "<col_name>_<field>"
     # column, then drop the original struct and recurse into any remaining nesting.
     sub_df = df.select(col_name + '.*')
     for subcol_name in sub_df.columns:
         df = df.withColumn(f'{col_name}_{subcol_name}',
                            df[col_name][subcol_name])
     df = df.drop(col_name)
     return self.unpack_nested(df)
Example #9
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    df = convert_types_for_ml(df)
    df = convert_heroes_to_lineup(df)
    df = ml_model.transform(df)
    df = convert_types_for_es(df)

    return df.select("probability_arr", "radiant_win_prediction",
                     "match_seq_num")
 def mass_regex_replace(self, df: DataFrame, pattern_replacement: list):
     pattern = pattern_replacement[0]
     replacement = pattern_replacement[1]
     cols = df.schema.fieldNames()
     selectExpr = [
         regexp_replace(col(c), pattern, replacement).alias(c) for c in cols
     ]
     return df.select(selectExpr)
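A hypothetical call, assuming cleaner is an instance of the class that defines mass_regex_replace and spark is an active SparkSession; the data and the pattern/replacement pair are illustrative.

df = spark.createDataFrame([("a-1", "b-2")], ["x", "y"])
cleaner.mass_regex_replace(df, ["-", "_"]).show()  # every '-' becomes '_' in all columns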
Example #11
def generate_idx_for_df(df: DataFrame, col_name: str, col_schema):
    idx_udf = udf(lambda x: udf_array_to_map(x),
                  MapType(IntegerType(), col_schema, True))
    df = df.withColumn("map", idx_udf(col(col_name)))
    df = df.select("problem_type", "user_id", "oms_protected", "problem_id",
                   "create_at",
                   explode("map").alias("item_id", "answer"))
    return df
Example #12
 def unpack_nested(self, dataframe: DataFrame):
     columns_to_select = []
     for field in dataframe.schema.fields:
         if type(field.dataType) in (ArrayType, StructType):
             c = explode(field.name).alias('int_array')
         else:
             c = col(field.name)
         columns_to_select.append(c)
     return dataframe.select(*columns_to_select)
Example #13
def unpack_df_col(
        df: sparkDataFrame,
        col_name: str,
) -> List:

    df = df.withColumnRenamed(col_name, 'col_to_extract')
    list_col_contents = [row.col_to_extract for row in df.select('col_to_extract').collect()]

    return list_col_contents
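A quick illustration, assuming an active SparkSession named spark; the sample frame is invented.

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
unpack_df_col(df, "letter")  # -> ['a', 'b'] (row order is not guaranteed in general)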
Example #14
def flatten_schema(df: DataFrame):
    """
    :param df: Spark dataframe
    :return: Spark dataframe with flat schema
    """
    # Define the mapping of the column names
    mapping = flat_schema_mapping(df)

    return df.select(*mapping)
Example #15
def convert_columns_and_save(df: DataFrame):
    date_cols = [c for c in df.columns if "Date" in c and "Till" not in c]
    date_cols.append("ReportAsOfEOD")

    return (df.select(*(f.col(c).cast("date").alias(c) if c in date_cols
                        else f.col(c) for c in df.columns))
            .withColumn("ListedOnUTC", f.to_timestamp("ListedOnUTC"))
            .withColumn("BiddingStartedOn", f.to_timestamp("BiddingStartedOn"))
            .withColumn("StageActiveSince", f.to_timestamp("StageActiveSince"))
            .withColumnRenamed("ReportAsOfEOD", "LoanReportAsOfEOD"))
Example #16
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transforming parking vialation data to make it joinable, below are the things steps in high level

    1. Added Borocode
    2. Converted house number in case it is separated by '-'
    3. Converted 'Street Name' to upper case
    4. Removed any data having no house number
    """

    df = (df.select("Violation County", "House Number", "Street Name",
                    "Summons Number", "Issue Date")
          .distinct()
          .withColumn("year",
                      F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy")))
          .orderBy("Violation County", "House Number", "Street Name", "year")
          .coalesce(100)
          .groupBy("Violation County", "House Number", "Street Name", "year")
          .agg({"Summons Number": "count"})
          .withColumnRenamed("count(Summons Number)", "total_cnt")
          .withColumn(
              "BOROCODE",
              F.when(F.col(column).isin(
                  ["MAN", "MH", "MN", "NEWY", "NEW Y", "NY"]), 1)
              .when(F.col(column).isin(["BRONX", "BX"]), 2)
              .when(F.col(column).isin(["BK", "K", "KING", "KINGS"]), 3)
              .when(F.col(column).isin(["Q", "QN", "QNS", "QU", "QUEEN"]), 4)
              .when(F.col(column).isin(["R", "RICHMOND"]), 5)
              .otherwise(0)))

    df = (df.filter(F.col("House Number").isNotNull()).withColumn(
        "temp", F.split("House Number", "-")).withColumn(
            "House Number",
            F.col("temp").getItem(0).cast("int") +
            F.when(F.col("temp").getItem(1).isNull(), "0").otherwise(
                F.col("temp").getItem(1)).cast("int") / 1000,
        ).withColumn("temp",
                     F.col("temp").getItem(0).cast("int")).withColumn(
                         "Street Name", F.upper(F.col("Street Name"))))
    return df
Example #17
    def __validate_col(
        self,
        df: DataFrame,
        partition_cols: List[str],
        target_cols: List[str],
        ts_col: str,
    ):
        """
        Validate if target column exists and is of numeric type, and validates if partition column exists.

        :param df: DataFrame to be validated
        :param partition_cols: Partition columns to be validated
        :param target_cols: Target columns to be validated
        :param ts_col: Timestamp column to be validated
        """

        for column in partition_cols:
            if column not in df.columns:
                raise ValueError(
                    f"Partition Column: '{column}' does not exist in DataFrame."
                )
        for column in target_cols:
            if column not in df.columns:
                raise ValueError(
                    f"Target Column: '{column}' does not exist in DataFrame.")
            if df.select(
                    column).dtypes[0][1] not in supported_target_col_types:
                raise ValueError(
                    f"Target Column needs to be one of the following types: {supported_target_col_types}"
                )

        if ts_col not in df.columns:
            raise ValueError(
                f"Timestamp Column: '{ts_col}' does not exist in DataFrame.")

        if df.select(ts_col).dtypes[0][1] != "timestamp":
            raise ValueError(
                "Timestamp Column needs to be of timestamp type.")
Example #18
def json_transform(dataframe: DataFrame) -> DataFrame:
    """Filters DataFrame's rows using the given condition and value.

    Args:
        dataframe: Spark DataFrame.

    Returns:
        Converted dataframe.
    """
    return dataframe.select(
        to_json(
            struct([dataframe[column] for column in dataframe.columns])  # type: ignore
        ).alias("value")
    )
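A small illustration, assuming an active SparkSession named spark; the input frame is invented.

df = spark.createDataFrame([(1, "a")], ["id", "letter"])
json_transform(df).show(truncate=False)  # the value column holds {"id":1,"letter":"a"}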
Example #19
    def orbat_normalize(self, df: DataFrame, mapping_sheet: str):
        mapping_dict = self.transformExtension.parse_mapping_sheet(
            mapping_str=mapping_sheet)
        selectExpr = []
        for field in df.schema.fields:
            name = field.name
            dtype = field.dataType
            if isinstance(dtype, (ArrayType, StructType, MapType)):
                selectExpr.append(to_json(col(name)).alias(name))
            else:
                selectExpr.append(col(name).alias(name))
        df = df.select(selectExpr)
        mapped_df = self.transformExtension.coalesce_map_col(
            df=df, mapping_dict=mapping_dict)

        return mapped_df
Example #20
    def _transform(self, df: DataFrame) -> DataFrame:
        view: str = self.getView()
        analysis_views_prefix: Optional[str] = self.getAnalysisViewsPrefix()
        output_folder: Optional[Union[Path, str]] = self.getOutputFolder()
        columns_to_analyze: Optional[List[str]] = self.getColumnsToAnalyze()
        columns_to_skip: Optional[List[str]] = self.getColumnsToSkip()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        # load the registered view into a data frame
        df = df.sql_ctx.table(view)

        if columns_to_analyze:
            columns_to_analyze = [
                c for c in df.columns if c in columns_to_analyze
            ]
        else:
            columns_to_analyze = df.columns

        assert columns_to_analyze

        if columns_to_skip:
            columns_to_analyze = [
                c for c in columns_to_analyze if c not in columns_to_skip
            ]

        column_name: str
        for column_name in columns_to_analyze:
            result_df: DataFrame = (
                df.select(column_name).groupBy(column_name).count().orderBy(
                    col("count").desc()))
            if output_folder:
                target_path: str = str(
                    os.path.join(str(output_folder), column_name))
                if progress_logger:
                    progress_logger.write_to_log(
                        f"Writing analysis for column {column_name} to {target_path}"
                    )
                result_df.coalesce(1).write.csv(target_path,
                                                header=True,
                                                mode="overwrite")
            if analysis_views_prefix:
                result_df.createOrReplaceTempView(
                    f"{analysis_views_prefix}{column_name}"
                    if analysis_views_prefix.endswith(
                        "_") else f"{analysis_views_prefix}_{column_name}")

        return df
Example #21
def transform_read_centerline_data(df: DataFrame) -> DataFrame:
    """Transforming centerline data to make it joinable, below are the things steps in high level

    1. Converted ST_LABEL & FULL_STREE to upper case
    2. Converted L_LOW_HN & L_HIGH_HN  separated by '-' for odd house number
    3. Converted R_LOW_HN & R_HIGH_HN  separated by '-' for even house number
    4. Removed any data having no house number in L_LOW_HN and R_LOW_HN
    """
    df = (df.select("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                    "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN").orderBy(
                        "PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                        "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN",
                        "R_HIGH_HN").coalesce(200).withColumn(
                            "ST_NAME", F.upper(F.col("ST_NAME"))).withColumn(
                                "FULL_STREE",
                                F.upper(F.col("FULL_STREE"))).filter(
                                    (F.col("L_LOW_HN").isNotNull())
                                    | (F.col("R_LOW_HN").isNotNull())))
    df = df.withColumn("L_TEMP_ODD", F.split("L_LOW_HN", "-")).withColumn(
        "L_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("L_HIGH_HN", "-")).withColumn(
        "L_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_LOW_HN", "-")).withColumn(
        "R_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_HIGH_HN", "-")).withColumn(
        "R_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    return df
Example #22
def _truth_space_table_old(
    df_labels_with_splink_scores: DataFrame,
    spark: SparkSession,
    threshold_actual: float = 0.5,
    score_colname: str = None,
):
    """Create a table of the ROC space i.e. truth table statistics
    for each discrimination threshold

    Args:
        df_labels_with_splink_scores (DataFrame): A dataframe of labels and associated splink scores
            usually the output of the truth.labels_with_splink_scores function
        threshold_actual (float, optional): Threshold to use in categorising clerical match
            scores into match or no match. Defaults to 0.5.
        score_colname (str, optional): Allows user to explicitly state the column name
            in the Splink dataset containing the Splink score. If None, it will be inferred

    Returns:
        DataFrame: Table of 'truth space' i.e. truth categories for each threshold level
    """

    # This is used repeatedly to generate the roc curve
    df_labels_with_splink_scores.persist()

    # We want percentiles of score to compute
    score_colname = _get_score_colname(df_labels_with_splink_scores,
                                       score_colname)

    percentiles = [x / 100 for x in range(0, 101)]

    values_distinct = df_labels_with_splink_scores.select(
        score_colname).distinct()
    thresholds = values_distinct.stat.approxQuantile(score_colname,
                                                     percentiles, 0.0)
    thresholds.append(1.01)
    thresholds = sorted(set(thresholds))

    roc_dfs = []
    for thres in thresholds:
        df_e_t = df_e_with_truth_categories(df_labels_with_splink_scores,
                                            thres, spark, threshold_actual,
                                            score_colname)
        df_roc_row = _summarise_truth_cats(df_e_t, spark)
        roc_dfs.append(df_roc_row)

    all_roc_df = reduce(DataFrame.unionAll, roc_dfs)
    return all_roc_df
Example #23
def naaccr_read_fwf(flat_file: DataFrame,
                    record_layout: DataFrame,
                    value_col: str = 'value',
                    exclude_pfx: str = 'reserved') -> DataFrame:
    """
    @param flat_file: as from spark.read.text()
                      typically with .value
    @param record_layout: as from http://datadictionary.naaccr.org/?c=7
                          with .start, .length, .xmlId
    """
    fields = [
        func.substring(flat_file[value_col], item.start,
                       item.length).alias(item.xmlId)
        for item in record_layout.collect()
        if not item.xmlId.startswith(exclude_pfx)
    ]  # type: List[Union[Column, str]]
    return flat_file.select(fields)
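A toy illustration, assuming an active SparkSession named spark; the layout rows and the fixed-width line are fabricated and far shorter than a real NAACCR layout.

flat_file = spark.createDataFrame([("20230101X",)], ["value"])
record_layout = spark.createDataFrame(
    [(1, 8, "dateOfDiagnosis"), (9, 1, "reserved01")],
    ["start", "length", "xmlId"])
naaccr_read_fwf(flat_file, record_layout).show()
# yields a single dateOfDiagnosis column with value 20230101;
# reserved01 is skipped by the exclude_pfx filter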
Example #24
def countNullsAcrossAllColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the total number of cells with a null in them

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a null value]
    """
    # https://www.datasciencemadesimple.com/count-of-missing-nanna-and-null-values-in-pyspark/
    from pyspark.sql.functions import isnull, when, count, expr

    # when(isnull(c), c) returns the literal column *name* (a non-null string)
    # for null cells and NULL otherwise, so count() tallies the null cells
    nullCountDf = df.select(
        [count(when(isnull(c), c)).alias(c) for c in df.columns])
    sumExpr = "+".join(nullCountDf.columns) + " as TOTAL"
    sumDf = nullCountDf.select(expr(sumExpr))
    return sumDf.collect()[0].TOTAL
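A quick check, assuming an active SparkSession named spark; the two-row frame is invented.

df = spark.createDataFrame([(1, None), (None, "x")], ["a", "b"])
countNullsAcrossAllColumns(df)  # -> 2 (one null in each column)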
Example #25
def countWSAcrossAllStringColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the Total number of cells with a BLANK or all SPACES value

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a BLANK or all SPACES value]
    """
    from pyspark.sql.functions import col, when, count, trim, expr

    stringCols = [cn for (cn, ct) in df.dtypes if ct == "string"]
    blanksCountdf = df.select(
        [count(when(trim(col(c)) == "", True)).alias(c) for c in stringCols])
    sumExpr = "+".join(blanksCountdf.columns) + " as TOTAL"
    sumDf = blanksCountdf.select(expr(sumExpr))
    return sumDf.collect()[0].TOTAL
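A matching check for the blank counter, again with an invented frame; note that NULLs are not counted as blanks here.

df = spark.createDataFrame([("  ", "x"), ("", None)], ["a", "b"])
countWSAcrossAllStringColumns(df)  # -> 2 ("  " and "" count, the NULL does not)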
Example #26
def _traverse_graph_spark_dataframe(
    df: DataFrame, root_node: T, graph: nx.DiGraph
) -> Tuple[DataFrame, Dict[str, List[T]], Dict[str, dict]]:
    inferred_values = {
        col: traverse_graph_with_series(root_node, df.select(col), graph)
        for col in df.columns
    }

    inferred_series = {}
    inferred_paths: Dict[str, List[T]] = {}
    inferred_states: Dict[str, dict] = {}
    for col, (inf_series, inf_path, inf_state) in inferred_values.items():
        assert isinstance(inf_path, list)  # Placate the MyPy Gods

        inferred_series[col] = inf_series
        inferred_paths[col] = inf_path
        inferred_states[col] = inf_state

    # note inference disabled, return df
    return df, inferred_paths, inferred_states
Example #27
    def concat_similar_columns(self, df: DataFrame):
        """
        Replaces '-' with '_' in column names and concatenates columns that only
        differ in that character with ' | '.

        :param df: DataFrame whose similar columns should be concatenated
        :return: DataFrame with one concatenated column per group of similar columns
        """
        fields = df.schema.fieldNames()
        pandas_df = pandas.DataFrame(data={'field': fields})
        pandas_df['group_field'] = pandas_df.apply(
            func=lambda r: r['field'].replace('-', '_'), axis=1)
        group_ser = pandas_df.groupby('group_field',
                                      as_index=False)['field'].apply(list)
        group_fields = group_ser.to_list()
        SEP = ' | '
        selectExpr = [
            concat_ws(SEP, *[col(f) for f in group]).alias(group[0])
            for group in group_fields
        ]
        return df.select(selectExpr)
Example #28
    def __check_labels(sdf: SparkDataFrame, column_name: str,
                       label1: Optional[str], label2: Optional[str]) -> bool:
        r""" checks whether labels meet required conditions for method `execute`.

        Called only when category = 'boolean'

        Args:
            sdf: Spark DataFrame object
            column_name: specific column which is to be executed using 'Binary' mechanism
            label1: label to be used by 'Binary' mechanism
            label2: label to be used by 'Binary' mechanism

        Returns: True if parameters satisfy the conditions

        Raises: TypeError, ValueError if parameters do not obey the rules

        """
        if not isinstance(label1, str) or not isinstance(label2, str):
            raise TypeError("Labels must be strings.")

        if len(label1) == 0 or len(label2) == 0:
            raise ValueError("Labels must be non-empty strings")

        if label1 == label2:
            raise ValueError("Labels must not match")

        # finds unique values in a column
        labels: List[str] = [
            row[column_name]
            for row in sdf.select(column_name).distinct().collect()
        ]

        if len(labels) != 2 or label1 not in labels or label2 not in labels:
            # checks whether all the rows of the column have either label1 or label2
            raise ValueError(
                "Labels in column `%s` do not match with the labels entered" %
                column_name)

        return True
Example #29
 def append(self, df: DataFrame, full_table_name: str, schema: StructType, options: dict):
     # insertInto() requires dataframe columns order to match schema columns order
     df.select([field.name for field in schema.fields]).write.options(**options).insertInto(full_table_name, overwrite=False)
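A hypothetical call, assuming writer is an instance of the class that defines append and df is an existing DataFrame; the table name is made up and the target table is assumed to already exist with this schema.

from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([StructField("id", LongType()), StructField("name", StringType())])
writer.append(df, "analytics.customers", schema, {})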
Example #30
 def _get_approx_distinct_count_for_col(self,
                                        df: DataFrame,
                                        _dcol: str,
                                        _rsd=0.05):
     # approx_count_distinct is a HyperLogLog++ estimate; _rsd caps its relative standard deviation
     return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
         .rdd.map(lambda row: row[0]).collect()[0]
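A hypothetical call, assuming ff is an instance of the class that defines this helper and spark is an active SparkSession; the frame is synthetic.

df = spark.createDataFrame([(i % 10,) for i in range(1000)], ["bucket"])
ff._get_approx_distinct_count_for_col(df, "bucket", _rsd=0.05)  # roughly 10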