def clean_dist_df( dist_df: pyspark.sql.dataframe.DataFrame ) -> pyspark.sql.dataframe.DataFrame:
    """Select, rename, and type-cast the distribution-report columns.

    Keeps only the columns used downstream, normalizes their names
    (no spaces or special characters), and casts the numeric fields.
    """
    # Keep only the columns needed downstream.
    dist_df = dist_df.select(
        "Name", "Facings", "Capacity", "Days Supply",
        "Classification", "Mat ID", "# POGs",
    )

    # Normalize column names.
    for old_name, new_name in (
        ("Name", "SKU"),
        ("Days Supply", "DaysSupply"),
        ("Mat ID", "MatID"),
        ("# POGs", "POGS"),
    ):
        dist_df = dist_df.withColumnRenamed(old_name, new_name)

    # Cast measure columns to float and identifier/count columns to integer.
    for float_col in ("Facings", "Capacity", "DaysSupply"):
        dist_df = dist_df.withColumn(float_col, dist_df[float_col].cast("float"))
    for int_col in ("MatID", "POGS"):
        dist_df = dist_df.withColumn(int_col, dist_df[int_col].cast("integer"))

    return dist_df
def clean_dataset( df: pyspark.sql.dataframe.DataFrame ) -> pyspark.sql.dataframe.DataFrame:
    """Select the target features, rename columns, cast numerics, and fill nulls.

    Returns the cleaned frame with `NetSales` and `rank` as floats and all
    remaining nulls replaced by 0.
    """
    # Keep only the target features.
    df = df.select(
        'Index', 'Date Detail', 'Company', 'Business Unit', 'Concept_NEW',
        'Product Category', 'Company and Cost Centre', 'SKU',
        'POS Net Sales', 'Rank Total',
    )

    # Normalize column names.
    for old_name, new_name in (
        ("POS Net Sales", "NetSales"),
        ("Date Detail", "Date"),
        ("Product Category", "Category"),
        ("Company and Cost Centre", "Store"),
        ("Business Unit", "BusinessUnit"),
        ("Rank Total", "rank"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # Cast the numeric columns to FloatType.
    df = convertColumn(df, ['NetSales', 'rank'], FloatType())

    # Replace nulls with 0.
    return df.na.fill(0)
def clean_data(df: pyspark.sql.dataframe.DataFrame, spark: pyspark.sql.session.SparkSession):
    """Apply data processing to the raw sales frame.

    1) Rename columns
    2) Cast numeric columns and parse the Date column
    3) Remove the closed store
    4) Remove items with no sales in the whole month (they are not OOS)
    5) Remove items with no stock in the whole month (they are not OOS)
    6) Add rows so each item in each store has full-month records
    7) Replace nulls with 0

    Returns the cleaned frame and the generated list of dates.
    """
    # 1) Normalize column names.
    for old_name, new_name in (
        ("POS Margin on Net Sales", "Margin"),
        ("POS Net Sales", "NetSales"),
        ("Stock Balance Qty", "StockQty"),
        ("POS Qty Sold", "QtySold"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # 2) Cast measures to FloatType and parse Date (yyyyMMdd strings).
    df = convertColumn(df, ['NetSales', 'QtySold', 'Margin', 'StockQty'], FloatType())
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMMdd"))

    # 3) Drop rows belonging to the closed store.
    df = remove_closed_store(df)

    # 4) Items with zero sales all month cannot be out-of-stock cases.
    df = remove_no_sale_item(df)

    # 5) Items with zero stock all month cannot be out-of-stock cases.
    df = remove_no_stock_item(df)

    # 6) Densify: one row per item/store/date over the full month.
    date_generated = create_list_dates(df)
    df = clean_and_add_date(df, date_generated, spark)

    # 7) Replace nulls with 0.
    df = df.fillna(0)

    print("Data processing finished.")
    return df, date_generated
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    """Clean one store's monthly sales and join sub-category / vendor lookups.

    Parameters
    ----------
    df : raw sales frame ("item" encodes "<MatID> - <SKU>")
    Subcat_info / Vendor_info : lookup frames keyed by "SKU" ("<MatID> - <name>")
    store_name : keep only rows for this store
    begin_date, end_date : half-open date window [begin_date, end_date)

    Returns the cleaned frame with Price/FrontMargin/SellMargin columns and
    SubCategory/Vendor joined in on MatID.
    """
    # Select useful lookup columns.
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')

    # Derive MatID from the "SKU" entry and strip the Z/N/D/F prefix letters.
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID
    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID

    # Keep only the human-readable part after the dash.
    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))

    # Filter sales data down to the needed columns and the one store.
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')
    df = df.filter(df.Store == store_name)

    # Remove thousands separators (e.g. "1,333" -> "1333").
    # FIX: guard against nulls — re.sub raises TypeError on None input.
    # Also renamed the local from `udf` to avoid shadowing pyspark.sql.functions.udf.
    strip_commas = UserDefinedFunction(
        lambda x: re.sub(',', '', x) if x is not None else x, StringType())
    df = df.select(*[strip_commas(column).alias(column) for column in df.columns])

    # Parse Date (yyyyMM strings) and keep only [begin_date, end_date).
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # Separate the item name into MatID and SKU.
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    # Normalize column names.
    # (Dropped the no-op rename of "Sales Type": that column was never selected.)
    for old_name, new_name in (
        ("POS Gross Sales", "GrossSales"),
        ("POS Net Sales", "NetSales"),
        ("POS Total Discount", "TotalDiscount"),
        ("POS Qty Sold", "QtySold"),
        ("POS COGS (INV)", "COGS"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # Cast the measure columns to FloatType.
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # Drop the raw item column; MatID/SKU carry its content now.
    df = df.drop('item')

    # FIX: removed a second to_timestamp(df.Date, "yyyyMM") that was applied
    # here — Date is already a timestamp at this point, and re-parsing its
    # string form against "yyyyMM" nulls out the dates.

    # Derived measures.
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # Attach sub-category and vendor info by MatID.
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"], how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"], how="left")

    return df