def format_output(df):
    df = df.withColumn("uniqueKey", f.upper(f.concat(
            f.lit("RY"), f.substring(f.col('year'), 3, 2),
            f.lit("_"), f.col("channel"),
            f.lit("_"), f.col("division"),
            f.lit("_"), f.col("gender"),
            f.lit("_"), f.col("category"),
        ))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week") \
        .groupBy('uniqueKey', 'division', 'gender', 'category', 'channel', 'year') \
        .agg(
            f.to_json(f.collect_list(f.create_map('week_1', 'netSales'))).alias('Net Sales'),
            f.to_json(f.collect_list(f.create_map('week_1', 'salesUnits'))).alias('Sales Units'),
        )
    return output
def task_a_3_step_3_final(spark):
    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "popular-topics-by-country_step-2") \
        .parse_json(a3_struct_common) \
        .withWatermark("timetamp_start", "1 minute") \
        .groupBy("timetamp_start", "timetamp_end") \
        .agg(
            F.collect_list(
                F.create_map([
                    "country_name",
                    F.create_map(["topic_name_exp", "topic_sum"])
                ])
            ).alias("statistics")
        ) \
        .select(
            F.struct(
                F.concat(F.hour('timetamp_start'), lit(":"), F.minute('timetamp_start')).alias("time_start"),
                F.concat(F.hour('timetamp_end'), lit(":"), F.minute('timetamp_end')).alias("time_end"),
                col('statistics')
            ).alias("res")
        ) \
        .send_to_kafka(config.BOOTSTRAP_SERVERS, "popular-topics-by-country", config.LOG_PREFIX)
    return result
def feature_convert_id(self):
    user_data = self.spark.sql(UserSql)
    item_data = self.spark.sql(ItemSql)
    uid_id = self.spark.sql(uid2id)

    user_data = user_data.join(uid_id, ['uid'], "inner") \
        .withColumnRenamed("id", "uidIndex")
    user_data = uid_id.join(user_data, uid_id.uid == user_data.subeventid, "inner") \
        .drop(uid_id.uid).withColumnRenamed("id", "subeventidIndex")
    item_data = item_data.join(uid_id, uid_id.uid == item_data.tuid, "inner") \
        .drop(uid_id.uid).withColumnRenamed("id", "tuidIndex")

    # for feature in ["uid", "age", "workid", "height", "sex"]:
    #     user_data, item_data = labelEncoderExample(user_data, item_data, feature)
    # user_data, item_data = labelEncoderExample(user_data, item_data, "uid")

    # Hash the uid features
    # user_data = user_data.withColumn("uidIndex", F.col("uid").cast(IntegerType()) % 500000)
    # user_data = user_data.withColumn("subeventidIndex", F.col("subeventid").cast(IntegerType()) % 500000)
    # item_data = item_data.withColumn("tuidIndex", F.col("tuid").cast(IntegerType()) % 500000)

    # Age conversion
    age_mapping_expr = F.create_map(
        [F.lit(x) for x in chain(*self.age_dict.items())])
    user_data = user_data.withColumn(
        "ageIndex", age_mapping_expr.getItem(F.col("age")))
    item_data = item_data.withColumn(
        "tageIndex", age_mapping_expr.getItem(F.col("tage")))

    # Height conversion
    height_mapping_expr = F.create_map(
        [F.lit(x) for x in chain(*self.height_dict.items())])
    user_data = user_data.withColumn(
        "heightIndex", height_mapping_expr.getItem(F.col("height")))
    item_data = item_data.withColumn(
        "theightIndex", height_mapping_expr.getItem(F.col("theight")))

    # Work location conversion
    workid_mapping_expr = F.create_map(
        [F.lit(x) for x in chain(*self.workid_dict.items())])
    user_data = user_data.withColumn(
        "workidIndex", workid_mapping_expr.getItem(F.col("workid")))
    item_data = item_data.withColumn(
        "tworkidIndex", workid_mapping_expr.getItem(F.col("tworkid")))

    # Sex conversion
    sex_mapping_expr = F.create_map(
        [F.lit(x) for x in chain(*self.sex_dict.items())])
    user_data = user_data.withColumn(
        "sexIndex", sex_mapping_expr.getItem(F.col("sex")))
    item_data = item_data.withColumn(
        "tsexIndex", sex_mapping_expr.getItem(F.col("tsex")))

    user_data, item_data = someColumnRenamed(user_data, item_data)
    return user_data, item_data
def _mark_as_lit(data, data_type):
    # To support nested types, 'data_type' is required.
    assert data_type is not None

    if data is None:
        return f.lit(data).cast(data_type)

    if isinstance(data_type, ArrayType):
        assert isinstance(data, list)
        # Sadly you cannot create a literal from just an array in pyspark
        return f.array([_mark_as_lit(x, data_type.elementType) for x in data])
    elif isinstance(data_type, StructType):
        assert isinstance(data, tuple) and len(data) == len(data_type.fields)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        children = zip(data, data_type.fields)
        return f.struct([_mark_as_lit(x, fd.dataType).alias(fd.name) for x, fd in children])
    elif isinstance(data_type, DateType):
        # Due to https://bugs.python.org/issue13305 we need to zero pad for years
        # prior to 1000, but this works for all of them
        dateString = data.strftime("%Y-%m-%d").zfill(10)
        return f.lit(dateString).cast(data_type)
    elif isinstance(data_type, MapType):
        assert isinstance(data, dict)
        # Sadly you cannot create a literal from just a dict/tuple in pyspark
        col_array = []
        for k in data:
            col_array.append(_mark_as_lit(k, data_type.keyType))
            col_array.append(_mark_as_lit(data[k], data_type.valueType))
        return f.create_map(*col_array)
    else:
        # lit does not take a data type so we might have to cast it
        return f.lit(data).cast(data_type)
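
# A minimal usage sketch for _mark_as_lit, assuming a live SparkSession named
# `spark`; the data, type, and alias below are illustrative, not from the source.
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, IntegerType, MapType, StringType

nested_type = MapType(StringType(), ArrayType(IntegerType()))
lit_col = _mark_as_lit({"a": [1, 2], "b": [3]}, nested_type)
spark.range(1).select(lit_col.alias("nested_lit")).show(truncate=False)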
def map_col(spark, df, datafolder, map_col_name, df_col_name, new_col_name):
    """
    Map the key-value pairs of a simple two-column csv file to a new column
    in the dataframe by matching the keys.

    Parameters
    ----------
    spark : SparkSession
    df : spark dataframe
        The dataframe containing the df_col_name to be used for mapping.
    datafolder : str
        Folder location of the csv file to be used for mapping.
    map_col_name : str
        The column name of the mapping file.
    df_col_name : str
        The column name in the Spark dataframe to be used.
    new_col_name : str
        New column name for the mapping results.
    """
    df_map = spark_read_csv(spark, datafolder, f'{map_col_name}.csv')
    df_map = df_map.toPandas()
    id_col = f'{map_col_name}_id'
    dic_map = dict(zip(df_map[id_col], df_map[map_col_name]))
    mapping_expr = F.create_map([F.lit(x) for x in chain(*dic_map.items())])
    return df.withColumn(new_col_name, mapping_expr[F.col(df_col_name)])
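
# The chain(*dict.items()) idiom above flattens {k1: v1, k2: v2} into
# [k1, v1, k2, v2] so create_map can consume it as alternating key/value
# literals. A self-contained sketch of the same lookup (all names illustrative):
from itertools import chain
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
dic_map = {1: "one", 2: "two"}
mapping_expr = F.create_map([F.lit(x) for x in chain(*dic_map.items())])
df = spark.createDataFrame([(1,), (2,), (3,)], ["key"])
df.withColumn("label", mapping_expr[F.col("key")]).show()  # unmapped keys yield null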
def correct_country_names(
    df: DataFrame,
    country_col: str,
    country_mapping_path: str,
) -> DataFrame:
    """
    Replace corrupted country values with true ones.

    :param df: dataframe including the country name column
    :param country_col: column name of the country
    :param country_mapping_path: path of the mapping config
    :return: dataframe with a corrected country name column
    """
    column = country_col
    replace_dict = provide_config(country_mapping_path)
    corrupted_values = list(replace_dict.keys())
    map_col = create_map([lit(x) for x in chain(*replace_dict.items())])

    df = df.withColumn(column, F.regexp_replace(column, '"', ''))
    df = df.withColumn(
        column,
        F.when(F.col(column).isin(corrupted_values),
               map_col[df[column]]).otherwise(F.col(column)))
    df = df.filter(F.col(column).isNotNull())
    df = df.drop_duplicates()
    logging.info("Corrupted country columns are replaced with true values")
    return df
def compute_score(self, aux, df_records, tol=15):
    """
    Compute the scoreboard of auxiliary information aux inside the records
    df_records. Both must be spark dataframes. Returns a spark dataframe.
    """
    s = aux.groupby('custId').count().take(1)[0][1]
    # mapping = {n: binom_cdf(p=tol/self.nb_combination, s=s)(n)
    #            for n in range(0, self.max_nb_review_per_cust + 1)}
    mapping_2 = {
        n: proba_2(p=tol / self.nb_combination, s=s)(n)
        for n in range(0, self.max_nb_review_per_cust + 1)
    }
    # mapping_expr = create_map([lit(x) for x in chain(*mapping.items())])
    mapping_expr_2 = create_map(
        [lit(x) for x in chain(*mapping_2.items())])

    merged = broadcast(prepare_join(aux, '_1', True)).crossJoin(
        prepare_join(df_records, '_2', True))
    merged = merged.withColumn('similarity', self.similarity_func(merged))
    # merged = merged.withColumn('value', 1/F.log(F.log(merged.nbCustReviews_2+100)) * merged.similarity)
    # merged = merged.withColumn('value', 1/F.log(merged.nbMovieReviews_2) * merged.value)
    # merged = merged.withColumn('value', binom_cdf_udf(merged.nbCustReviews_2) * merged.similarity)
    merged = merged.withColumn(
        'value',
        mapping_expr_2.getItem(col('nbCustReviews_2')) * merged.similarity)
    # merged = merged.withColumn('value', merged.similarity)
    merged = merged.groupBy('custId_1', 'custId_2', 'movieId_1').max('value')
    merged = merged.withColumnRenamed('max(value)', 'value')
    merged = merged.groupBy('custId_1', 'custId_2').sum('value')
    merged = merged.withColumnRenamed('sum(value)', 'value')
    return merged
def test_create_glue_table_parquet(session, bucket, database, compression, partition_by):
    path = "data_samples/nano.csv"
    schema = "id INTEGER, name STRING, value DOUBLE, date DATE, time TIMESTAMP"
    timestamp_format = "yyyy-MM-dd"
    dataframe = session.spark.read_csv(path=path,
                                       schema=schema,
                                       timestampFormat=timestamp_format,
                                       dateFormat=timestamp_format,
                                       header=True)
    dataframe = dataframe \
        .withColumn("my_array", array(lit(0), lit(1))) \
        .withColumn("my_struct", struct(lit("text").alias("a"), lit(1).alias("b"))) \
        .withColumn("my_map", create_map(lit("k0"), lit(1.0), lit("k1"), lit(2.0)))
    s3_path = f"s3://{bucket}/test"
    dataframe.write \
        .mode("overwrite") \
        .format("parquet") \
        .partitionBy(partition_by) \
        .save(compression=compression, path=s3_path)
    session.spark.create_glue_table(dataframe=dataframe,
                                    file_format="parquet",
                                    partition_by=partition_by,
                                    path=s3_path,
                                    compression=compression,
                                    database=database,
                                    table="test",
                                    replace_if_exists=True)
    query = "select count(*) as counter from test"
    pandas_df = session.pandas.read_sql_athena(sql=query, database=database)
    assert pandas_df.iloc[0]["counter"] == 5
    query = "select my_array[1] as foo, my_struct.a as boo, my_map['k0'] as bar from test limit 1"
    pandas_df = session.pandas.read_sql_athena(sql=query, database=database)
    assert pandas_df.iloc[0]["foo"] == 0
    assert pandas_df.iloc[0]["boo"] == "text"
    assert pandas_df.iloc[0]["bar"] == 1.0
def convert_endpoint_to_site(dataset, src_col, dst_col):
    """
    Convert src/dst hostnames to the respective site names.

    :return: dataset
    """
    import requests  # , re
    from pyspark.sql.functions import col, create_map, lit
    from itertools import chain

    # retrieve the mapping
    cric_url = "http://wlcg-cric.cern.ch/api/core/service/query/?json&type=SE"
    r = requests.get(url=cric_url).json()
    site_protocols = {}
    for site, info in r.items():
        if "protocols" in info:
            # print(se, type(se), info, type(info))
            for name, prot in info.get('protocols', {}).items():
                site_protocols.setdefault(get_hostname(prot['endpoint']), site)

    # apply the mapping
    mapping_expr = create_map([lit(x) for x in chain(*site_protocols.items())])
    out_cols = dataset.columns
    dataset = dataset.withColumnRenamed(src_col, "src")
    dataset = dataset.withColumnRenamed(dst_col, "dst")
    dataset = dataset.withColumn(src_col, mapping_expr[dataset["src"]]) \
        .withColumn(dst_col, mapping_expr[dataset["dst"]])
    return dataset.select(out_cols)
def svod(default_schema, snapshot_date):
    print("Selecting rows from source table DATASCIENCE.svod_segmentation_master")
    seg_source = spark.sql(
        "SELECT * FROM {}.svod_segmentation_master WHERE snapshot_date='{}'".format(
            default_schema, snapshot_date))
    read_count = seg_source.count()
    print("Read count is {}".format(read_count))
    print("Generating the map for {} rows".format(read_count))
    svod_df = seg_source.select(
        "userid",
        "subscription_id",
        "segment_name",
        create_map(lit('For the Family'), col('prob_segment_0'),
                   lit('Drama Watchers'), col('prob_segment_1'),
                   lit('Anime Fans'), col('prob_segment_2'),
                   lit('Broadcast Generalists'), col('prob_segment_3'),
                   lit('Reality Watchers'), col('prob_segment_4'),
                   lit('Comedy Watchers'), col('prob_segment_5'),
                   lit('Exclusive / Prestige'), col('prob_segment_6'),
                   lit('Content Miners / Film Buffs'), col('prob_segment_7')).alias("prob"),
        "snapshot_date")
    return svod_df
def get_weighted_dataframe(df, doGen, resonance, era, subEra, shift=None):
    '''
    Produces a dataframe with weight and weight2 columns, where weight is
    1 for data or the pileup ratio for MC.
    The optional shift parameter allows for a different systematic shift
    to the weights.
    '''
    # TODO: implement systematic shifts in the weight such as PDF, pileup, etc.
    # get the pileup
    pileup_ratio, pileup_edges = get_pileup(resonance, era, subEra)

    # build the weights (pileup for MC)
    # TODO: if there is a weight column (ie, gen weight) get that first
    if doGen:
        pileupMap = {e: r for e, r in zip(pileup_edges[:-1], pileup_ratio)}
        mapping_expr = F.create_map(
            [F.lit(x) for x in itertools.chain(*pileupMap.items())])
        weightedDF = df.withColumn(
            'weight', mapping_expr.getItem(F.col('tag_nVertices')))
    else:
        weightedDF = df.withColumn('weight', F.lit(1.0))
    weightedDF = weightedDF.withColumn(
        'weight2', F.col('weight') * F.col('weight'))
    return weightedDF
def create_exprs(_input_col, _buckets, _func):
    def count_exprs(_exprs):
        return F.sum(F.when(_exprs, 1).otherwise(0))

    _exprs = []
    for i, b in enumerate(_buckets):
        lower = b["lower"]
        upper = b["upper"]

        if is_numeric(lower):
            lower = round(lower, 2)
        if is_numeric(upper):
            upper = round(upper, 2)

        if len(_buckets) == 1:
            count = count_exprs(_func(_input_col) == lower)
        elif i == len(_buckets) - 1:
            # The last bucket is closed on both ends so the maximum value is counted.
            # (The original condition `i == len(_buckets)` could never be true,
            # leaving the maximum value outside every bucket.)
            count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) <= upper))
        else:
            count = count_exprs((_func(_input_col) >= lower) & (_func(_input_col) < upper))

        info = F.create_map(F.lit("count"), count.cast("int"),
                            F.lit("lower"), F.lit(lower),
                            F.lit("upper"), F.lit(upper)) \
            .alias("hist_agg" + "_" + _input_col + "_" + str(b["bucket"]))
        _exprs.append(info)
    _exprs = F.array(*_exprs).alias("hist" + _input_col)
    return _exprs
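
# Hypothetical invocation of create_exprs; the bucket dicts' shape is inferred
# from the code above, and `is_numeric` is assumed to be the helper the
# function already relies on:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(float(x),) for x in range(10)], ["x"])
buckets = [{"lower": 0.0, "upper": 5.0, "bucket": 0},
           {"lower": 5.0, "upper": 10.0, "bucket": 1}]
df.agg(create_exprs("x", buckets, F.col)).show(truncate=False)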
def percentile_agg(col_name, df, values, relative_error):
    """
    Return the percentiles of a dataframe column.

    :param col_name: '*', list of column names or a single column name.
    :param df:
    :param values: list of percentiles to be calculated
    :param relative_error: if set to zero, the exact percentiles are computed,
        which could be very expensive. Values from 0 to 1 are accepted.
    :return: percentiles per column
    """
    # Make sure values are double
    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]

    values = val_to_list(values)
    values = list(map(str, values))

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # Get percentiles
        p = F.expr(
            "percentile_approx(`{COLUMN}`, array({VALUES}), {ERROR})".format(
                COLUMN=col_name, VALUES=" , ".join(values), ERROR=relative_error))

        # Zip the arrays
        expr = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
        expr = F.create_map(*list(itertools.chain(*expr)))
    else:
        expr = None
    return expr
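
# A hedged usage sketch for percentile_agg; it assumes the helpers the
# function references (val_to_list, is_column_a, PYSPARK_NUMERIC_TYPES)
# are importable, and the data here is illustrative:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(100).withColumnRenamed("id", "x")
df.agg(percentile_agg("x", df, [0.5, 0.9], relative_error=0).alias("pcts")) \
    .show(truncate=False)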
def rank_preselection_by_popularity(path_train, behaviors_df, preselection_df):
    _, behaviors_train_df, preselection_train_df = read_data(path_train)

    items_popularity_df = behaviors_train_df.union(behaviors_df) \
        .groupby('item_id') \
        .agg(F.count('item_id').alias('popularity')) \
        .sort(F.desc('popularity'))

    # add the items in preselection_df that were not in behaviors_df with a 0 popularity
    items_popularity_df = items_popularity_df.join(
        preselection_df.select('item_id').distinct(), 'item_id', how='full').fillna(0)

    preselection_pop_df = preselection_df.join(items_popularity_df, 'item_id')
    preselection_pop_df = preselection_pop_df.withColumn(
        'rank',
        F.row_number().over(
            Window.partitionBy('user', 'index').orderBy(F.desc('popularity'))))
    preselection_pop_df = preselection_pop_df \
        .withColumn('dic', F.create_map(['item', 'rank'])) \
        .drop('item_id') \
        .drop('rank') \
        .drop('popularity') \
        .drop('success')
    preselection_pop_df = preselection_pop_df.groupby('user', 'index').agg(
        F.collect_list('dic').alias('dic_list'))
    preselection_pop_df = preselection_pop_df.orderBy(['index', 'user'],
                                                      ascending=[1, 1])
    return items_popularity_df, preselection_pop_df
def _get_base_data(self, startdate, enddate, part_start, part_end):
    sql = """
        select trade_id, prd_ind, trd_type, busi_date,
               sum(pre_mkt_val) pre_mkt_val,
               sum(now_mkt_val) now_mkt_val,
               sum(pos_cash_flow) pos_cash_flow,
               sum(neg_cash_flow) neg_cash_flow,
               max(exception_label) exception_label,
               sum(return) return
        from {2}.{3}
        where busi_date >= '{0}' and busi_date <= '{1}'
          and part >= '{4}' and part <= '{5}'
          and prd_no != '0.0'
        group by trade_id, prd_ind, trd_type, busi_date
    """.format(startdate, enddate, self.adata, self.stock_cust_daily_holding,
               part_start, part_end)
    df = self.sparkSession.sql(sql) \
        .withColumn("detail_item",
                    F.create_map(F.lit("pre_mkt_val"), "pre_mkt_val",
                                 F.lit("now_mkt_val"), "now_mkt_val",
                                 F.lit("pos_cash_flow"), "pos_cash_flow",
                                 F.lit("neg_cash_flow"), "neg_cash_flow",
                                 F.lit("exception_label"), "exception_label",
                                 F.lit("trd_type"), "trd_type",
                                 F.lit("return"), "return",
                                 F.lit("busi_date"), "busi_date")) \
        .groupBy("trade_id", "prd_ind") \
        .agg(F.collect_list("detail_item").alias("detail_list"))
    df.persist(StorageLevel.DISK_ONLY).count()
    return df
def apply_overwrite_dict_to_df(df, lookup_col, overwrite_dict):
    """
    df : A spark dataframe
    lookup_col : The column name that should be used to apply fixes (e.g. col1)
    overwrite_dict : A dictionary where each key is the value of lookup_col you
        want to fix (e.g. {'a': {'col2': 2}} to fix the value in col2 when col1
        is equal to 'a')
    """
    # Split overwrite_dict into a dictionary of single key-value-pair dictionaries
    skvp = {}
    for k in overwrite_dict:
        for kk in overwrite_dict[k]:
            if kk not in skvp:
                skvp[kk] = {}
            skvp[kk][k] = overwrite_dict[k][kk]

    # For each column that is going to be overwritten, apply the single key-value pairs
    for k in skvp:
        mapping_expr = F.create_map(
            [F.lit(x) for x in chain(*skvp[k].items())])
        df = df.withColumn(
            k,
            F.when(mapping_expr.getItem(df[lookup_col]).isNull(),
                   df[k]).otherwise(mapping_expr.getItem(df[lookup_col])),
        )
    return df
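
# Hypothetical call to apply_overwrite_dict_to_df; the frame and the fix dict
# mirror the docstring's example and are not from the source (assumes the F and
# chain imports the function itself relies on):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("b", 2)], ["col1", "col2"])
fixes = {"a": {"col2": 99}}  # when col1 == 'a', overwrite col2 with 99
apply_overwrite_dict_to_df(df, "col1", fixes).show()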
def Binning(df, num_col, no_of_buckets):
    # Remember the original dtype so it can be restored after bucketizing
    for (a, b) in df.dtypes:
        if a == num_col:
            o_dtype = b

    tdf = df.withColumn(num_col, col(num_col).cast('double'))
    qds = QuantileDiscretizer(numBuckets=no_of_buckets,
                              inputCol=num_col,
                              outputCol="bucket_no")
    bucketizer = qds.fit(tdf)
    splits = bucketizer.getSplits()
    tdf = bucketizer.transform(tdf)

    bucket_dict = dict()
    for i in range(no_of_buckets):
        bucket_dict[float(i)] = str(splits[i]) + ' to ' + str(splits[i + 1])

    # tdf = tdf.withColumn('bucket_no', col(num_col).cast('string'))
    mapping_expr = create_map([lit(x) for x in chain(*bucket_dict.items())])
    tdf = tdf.withColumn(num_col + '_bucket_range',
                         mapping_expr.getItem(col('bucket_no')))
    tdf = tdf.withColumn(num_col, col(num_col).cast(o_dtype))
    return tdf, bucket_dict
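
# Hypothetical invocation of Binning, assuming the imports the snippet relies
# on (QuantileDiscretizer, col, lit, create_map, chain) are in scope:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(float(i),) for i in range(100)], ["score"])
binned_df, bucket_ranges = Binning(df, "score", no_of_buckets=4)
binned_df.select("score", "bucket_no", "score_bucket_range").show(5)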
def _as_categorical_type(index_ops: IndexOpsLike, dtype: CategoricalDtype,
                         spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to categorical dtype, given `dtype` and `spark_type`."""
    assert isinstance(dtype, CategoricalDtype)
    if dtype.categories is None:
        codes, uniques = index_ops.factorize()
        return codes._with_new_scol(
            codes.spark.column,
            field=codes._internal.data_fields[0].copy(
                dtype=CategoricalDtype(categories=uniques)),
        )
    else:
        categories = dtype.categories
        if len(categories) == 0:
            scol = SF.lit(-1)
        else:
            kvs = chain(*[(SF.lit(category), SF.lit(code))
                          for code, category in enumerate(categories)])
            map_scol = F.create_map(*kvs)
            scol = F.coalesce(map_scol[index_ops.spark.column], SF.lit(-1))
        return index_ops._with_new_scol(
            scol.cast(spark_type),
            field=index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type, nullable=False),
        )
def test_flatten_schema_no_changes(spark):
    """Check non-struct columns are not affected"""
    df = spark.createDataFrame(df_p)
    df = df.withColumn("array", sf.array(sf.lit("a"), sf.lit("-")))
    df = df.withColumn("map", sf.create_map(sf.lit("b"), sf.lit("_")))

    result = flatten_df(df)

    assert df.columns == result.columns
    assert df.count() == result.count()
def test_column_getitem(self):
    from pyspark.sql.functions import col, create_map, lit
    map_col = create_map(lit(0), lit(100), lit(1), lit(200))
    self.assertRaisesRegexp(
        Py4JJavaError,
        "Unsupported literal type class org.apache.spark.sql.Column id",
        lambda: map_col.getItem(col('id')))
def source_map(df, alias, extra_filter=""): m = F.create_map( list( chain(*((F.lit(name.split("_")[0]), F.col(name)) for name in df.columns if name != "addon_id" and extra_filter in name )))).alias(alias) return m
def generate_map_with_empty_validity(spark, path):
    gen_data = StructGen([['number', IntegerGen()], ['word', LongGen()]],
                         nullable=False)
    df = gen_df(spark, gen_data)
    df_noNulls = df.filter("number is not null")
    df_map = df_noNulls.withColumn(
        "map", f.create_map(["number", "word"])).drop("number").drop("word")
    df_map.coalesce(1).write.parquet(path)
def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike:
    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(*[(SF.lit(code), SF.lit(category))
                      for code, category in enumerate(categories)])
        map_scol = F.create_map(*kvs)
        scol = map_scol[index_ops.spark.column]
    return index_ops._with_new_scol(scol)
def MakeDict(df, keycol, valcol):
    mymap = df.select(create_map(keycol, valcol).alias('map'))
    mylist = mymap.select(collect_list(mymap.map).alias('dict')).head()['dict']
    d = {}
    for elem in mylist:
        for key in elem:
            d[key] = elem[key]
    return d
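
# MakeDict collects a two-column DataFrame back to the driver as a plain Python
# dict; a small sketch (assumes the create_map and collect_list imports the
# function relies on):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("k1", 1), ("k2", 2)], ["k", "v"])
MakeDict(df, "k", "v")  # -> {'k1': 1, 'k2': 2}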
def childMap(df, id, parentid, parentLookupCol, lookupCol):
    top = "top" + parentLookupCol
    rec = df.where((F.col(lookupCol).isNotNull())
                   & (F.col("parent_flag") == "false"))
    rec = rec.select(F.col(id),
                     F.lit("").alias(parentid),
                     F.col(lookupCol).alias(top))
    child = rec.select(F.create_map([F.col(id), F.col(top)]).alias("childMap"))
    return child
def recode(col_name, map_dict, default=None):
    # The original checked `isinstance(col, Column)`, which tested the
    # pyspark `col` function rather than the argument.
    if not isinstance(col_name, Column):
        col_name = col(col_name)
    mapping_expr = create_map([lit(x) for x in chain(*map_dict.items())])
    if default is None:
        return mapping_expr.getItem(col_name)
    else:
        return when(~isnull(mapping_expr.getItem(col_name)),
                    mapping_expr.getItem(col_name)).otherwise(default)
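
# Sketch of recode in use; assumes the pyspark.sql.functions names recode
# itself uses (col, lit, when, isnull, create_map, Column, chain) are imported,
# and the data is illustrative:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("US",), ("DE",), ("??",)], ["country"])
df.withColumn("region",
              recode("country", {"US": "NA", "DE": "EU"}, default="UNKNOWN")).show()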
def change_dimension_level_name(self, targetCol, defaltName="Others",
                                topnLevel=None, newLevelNameDict=None):
    """
    Change the level names of particular dimension columns.

    Parameters
    ----------
    self : Object
        An Object of class DataFrameTransformer
    targetCol : list/tuple of strings
        Columns on which to apply this transformation.
    topnLevel : int or None
        Top levels to keep (by level count). All other levels will be clubbed as "Others".
    defaltName : str
        Default name given to all the other levels.
    newLevelNameDict : dict
        Mapping for changing level names: {"existingName1": "newName1", "existingName2": "newName2"}

    Notes
    ----------
    If both topnLevel and newLevelNameDict are provided, then topnLevel takes precedence.
    """
    if topnLevel is None:
        topnLevel = GLOBALSETTINGS.DTREE_TARGET_DIMENSION_MAX_LEVEL - 1
    print(targetCol)
    for colName in targetCol:
        levelCountDict = self._metaParser.get_unique_level_dict(colName)
        levelCountArray = sorted(levelCountDict.items(), key=lambda x: x[1], reverse=True)
        countArr = [x[1] for x in levelCountArray]
        totalCount = sum(countArr)
        existinCount = sum(countArr[:topnLevel])
        newLevelCount = levelCountArray[:topnLevel]
        newLevelCount.append((defaltName, totalCount - existinCount))
        mappingDict = dict([(tup[0], tup[0]) if idx <= topnLevel - 1
                            else (tup[0], defaltName)
                            for idx, tup in enumerate(levelCountArray)])
        mapping_expr = create_map(
            [lit(x) for x in chain(*mappingDict.items())])
        existingCols = self._data_frame.columns
        self._data_frame = self._data_frame.withColumnRenamed(
            colName, str(colName) + "JJJLLLLKJJ")
        self._data_frame = self._data_frame.withColumn(
            colName, mapping_expr.getItem(col(str(colName) + "JJJLLLLKJJ")))
        self._data_frame = self._data_frame.select(existingCols)
        self._dataframe_helper.set_dataframe(self._data_frame)
        self._metaParser.update_level_counts(colName, dict(newLevelCount))
def parentMap(df, id, parentid, parentLookupCol, lookupCol):
    top = "top" + lookupCol
    rec = df.withColumn(
        "parent_flag",
        F.when(F.col(parentid) == "", "true").otherwise("false"))
    rec = rec.withColumn(
        top,
        F.when(F.col("parent_flag") == "true", F.col(parentLookupCol)))
    parent = rec.where(F.col("parent_flag") == "true") \
        .select(F.create_map([F.col(id), F.col(top)]).alias("parentMap"))
    return parent
def replace(dataframe: DataFrame, column: str,
            replace_dict: Dict[str, str]) -> DataFrame:
    """Replace values of a string column in the dataframe using a dict.

    Example:

    >>> from butterfree.extract.pre_processing import replace
    ... from butterfree.testing.dataframe import (
    ...     assert_dataframe_equality,
    ...     create_df_from_collection,
    ... )
    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> input_data = [
    ...     {"id": 1, "type": "a"}, {"id": 2, "type": "b"}, {"id": 3, "type": "c"}
    ... ]
    >>> input_df = create_df_from_collection(input_data, spark_context, spark_session)
    >>> input_df.collect()
    [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')]
    >>> replace_dict = {"a": "type_a", "b": "type_b"}
    >>> replace(input_df, "type", replace_dict).collect()
    [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')]

    Args:
        dataframe: data to be transformed.
        column: string column on the dataframe where to apply the replace.
        replace_dict: dict with values to be replaced.
            All mapped values must be string.

    Returns:
        Dataframe with column values replaced.

    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError("dataframe needs to be a Pyspark DataFrame type")
    if (column not in dict(dataframe.dtypes)) or (dict(dataframe.dtypes)[column] != "string"):
        raise ValueError("column needs to be the name of a string column in dataframe")
    if (not isinstance(replace_dict, dict)) or (not all(
            isinstance(value, str) for value in chain(*replace_dict.items()))):
        raise ValueError("replace_dict needs to be a Python dict with "
                         "all keys and values as string values")

    mapping = create_map(
        [lit(value) for value in chain(*replace_dict.items())]  # type: ignore
    )
    return dataframe.withColumn(column, coalesce(mapping[col(column)], col(column)))
def get_documents_df(self, data_files_path, redirects_files_path):
    """
    Return a DataFrame containing the entities to be indexed.
    Redirects are filtered out, if given.

    :param data_files_path: path to .ttl file(s) (e.g., /dbpedia/all_data/*.ttl)
    :param redirects_files_path: path to .ttl file(s) (e.g., /dbpedia/redirects/*.ttl).
    :return:
    """
    # DF schema: subj, pred, obj
    df = self._ttl_as_df(data_files_path)

    # Filter redirected entities, if any
    redirects = self._get_redirects(redirects_files_path)
    df = df.join(redirects, df.subj == redirects.subj, 'left_anti')

    # Replace RDF properties with index field names
    mapping = F.create_map(
        [F.lit(x) for x in chain(*self._predicate2field.items())])
    df = df \
        .withColumn('pred', mapping[df.pred]) \
        .dropna()  # remove unknown properties

    # Swap subj and obj when pred = redirect (store the relation as subj hasRedirect obj)
    # Make subj the uri col
    uri_col = ElasticConfig.Fields.URI.value
    df = df \
        .withColumn(uri_col,
                    F.when(df.pred != ElasticConfig.Fields.REDIRECT.value, df.subj).otherwise(df.obj)) \
        .withColumn('obj_new',
                    F.when(df.pred != ElasticConfig.Fields.REDIRECT.value, df.obj).otherwise(df.subj)) \
        .drop('subj', 'obj') \
        .select(F.col(uri_col), F.col('pred'), F.col('obj_new').alias('obj'))

    # Pivot table grouping by uri; collect objects into lists
    df = df.groupBy(uri_col).pivot("pred").agg(F.collect_list('obj'))

    # Add a column with extra surface forms
    extra_surface_forms = F.udf(self._surface_forms_from_uri, ArrayType(StringType()))
    df = df.withColumn("extra_surface_forms", extra_surface_forms(uri_col))

    # If the surface forms column already exists, merge it with the new one
    if ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value in df.columns:
        merge_surface_forms = F.udf(lambda sf1, sf2: list({*sf1 + sf2}),
                                    ArrayType(StringType()))
        df = df \
            .withColumn(ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value,
                        merge_surface_forms(ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value,
                                            'extra_surface_forms')) \
            .drop('extra_surface_forms')
    else:
        # else just rename the new one
        df = df.withColumnRenamed(
            'extra_surface_forms',
            ElasticConfig.Fields.SURFACE_FORM_KEYWORD.value)
    return df
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)


# COMMAND ----------

from pyspark.sql.functions import create_map

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(2)


# COMMAND ----------

# Note: the next two cells originally called the Python builtin `map`, which is
# not a Spark function; `create_map` is what builds a map column.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)


# COMMAND ----------

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)