def task_7(df):
    df.withColumn(
        "hour",
        col("timestamp").substr(instr(col("timestamp"), 'T') + 1, lit(2)))\
        .groupBy("hour")\
        .agg(count("hour").alias("frequency"))\
        .orderBy(desc("frequency"))\
        .show(1)
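# A minimal, hedged sketch of how task_7 might be exercised; the SparkSession setup
# and the sample timestamps below are illustrative assumptions, not part of the
# original snippet. It shows that instr() returns the 1-based position of 'T' in an
# ISO-8601 timestamp, so substr(pos + 1, 2) picks out the hour.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, instr, lit, count, desc

spark = SparkSession.builder.master("local[1]").appName("task_7_demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("2021-03-01T09:15:00",), ("2021-03-01T09:45:00",), ("2021-03-01T17:05:00",)],
    ["timestamp"])

task_7(demo_df)  # expected to show the single most frequent hour ("09")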
def count_substrings(substrings, field_name):
    if field_name not in reference.columns:
        return set()

    substrings = [substring.replace('.', '__DOT__') for substring in substrings]

    if signatures is not None:
        found_substrings = reference.select(['signature'] + [(functions.instr(reference[field_name], substring.replace('__DOT__', '.')) != 0).alias(str(substrings.index(substring))) for substring in substrings])\
            .rdd\
            .flatMap(lambda v: [(i, 1) for i in range(0, len(substrings)) if v[str(i)]] + ([] if v['signature'] not in broadcastSignatures.value else [((v['signature'], i), 1) for i in range(0, len(substrings)) if v[str(i)]]))\
            .reduceByKey(lambda x, y: x + y)\
            .filter(lambda kv: kv[1] >= MIN_COUNT)\
            .collect()

        substrings_ref = [(substrings[elem[0]], elem[1]) for elem in found_substrings if isinstance(elem[0], int)]
        substrings_signatures = [elem for elem in found_substrings if not isinstance(elem[0], int)]
        substrings_groups = dict([(signature, [(substrings[i], count) for (s, i), count in substrings_signatures if s == signature]) for signature in signatures])
    else:
        substrings_ref = reference.select([(functions.instr(reference[field_name], substring.replace('__DOT__', '.')) != 0).alias(substring) for substring in substrings])\
            .rdd\
            .flatMap(lambda v: [(substring, 1) for substring in substrings if v[substring]])\
            .reduceByKey(lambda x, y: x + y)\
            .filter(lambda kv: kv[1] >= MIN_COUNT)\
            .collect()

        substrings_groups = dict()
        for group in groups:
            substrings_groups[group[0]] = group[1].select([(functions.instr(group[1][field_name], substring.replace('__DOT__', '.')) != 0).alias(substring) for substring in substrings])\
                .rdd\
                .flatMap(lambda v: [(substring, 1) for substring in substrings if v[substring]])\
                .reduceByKey(lambda x, y: x + y)\
                .filter(lambda kv: kv[1] >= MIN_COUNT)\
                .collect()

    all_substrings_ref = set([substring for substring, count in substrings_ref if float(count) / total_reference > min_support_diff])
    all_substrings_groups = dict([(group_name, set([substring for substring, count in substrings_groups[group_name] if float(count) / total_groups[group_name] > min_support_diff])) for group_name in group_names])
    all_substrings = all_substrings_ref.union(*all_substrings_groups.values())

    substrings_ref = [(substring, count) for substring, count in substrings_ref if substring in all_substrings]
    for group_name in group_names:
        substrings_groups[group_name] = [(substring, count) for substring, count in substrings_groups[group_name] if substring in all_substrings_ref.union(all_substrings_groups[group_name])]

    save_results(substrings_ref, substrings_groups)

    return all_substrings
def task_9(df):
    df.filter((df.retrieval_stage != 'geolocator') & (df.operation_part.contains('Failed')))\
        .withColumn(
            "access_key",
            col("operation_part").substr(instr(col("operation_part"), 'Access') + 8, lit(11)))\
        .groupBy("access_key")\
        .agg(count("access_key").alias("frequency"))\
        .orderBy(desc("frequency"))\
        .show(10)
def test_booleans_on_columns(self):
    DOT_code_filter = col("StockCode") == "DOT"
    price_filter = col('UnitPrice') > 600
    description_filter = instr(col("Description"), "POSTAGE") >= 1
    new_df = self.df.withColumn(
        "isExpensive",
        DOT_code_filter & (price_filter | description_filter))\
        .where("isExpensive").select("unitPrice", "isExpensive").collect()
    for x in new_df:
        self.assertEqual(x['isExpensive'], True)
def filterin_contains(dataframe, column, substr, coltype):
    if coltype is None:
        logging.warning(
            "column {0} type is unknown, cannot filter by contains {1}".format(
                column, substr))
        return dataframe
    if coltype['dataType'] == 'string':
        logging.info("apply filter contains by {0} on column {1}".format(
            substr, column))
        dataframe = dataframe.filter(instr(col(column), substr) != 0)
    return dataframe
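# A hypothetical usage sketch for filterin_contains. The sample DataFrame and the
# shape of the coltype dict ({'dataType': 'string'}, inferred from the function body)
# are assumptions, not taken from the original project.
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, instr

spark = SparkSession.builder.master("local[1]").appName("contains_demo").getOrCreate()
events = spark.createDataFrame(
    [("ok",), ("Failed: timeout",), ("Failed: auth",)], ["message"])

# Keep only rows whose 'message' column contains the substring 'Failed'.
failed_only = filterin_contains(events, "message", "Failed", {"dataType": "string"})
failed_only.show()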
def test_or_statement_with_filter(self):
    price_filter = col('UnitPrice') > 600
    descrip_filter = instr(self.df.Description, "POSTAGE") >= 1
    filtered_df = self.df.where(
        self.df.StockCode.isin("DOT")).where(price_filter | descrip_filter)
    self.assertEqual(2, filtered_df.count())
# select 5, 'five', 5.0

# Working with Booleans
# Boolean statements consist of four elements: and, or, true, and false.
# These statements are often used as conditional requirements: a row of data must either pass the test or it will be filtered out.
from pyspark.sql.functions import col, instr, expr

df.where(col('InvoiceNo') != 536365)\
    .select('InvoiceNo', 'Description')\
    .show(5, False)

df.where('InvoiceNo <> 536365')

priceFilter = col('UnitPrice') > 600
descripFilter = instr(df.Description, 'POSTAGE') >= 1  # like a string find(): returns the 1-based position of the substring, or 0 if not found
df.where(df.StockCode.isin('DOT')).where(priceFilter | descripFilter).show()

# Boolean expressions are not just reserved for filters. To filter a DataFrame, you can also specify a Boolean column.
DOTCodeFilter = col('StockCode') == 'DOT'
priceFilter_ = col('UnitPrice') > 600
descripFilter_ = instr(col('Description'), 'POSTAGE') >= 1
df.withColumn('isExpensive', DOTCodeFilter & (priceFilter_ | descripFilter_))\
    .where('isExpensive')\
    .select('unitPrice', 'isExpensive')\
    .show(5)

# In fact, it's often easier to express filters as SQL statements than to use the programmatic DF interface.
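# Following up on the comment above, a small, hedged sketch of the same "isExpensive"
# filter expressed as SQL. It assumes the active SparkSession is available as `spark`;
# the view name dfTable matches the SQL comments used elsewhere in these examples.
df.createOrReplaceTempView("dfTable")
spark.sql("""
    SELECT UnitPrice,
           (StockCode = 'DOT' AND (UnitPrice > 600 OR instr(Description, 'POSTAGE') >= 1)) AS isExpensive
    FROM dfTable
    WHERE StockCode = 'DOT' AND (UnitPrice > 600 OR instr(Description, 'POSTAGE') >= 1)
""").show(5)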
def augment(df):
    if 'addons' in df.columns:
        df = df.select(['*'] +
                       [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__')) for addon in all_addons] +
                       [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version') for addon in all_addons])

    if 'json_dump' in df.columns:
        df = df.select(['*'] + [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id) for module_id, module_name in module_ids.items()])

    if 'plugin_version' in df.columns:
        df = df.withColumn('plugin', df['plugin_version'].isNotNull())

    if 'app_notes' in df.columns:
        df = df.select(['*'] +
                       [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note) for app_note in all_app_notes] +
                       [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

    if 'graphics_critical_error' in df.columns:
        df = df.select(['*'] + [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error) for error in all_gfx_critical_errors])

    if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
        def get_arch(total_virtual_memory, platform, platform_version):
            if total_virtual_memory:
                try:
                    if int(total_virtual_memory) < 2684354560:
                        return 'x86'
                    else:
                        return 'amd64'
                except:
                    return 'unknown'
            elif platform == 'Mac OS X':
                return 'amd64'
            else:
                if 'i686' in platform_version:
                    return 'x86'
                elif 'x86_64' in platform_version:
                    return 'amd64'

        get_arch_udf = functions.udf(get_arch, StringType())
        df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

    if 'adapter_driver_version' in df.columns:
        def get_driver_version(adapter_vendor_id, adapter_driver_version):
            # XXX: Sometimes we have a driver which is not actually made by the vendor,
            #      in those cases these rules are not valid (e.g. 6.1.7600.16385).
            if adapter_driver_version:
                if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                    return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                    return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
            # TODO: AMD?
            return adapter_driver_version

        get_driver_version_udf = functions.udf(get_driver_version, StringType())
        df = df.withColumn('adapter_driver_version_clean', get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

    if 'cpu_info' in df.columns:
        df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
        df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

    if 'dom_ipc_enabled' in df.columns:
        df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

    if 'memory_ghost_windows' in df.columns:
        df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

    if 'memory_top_none_detached' in df.columns:
        df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

    return df
def test_working_wit_filters(self):
    priceFilter = col("UnitPrice") > 600
    descripFilter = instr(self.df.Description, "POSTAGE") >= 1
    df = self.df.where(
        self.df.StockCode.isin("DOT")).where(priceFilter | descripFilter)
    self.assertEqual(df.count(), 2)
    .show(5, False)

#2
print("2")
df.where("InvoiceNo = 536365")\
    .show(5, False)

#3
print("3")
df.where("InvoiceNo <> 536365")\
    .show(5, False)

#4
print("4")
priceFilter = col("UnitPrice") > 600
descriptionFilter = instr(df.Description, "WHITE") >= 1
df.where(df.StockCode.isin("DOT"))\
    .where(priceFilter | descriptionFilter)\
    .show()
"""
= SELECT * FROM dfTable
  WHERE StockCode IN ("DOT") AND (UnitPrice > 600 OR instr(Description, "WHITE") >= 1)
"""

#5 Defines what is expensive
print("5")
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
"/databricks/retail-data/by-day/2010-12-01.csv")) retail_df.printSchema() retail_df.createOrReplaceTempView("reatail_table") #%% "working with booleans " from pyspark.sql.functions import expr, col print( retail_df.filter(col('InvoiceNo') == 536365).select( "InvoiceNo", "Description").show(5, False)) #%% "conditional statements working with booleans" from pyspark.sql.functions import instr priceFilter = col("UnitPrice") > 600 # filter condition 1 descripFilter = instr(retail_df.Description, "POSTAGE") >= 1 # filter condition 2 print('or statement : \n') print( retail_df.where(priceFilter | descripFilter).select( 'InvoiceNo', 'UnitPrice').show(3, False)) #or statement print('and statements : \n') print( retail_df.where(priceFilter & descripFilter).select( 'InvoiceNo', 'UnitPrice').show(3, False)) #or statement print( retail_df.where(priceFilter).filter(descripFilter).select( 'InvoiceNo', 'UnitPrice').show(3, False)) #or statement
def create_filter(sub_string, column_name):
    # return locate(sub_string.upper(), column_name).cast('boolean').alias('contains_' + sub_string)
    return (instr(column_name, sub_string.upper()) >= 1).alias('contains_' + sub_string)
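# A hypothetical usage sketch for create_filter; the DataFrame, its Description
# column, and the list of substrings are illustrative assumptions. It builds one
# boolean column per substring (contains_red, contains_white, ...).
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, instr

spark = SparkSession.builder.master("local[1]").appName("create_filter_demo").getOrCreate()
products = spark.createDataFrame(
    [("RED WOOLLY HOTTIE",), ("WHITE METAL LANTERN",)], ["Description"])

flags = [create_filter(s, col("Description")) for s in ["red", "white"]]
products.select("Description", *flags).show(truncate=False)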
schema = StructType([
    StructField('user_id', LongType(), True),
    StructField('status', StringType(), True)
])

user_status = spark.read.csv(path=path,
                             encoding='UTF-8',
                             sep='\t',
                             schema=schema,
                             quote="",
                             mode='DROPMALFORMED',
                             ignoreLeadingWhiteSpace=True)

users_suspended = user_status.filter(
    F.instr(user_status.status, 'Account Suspended') > 0).select(
        'user_id').cache()

# ## Twitter datasets
# 0. User vs. URLs
# 0. User vs. Mentions
# 0. User vs. Hashtags

# In[2]:

# tweets = spark.read.parquet('/user/aralytics-parquet/2015/01')
# tweets_count = tweets.count()
# users_count = tweets.select('user.id').distinct().count()
# hashtags_count = tweets.select(F.explode('entities.hashtags.text')).distinct().count()
# urls_count = tweets.select(F.explode('entities.urls.expanded_url')).distinct().count()
df.where(col("InvoiceNo") == "536365")\ .select("InvoiceNo","Description")\ .show(5,False) #another way df.where("InvoiceNo = 536367")\ .show(5,False) #using multiple filters from pyspark.sql.functions import instr priceFilter = col("UnitPrice") > 600 descripFilter = instr(df.Description, "POSTAGE") >= 1 df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show() #specify a boolean column DOTCodeFilter = col("StockCode") == "DOT" priceFilter = col("UnitPrice") > 600 descripFilter = instr(df.Description, "POSTAGE") >= 1 df.withColumn("isExpensive", DOTCodeFilter & (priceFilter|descripFilter))\ .where("isExpensive")\ .select("UnitPrice","isExpensive")\ .show(5,False) #using expressions from pyspark.sql.functions import expr
regexp_extract(col("Description"), extract_str, 1) .alias("color_cleaned"), col("Description"))\ .show(2) """ SELECT regexp_extract(Description, '(BLACK|WHITE|RED|GREEN|BLUE)', 1), Description FROM dfTable """ #4 filter if it contains that string print("4") containsBlack = instr(col("Description"), "BLACK") >= 1 containsWhite = instr(col("Description"), "WHITE") >= 1 df.withColumn("hasSimpleColor", containsBlack | containsWhite)\ .filter("hasSimpleColor")\ .select("Description")\ .show(3, False) """ SELECT Description FROM dfTable WHERE instr(Description, 'BLACK') >= 1 OR instr(Description, 'WHITE') >= 1 """
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, translate, regexp_extract, instr

spark = SparkSession.builder.appName("Pyspark example").getOrCreate()
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(
    "C:/Users/Lenovo/Desktop/spark_data/retail_store.csv")

# 'regexp_replace' is used to substitute color names with NOCOLOR
str1 = "BLACK|WHITE|RED|BLUE|GREEN"
df.select(regexp_replace(col("Description"), str1, "NOCOLOR").alias("no_color_column"), col("Description")).show(5)

# 'translate' replaces the given characters with other characters
df.select(translate(col("Description"), "ABCD", "1234"), col("Description")).show(5)

# 'regexp_extract' is used to extract values
df.select(regexp_extract(col("Description"), str1, 0).alias("color"), col("Description")).show(5)

# 'instr' checks for the existence of a value
containsRed = instr(col("Description"), "RED") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasColor", containsWhite | containsRed).where("hasColor").select("Description").show(5)
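# A small aside on why the '>= 1' pattern above works: instr returns the 1-based
# position of the first occurrence of the substring, and 0 when it is absent, so
# comparing with >= 1 turns it into a boolean. The toy rows below are assumptions
# added for illustration only.
toy = spark.createDataFrame([("RED KITE",), ("BLUE BOX",)], ["Description"])
toy.select(col("Description"),
           instr(col("Description"), "RED").alias("pos_red"),            # 1 for "RED KITE", 0 for "BLUE BOX"
           (instr(col("Description"), "RED") >= 1).alias("hasRed")).show()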
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, column, lit, avg, lpad, regexp_extract, instr

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    df = spark.read.format('csv')\
        .option('sep', ';')\
        .option('header', 'true')\
        .load('user.csv')

    df.select(lit(5), lit(5.5), lit("5.5"), lit("hola")).show()
    print(
        df.select(lit(5), lit(5.5), lit("5.5"), lit("5.5"), lit("hola")).dtypes)
    df.select(lpad(lit("HELLO"), 3, " ")).show()
    df.select(col('name').isin(['jorge'])).show()
    df.select(instr(col('name'), 'Jorge')).show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, column, lit, avg, monotonically_increasing_id, rand, locate, instr

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    df = spark.read.format('csv')\
        .option('sep', ';')\
        .option('header', 'true')\
        .load('user.csv')

    df.select(instr(col('name'), 'Jorge'), locate('Jorge', col('name'))).show()
    df.select(rand().alias("random")).where(expr("random > 0")).show()
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    dyf_soggetti = dfc.select("Soggetti")
    dyf_contratti = dfc.select("Contratti")
    dyf_credito = dfc.select("Credito")
    dyf_prodotti = dfc.select("Prodotti")
    dyf_punti_di_fornitura = dfc.select("PuntiDiFornitura")

    #### Deduplicate subjects (Soggetti)
    deduplica_soggetti = dyf_soggetti.toDF()
    windowSpec = Window.partitionBy(deduplica_soggetti.nome, deduplica_soggetti.cognome).\
        orderBy(F.col("key_soggetti").desc())
    deduplica_soggetti = deduplica_soggetti.withColumn(
        "rank", F.row_number().over(windowSpec)).filter(F.col("rank") == 1)

    ### Number of contracts per subject
    df_contratti_per_soggetto = dyf_contratti.toDF()
    df_contratti_per_soggetto = df_contratti_per_soggetto.groupBy(
        "key_soggetti").count()

    ### Average debt per subject
    df_debito_medio_per_cliente = dyf_credito.toDF()
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumn(
        "d_importo",
        (F.col("importo").substr(F.lit(1), F.instr(F.col("importo"), '€') - 2)).cast('double'))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.groupBy(
        "key_soggetti").agg(F.mean("d_importo"))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumnRenamed(
        "avg(d_importo)", "debito_medio")

    ### Average ELE and GAS, minimum anno_prima_attivazione_fornitura over active contracts
    df_contratti = dyf_contratti.toDF()
    df_contratti = df_contratti.withColumn("ts_data_attivazione_fornitura", F.to_timestamp(df_contratti.data_attivazione_fornitura)).\
        withColumn("ts_data_cessazione_fornitura", F.to_timestamp(df_contratti.data_cessazione_fornitura)).\
        drop("key_punti_di_fornitura", "data_cessazione_fornitura", "codice_contratto", "key_contratti",
             "canale_di_vendita", "anno_prima_attivazione_fornitura")
    cd = F.current_timestamp()
    df_contratti = df_contratti.filter(df_contratti.ts_data_cessazione_fornitura >= cd).\
        filter(df_contratti.ts_data_attivazione_fornitura <= cd)

    df_prodotti = dyf_prodotti.toDF()
    df_prodotti = df_prodotti.withColumn("ts_data_inizio_validita", F.to_timestamp(df_prodotti.data_inizio_validita)).\
        withColumn("ts_data_fine_validita", F.to_timestamp(df_prodotti.data_fine_validita)).\
        drop('data_inizio_validita', 'data_fine_validita', 'key_prodotti')
    df_prodotti = df_prodotti.withColumnRenamed("nome_prodotto", "nome_commerciale")
    df_prodotti = df_prodotti.withColumn("ELE", (df_prodotti.f0 + df_prodotti.f1 + df_prodotti.f2 + df_prodotti.f3) / 4).\
        drop('f0', 'f1', 'f2', 'f3')

    df_tariffe_contratti = df_contratti.join(df_prodotti, "nome_commerciale").\
        withColumn("realGas", F.when(F.col('vettore') == 'GAS', df_prodotti.gas).otherwise(None)).\
        withColumn("realEle", F.when(F.col('vettore') == 'ELE', df_prodotti.ELE).otherwise(None)).\
        drop('gas', 'ELE')
    df_tariffe_soggetti = df_tariffe_contratti.groupby(df_tariffe_contratti.key_soggetti).\
        agg(F.min("data_attivazione_fornitura"), F.mean("realGas"), F.mean("realEle"))
    df_tariffe_soggetti = df_tariffe_soggetti.withColumnRenamed("min(data_attivazione_fornitura)", "data_attivazione_fornitura").\
        withColumnRenamed("avg(realGas)", "media_GAS").\
        withColumnRenamed("avg(realEle)", "media_ELE")

    ## Compute the churn index and the preferred contact channel
    df_contratti = dyf_contratti.toDF()
    df_soggetti_canale_vendita = df_contratti.groupBy(df_contratti.key_soggetti, df_contratti.canale_di_vendita).\
        agg(F.count(df_contratti.canale_di_vendita))
    windowSpec = Window.partitionBy("key_soggetti").\
        orderBy(F.col("count(canale_di_vendita)").desc())
    df_soggetti_canale_vendita = df_soggetti_canale_vendita.\
        withColumn("rank", F.row_number().over(windowSpec)).filter(F.col("rank") == 1)
    df_soggetti_canale_vendita = df_soggetti_canale_vendita.drop(
        'count(canale_di_vendita)', 'rank')

    df_contratti_churn = df_contratti.withColumn(
        "hadChurn",
        F.when(F.col('data_cessazione_fornitura') < cd, 1).otherwise(0))
    df_contratti_churn = df_contratti_churn.groupBy("key_soggetti").agg(
        F.sum("hadChurn"))
    df_contratti_churn = df_contratti_churn.\
        withColumn("Churn", F.when(F.col('sum(hadChurn)') >= 1, 1).otherwise(0)).\
        drop("sum(hadChurn)")

    ### Unpivot the regions table
    df_contratti = dyf_contratti.toDF()
    df_fornitura = dyf_punti_di_fornitura.toDF()
    df_fonitura_per_contratto = df_contratti.join(df_fornitura, "key_punti_di_fornitura").\
        groupBy("key_soggetti", "regione").\
        pivot("regione").\
        agg(F.count("key_punti_di_fornitura"))

    ### Prepare the output
    output = deduplica_soggetti.join(df_contratti_per_soggetto, "key_soggetti").\
        join(df_debito_medio_per_cliente, "key_soggetti").\
        join(df_tariffe_soggetti, "key_soggetti").\
        join(df_contratti_churn, "key_soggetti").\
        join(df_soggetti_canale_vendita, "key_soggetti").\
        join(df_fonitura_per_contratto, "key_soggetti")

    dyf_output = DynamicFrame.fromDF(output, glueContext, "output")
    return DynamicFrameCollection({"CustomTransform0": dyf_output}, glueContext)
def read_spark_df_from_msexchange_data_store(self, **args):
    url = args["hbase_url"]
    r = requests.get(url)

    # Converting the API response to JSON
    try:
        d = r.json()
    except:
        print("Invalid URL")

    # Checking for data availability
    if len(d) == 0:
        print(
            "There are no events to process. Please enter a different search criteria in the url."
        )

    # Converting API data into a Spark DataFrame
    print("Reading the data from profiler...")
    spark = SparkSession.builder.appName(
        'mseapi').enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    tsRDD = sc.parallelize(d)
    df_mail = spark.read.option('multiline', "true").json(tsRDD)
    total_evt_count = df_mail.count()
    print("Total number of records: " + str(total_evt_count))

    if total_evt_count > 0:
        mail_len = f.udf(lambda s: len(s), LongType())
        mail_sum = f.udf(lambda s: sum(s), LongType())
        # mail_mean = f.udf(lambda s: round(mean(s),4), FloatType())
        # mail_stdev = f.udf(lambda s: round(stdev(s),4), FloatType())

        df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"])) > 0)\
            .withColumn("check", f.when(f.instr(df_mail["mail_size"], ',') == 1, f.substring_index(df_mail["mail_size"], ',', -1)).otherwise(df_mail["mail_size"]))\
            .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
            .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")), ',')
                                      .cast(ArrayType(IntegerType())).alias("email_size"),
                                      f.sum("ext_sndrs").alias("ext_sndrs"))\
            .withColumn("no_of_emails", mail_len("email_size"))\
            .withColumn("tot_email_size", mail_sum("email_size"))\
            .withColumn("avg_email_size", f.round(f.col("tot_email_size") / f.col("no_of_emails"), 4))\
            .drop("email_size")
        # .withColumn("email_size_mean", mail_mean("email_size"))\
        # .withColumn("email_size_stdev", f.when(mail_len("email_size") > 1, mail_stdev("email_size")))\

        # df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"])) > 0)\
        #     .withColumn("check", f.when(f.instr(df_mail["mail_size"], ',') == 1, f.substring_index(df_mail["mail_size"], ',', -1)).otherwise(df_mail["mail_size"]))\
        #     .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
        #     .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
        #     .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")), ',')
        #                               .cast(ArrayType(IntegerType())).alias("email_size"),
        #                               f.sum("ext_sndrs").alias("ext_sndrs"))\
        #     .withColumn("no_of_emails", mail_len("email_size"))\
        #     .withColumn("tot_email_size", mail_sum("email_size"))\
        #     .withColumn("avg_email_size", f.round(f.col("tot_email_size") / f.col("no_of_emails"), 4))\
        #     .drop("email_size")
        #     #.withColumn("email_size_mean", mail_mean("email_size"))\
        #     #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1, mail_stdev("email_size")))\

        # df_mail_grp = df_mail.withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
        #     .withColumn("mail_size", f.regexp_replace('mail_size', ' ', ''))\
        #     .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")), ',')
        #                               .cast(ArrayType(IntegerType())).alias("email_size"),
        #                               f.sum("ext_sndrs").alias("ext_sndrs"))\
        #     .withColumn("no_of_emails", mail_len("email_size"))\
        #     .withColumn("tot_email_size", mail_sum("email_size"))\
        #     .withColumn("email_size_mean", mail_mean("email_size"))\
        #     .withColumn("email_size_stdev", mail_stdev("email_size"))\
        #     .drop("email_size")

        df_mail_grp.show(3)
        return df_mail_grp
    else:
        schema = StructType([])
        sqlContext = SQLContext(sc)
        sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
        return sdf
df.select(lit(5), lit("five"), lit(5.0)) # COMMAND ---------- from pyspark.sql.functions import col df.where(col("InvoiceNo") != 536365)\ .select("InvoiceNo", "Description")\ .show(5, False) # COMMAND ---------- from pyspark.sql.functions import instr priceFilter = col("UnitPrice") > 600 descripFilter = instr(df.Description, "POSTAGE") >= 1 df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show() # COMMAND ---------- from pyspark.sql.functions import instr DOTCodeFilter = col("StockCode") == "DOT" priceFilter = col("UnitPrice") > 600 descripFilter = instr(col("Description"), "POSTAGE") >= 1 df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\ .where("isExpensive")\ .select("unitPrice", "isExpensive").show(5) # COMMAND ----------
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
# Replace the given characters with other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

# Extract color names
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean")).show(6)

# Check whether a value exists in the data
# instr
from pyspark.sql.functions import instr
containBlack = instr(col("Description"), "BLACK") >= 1
df.withColumn("HasSimpleColor", containBlack)\
    .where("HasSimpleColor")\
    .select("Description").show(15, False)

# When the number of arguments varies dynamically
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)
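# A possible continuation showing how color_locator could be applied, sketched under
# the assumption that df has a Description column; the specific filter on is_white /
# is_red is illustrative.
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))  # keep all original columns alongside the boolean flags

df.select(*selectedColumns)\
    .where(expr("is_white OR is_red"))\
    .select("Description")\
    .show(3, False)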
    rank().over(spec)).withColumn(
        "dense_rank_val", dense_rank().over(spec)).show()

# COMMAND ----------

from pyspark.sql.functions import col, substring, substring_index, instr, split, concat_ws, repeat
from pyspark.sql.types import StringType

# substring
#orders_new_col.show()
func_df = orders_new_col.select(
    'order_status',
    substring('order_status', 1, 2).alias("sub"),
    substring_index('order_status', "E", -3).alias("sub_ind")).select(
        "*",
        instr('sub_ind', 'E').alias("instr_val"),
        split('order_status', "_")[0].alias("split_val")).select(
            "*",
            concat_ws("|", "order_status", "sub").alias("concat_val"))

func_df.withColumn("repeat_val", repeat("instr_val", 3)).select(
    "*",
    concat_ws("|", *func_df.columns).alias("conc_ws")).show(truncate=False)

#orders_new_col.select(substring_index('order_status', "_", 2)).show()
#list_1 = ["col_1", "col_2"]
#df_1 = spark.createDataFrame(list_1, StringType())
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------

# Date functions
from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit