def task_7(df):
    # Extract the two characters right after the 'T' in the ISO timestamp (the hour)
    # and show the single most frequent hour.
    df.withColumn(
        "hour",
        col("timestamp").substr(instr(col("timestamp"), 'T') + 1, lit(2))) \
        .groupBy("hour") \
        .agg(count("hour").alias("frequency")) \
        .orderBy(desc("frequency")) \
        .show(1)
Example #2
    def count_substrings(substrings, field_name):
        if field_name not in reference.columns:
            return set()

        substrings = [substring.replace('.', '__DOT__') for substring in substrings]

        if signatures is not None:
            found_substrings = reference.select(['signature'] + [(functions.instr(reference[field_name], substring.replace('__DOT__', '.')) != 0).alias(str(substrings.index(substring))) for substring in substrings])\
            .rdd\
            .flatMap(lambda v: [(i, 1) for i in range(0, len(substrings)) if v[str(i)]] + ([] if v['signature'] not in broadcastSignatures.value else [((v['signature'], i), 1) for i in range(0, len(substrings)) if v[str(i)]]))\
            .reduceByKey(lambda x, y: x + y)\
            .filter(lambda kv: kv[1] >= MIN_COUNT)\
            .collect()

            substrings_ref = [(substrings[elem[0]], elem[1]) for elem in found_substrings if isinstance(elem[0], int)]
            substrings_signatures = [elem for elem in found_substrings if not isinstance(elem[0], int)]
            substrings_groups = dict([(signature, [(substrings[i], count) for (s, i), count in substrings_signatures if s == signature]) for signature in signatures])
        else:
            substrings_ref = reference.select([(functions.instr(reference[field_name], substring.replace('__DOT__', '.')) != 0).alias(substring) for substring in substrings])\
            .rdd\
            .flatMap(lambda v: [(substring, 1) for substring in substrings if v[substring]])\
            .reduceByKey(lambda x, y: x + y)\
            .filter(lambda kv: kv[1] >= MIN_COUNT)\
            .collect()

            substrings_groups = dict()
            for group in groups:
                substrings_groups[group[0]] = group[1].select([(functions.instr(group[1][field_name], substring.replace('__DOT__', '.')) != 0).alias(substring) for substring in substrings])\
                .rdd\
                .flatMap(lambda v: [(substring, 1) for substring in substrings if v[substring]])\
                .reduceByKey(lambda x, y: x + y)\
                .filter(lambda kv: kv[1] >= MIN_COUNT)\
                .collect()

        all_substrings_ref = set([substring for substring, count in substrings_ref if float(count) / total_reference > min_support_diff])
        all_substrings_groups = dict([(group_name, set([substring for substring, count in substrings_groups[group_name] if float(count) / total_groups[group_name] > min_support_diff])) for group_name in group_names])
        all_substrings = all_substrings_ref.union(*all_substrings_groups.values())

        substrings_ref = [(substring, count) for substring, count in substrings_ref if substring in all_substrings]
        for group_name in group_names:
            substrings_groups[group_name] = [(substring, count) for substring, count in substrings_groups[group_name] if substring in all_substrings_ref.union(all_substrings_groups[group_name])]

        save_results(substrings_ref, substrings_groups)

        return all_substrings
def task_9(df):
    # Among non-geolocator operations that contain 'Failed', pull the 11-character
    # access key that starts 8 characters past the position of 'Access', then show
    # the 10 most frequent keys.
    df.filter((df.retrieval_stage != 'geolocator')
              & (df.operation_part.contains('Failed'))) \
        .withColumn(
            "access_key",
            col("operation_part").substr(instr(col("operation_part"), 'Access') + 8, lit(11))) \
        .groupBy("access_key") \
        .agg(count("access_key").alias("frequency")) \
        .orderBy(desc("frequency")) \
        .show(10)
 def test_booleans_on_columns(self):
     DOT_code_filter = col("StockCode") == "DOT"
     price_filter = col('UnitPrice') > 600
     description_filter = instr(col("Description"), "POSTAGE") >= 1
     new_df = self.df.withColumn(
         "isExpensive", DOT_code_filter & (price_filter | description_filter))\
         .where("isExpensive").select("unitPrice", "isExpensive").collect()
     for x in new_df:
         self.assertEqual(x['isExpensive'], True)
Example #5
def filterin_contains(dataframe, column, substr, coltype):
    if coltype is None:
        logging.warning(
            "column {0} type is unknown, cannot filter by contains {1}".format(
                column, substr))
        return dataframe
    if coltype['dataType'] == 'string':
        logging.info("apply filter contains by {0} on column {1}".format(
            substr, column))
        dataframe = dataframe.filter(instr(col(column), substr) != 0)
    return dataframe
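
# A minimal usage sketch (hypothetical df and column; coltype is assumed to be a dict
# describing the column's type, matching the coltype['dataType'] lookup above):
example_coltype = {'dataType': 'string'}
df_filtered = filterin_contains(df, 'Description', 'POSTAGE', example_coltype)
df_filtered.show(5)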
 def test_or_statement_with_filter(self):
     price_filter = col('UnitPrice') > 600
     descrip_filter = instr(self.df.Description, "POSTAGE") >= 1
     filtered_df = self.df.where(
         self.df.StockCode.isin("DOT")).where(price_filter | descrip_filter)
     self.assertEqual(2, filtered_df.count())
Example #7
# select 5, 'five', 5.0


# Working w/ Booleans
# Boolean statements consist of four elements: and, or, true, and false
# These statements are often used as conditional requirements: a row of data must pass the test or it will be filtered out

from pyspark.sql.functions import col, instr, expr
df.where(col('InvoiceNo') != 536365)\
        .select('InvoiceNo', 'Description')\
        .show(5, False)

df.where('InvoiceNo <> 536365')

priceFilter = col('UnitPrice') > 600
descripFilter = instr(df.Description, 'POSTAGE') >= 1 # instr works like a string find(): it returns the 1-based position of the substring, or 0 if it is absent
df.where(df.StockCode.isin('DOT')).where(priceFilter | descripFilter).show()

# Boolean expressions are not just reserved for filters. To filter a DF, you can also specify a Boolean column

DOTCodeFilter = col('StockCode') == 'DOT'
priceFilter_ = col('UnitPrice') > 600
descripFilter_ = instr(col('Description'), 'POSTAGE') >=1
df.withColumn('isExpensive', DOTCodeFilter & (priceFilter_ | descripFilter_))\
        .where('isExpensive')\
        .select('unitPrice', 'isExpensive')\
        .show(5)


# In fact, it's often easier to express filters as SQL statements than to use the programmatic DF interface
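
# For example, the same kind of predicate can be written as a SQL string with expr()
# (a minimal sketch; the 250 threshold is illustrative, not taken from the data above):
df.withColumn('isExpensive', expr('NOT UnitPrice <= 250'))\
        .where('isExpensive')\
        .select('Description', 'UnitPrice')\
        .show(5)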
Example #8
    def augment(df):
        if 'addons' in df.columns:
            df = df.select(['*'] + [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__')) for addon in all_addons] + [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version') for addon in all_addons])

        if 'json_dump' in df.columns:
            df = df.select(['*'] + [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id) for module_id, module_name in module_ids.items()])

        if 'plugin_version' in df.columns:
            df = df.withColumn('plugin', df['plugin_version'].isNotNull())

        if 'app_notes' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note) for app_note in all_app_notes] + [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

        if 'graphics_critical_error' in df.columns:
            df = df.select(['*'] + [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error) for error in all_gfx_critical_errors])

        if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
            def get_arch(total_virtual_memory, platform, platform_version):
                if total_virtual_memory:
                    try:
                        if int(total_virtual_memory) < 2684354560:
                            return 'x86'
                        else:
                            return 'amd64'
                    except:
                        return 'unknown'
                elif platform == 'Mac OS X':
                    return 'amd64'
                else:
                    if 'i686' in platform_version:
                        return 'x86'
                    elif 'x86_64' in platform_version:
                        return 'amd64'

            get_arch_udf = functions.udf(get_arch, StringType())

            df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

        if 'adapter_driver_version' in df.columns:
            def get_driver_version(adapter_vendor_id, adapter_driver_version):
                # XXX: Sometimes we have a driver which is not actually made by the vendor,
                #      in those cases these rules are not valid (e.g. 6.1.7600.16385).
                if adapter_driver_version:
                    if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                        return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                    elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                        return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
                    # TODO: AMD?

                return adapter_driver_version

            get_driver_version_udf = functions.udf(get_driver_version, StringType())

            df = df.withColumn('adapter_driver_version_clean', get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

        if 'cpu_info' in df.columns:
            df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
            df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

        if 'dom_ipc_enabled' in df.columns:
            df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

        if 'memory_ghost_windows' in df.columns:
            df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

        if 'memory_top_none_detached' in df.columns:
            df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

        return df
 def test_working_with_filters(self):
     priceFilter = col("UnitPrice") > 600
     descripFilter = instr(self.df.Description, "POSTAGE") >= 1
     df = self.df.where(
         self.df.StockCode.isin("DOT")).where(priceFilter | descripFilter)
     self.assertEqual(df.count(), 2)
Example #10
    .show(5, False)

#2
print("2")
df.where("InvoiceNo = 536365")\
    .show(5, False)

#3
print("3")
df.where("InvoiceNo <> 536365")\
    .show(5, False)

#4
print("4")
priceFilter = col("UnitPrice") > 600
descriptionFilter = instr(df.Description, "WHITE") >= 1

df.where(df.StockCode.isin("DOT"))\
    .where(priceFilter | descriptionFilter)\
    .show()
""" =
SELECT *
FROM dfTable
WHERE StockCOde in ("DOT") 
AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)
"""

#5 Defines what is expensive
print("5")
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
                 "/databricks/retail-data/by-day/2010-12-01.csv"))
retail_df.printSchema()
retail_df.createOrReplaceTempView("retail_table")
#%%
"working with booleans "
from pyspark.sql.functions import expr, col

retail_df.filter(col('InvoiceNo') == 536365).select(
    "InvoiceNo", "Description").show(5, False)
#%%
"conditional statements working with booleans"
from pyspark.sql.functions import instr

priceFilter = col("UnitPrice") > 600  # filter condition 1
descripFilter = instr(retail_df.Description,
                      "POSTAGE") >= 1  # filter condition 2

print('or statement : \n')
retail_df.where(priceFilter | descripFilter).select(
    'InvoiceNo', 'UnitPrice').show(3, False)  # or statement

print('and statements : \n')

retail_df.where(priceFilter & descripFilter).select(
    'InvoiceNo', 'UnitPrice').show(3, False)  # and statement

retail_df.where(priceFilter).filter(descripFilter).select(
    'InvoiceNo', 'UnitPrice').show(3, False)  # chained where/filter also acts as an AND
def create_filter(sub_string, column_name):
    # return locate(sub_string.upper(), column_name).cast('boolean').alias('contains_' + sub_string)
    return (instr(column_name, sub_string.upper()) >= 1).alias('contains_' +
                                                               sub_string)
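
# A minimal usage sketch (hypothetical df and substring list; assumes col is imported
# from pyspark.sql.functions): build one boolean column per substring.
substrings = ['postage', 'dot']
df.select([create_filter(s, col('Description')) for s in substrings]).show(5)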
Example #13
schema = StructType([
    StructField('user_id', LongType(), True),
    StructField('status', StringType(), True)
])

user_status = spark.read.csv(path=path,
                             encoding='UTF-8',
                             sep='\t',
                             schema=schema,
                             quote="",
                             mode='DROPMALFORMED',
                             ignoreLeadingWhiteSpace=True)

users_suspended = user_status.filter(
    F.instr(user_status.status, 'Account Suspended') > 0).select(
        'user_id').cache()

# ## Twitter datasets
# 1. User vs. URLs
# 2. User vs. Mentions
# 3. User vs. Hashtags

# In[2]:

# tweets = spark.read.parquet('/user/aralytics-parquet/2015/01')

# tweets_count   = tweets.count()
# users_count    = tweets.select('user.id').distinct().count()
# hashtags_count = tweets.select(F.explode('entities.hashtags.text')).distinct().count()
# urls_count     = tweets.select(F.explode('entities.urls.expanded_url')).distinct().count()
Example #14
df.where(col("InvoiceNo") == "536365")\
        .select("InvoiceNo","Description")\
        .show(5,False)

#another way

df.where("InvoiceNo = 536367")\
        .show(5,False)

#using multiple filters

from pyspark.sql.functions import instr

priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

#specify a boolean column

DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter|descripFilter))\
        .where("isExpensive")\
        .select("UnitPrice","isExpensive")\
        .show(5,False)

#using expressions

from pyspark.sql.functions import expr
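
# (The snippet is cut off here; a minimal sketch of the same filter written as one SQL
# expression, assuming the same df:)
df.where(expr("StockCode in ('DOT') AND (UnitPrice > 600 OR instr(Description, 'POSTAGE') >= 1)"))\
    .show(5, False)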
Example #15
    regexp_extract(col("Description"), extract_str, 1)
    .alias("color_cleaned"),
    col("Description"))\
.show(2)

"""
SELECT
    regexp_extract(Description, '(BLACK|WHITE|RED|GREEN|BLUE)', 1),
    Description
FROM 
    dfTable
"""

#4 filter rows whose Description contains the string
print("4")
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
    .filter("hasSimpleColor")\
    .select("Description")\
    .show(3, False)

"""
SELECT 
    Description
FROM 
    dfTable
WHERE
    instr(Description, 'BLACK') >= 1 OR instr(Description, 'WHITE') >= 1
"""
Example #16
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace,col,translate,regexp_extract,instr
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

df= spark.read.format("csv").option("header","true").option("inferSchema","true").load("C:/Users/Lenovo/Desktop/spark_data/retail_store.csv")
#'regexp_replace' is used to substitute the listed color names with NOCOLOR
str1="BLACK|WHITE|RED|BLUE|GREEN"
df.select(regexp_replace(col("Description"),str1,"NOCOLOR").alias("no_color_column"),col("Description")).show(5)

#the 'translate' function replaces the given characters with other characters
df.select(translate(col("Description"),"ABCD","1234"),col("Description")).show(5)

#'regexp_extract' is used to extract values
df.select(regexp_extract(col("Description"),str1,0).alias("color"),col("Description")).show(5)

#the 'instr' function checks for the existence of a substring
containsRed = instr(col("Description"), "RED") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasColor", containsWhite | containsRed).where("hasColor").select("Description").show(5)
Example #17
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, column, lit, avg, lpad, regexp_extract, instr

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.read.format('csv')\
                   .option('sep', ';')\
                   .option('header', 'true')\
                   .load('user.csv')

    df.select(lit(5), lit(5.5), lit("5.5"), lit("hola")).show()
    print(
        df.select(lit(5), lit(5.5), lit("5.5"), lit("5.5"),
                  lit("hola")).dtypes)

    df.select(lpad(lit("HELLO"), 3, " ")).show()

    df.select(col('name').isin(['jorge'])).show()
    df.select(instr(col('name'), 'Jorge')).show()
Example #18
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, column, lit, avg, monotonically_increasing_id, rand, locate, instr

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.read.format('csv')\
                   .option('sep', ';')\
                   .option('header', 'true')\
                   .load('user.csv')

    df.select(instr(col('name'), 'Jorge'), locate('Jorge', col('name'))).show()
    df.select(rand().alias("random")).where(expr("random > 0")).show()
Example #19
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    dyf_soggetti = dfc.select("Soggetti")
    dyf_contratti = dfc.select("Contratti")
    dyf_credito = dfc.select("Credito")
    dyf_prodotti = dfc.select("Prodotti")
    dyf_punti_di_fornitura = dfc.select("PuntiDiFornitura")

    #### Deduplicate subjects (soggetti)
    deduplica_soggetti = dyf_soggetti.toDF()
    windowSpec=Window.partitionBy(deduplica_soggetti.nome,deduplica_soggetti.cognome).\
    orderBy(F.col("key_soggetti").desc())
    deduplica_soggetti = deduplica_soggetti.withColumn(
        "rank",
        F.row_number().over(windowSpec)).filter(F.col("rank") == 1)

    ### Number of contracts per subject
    df_contratti_per_soggetto = dyf_contratti.toDF()
    df_contratti_per_soggetto = df_contratti_per_soggetto.groupBy(
        "key_soggetti").count()

    ### Average debt per subject
    df_debito_medio_per_cliente = dyf_credito.toDF()
    df_debito_medio_per_cliente=df_debito_medio_per_cliente.withColumn("d_importo", (F.col("importo").\
            substr(F.lit(1), F.instr(F.col("importo"), '€')-2)).cast('double'))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.groupBy(
        "key_soggetti").agg(F.mean("d_importo"))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumnRenamed(
        "avg(d_importo)", "debito_medio")

    ### Average ELE and GAS, and minimum anno_prima_attivazione_fornitura over active contracts

    df_contratti = dyf_contratti.toDF()
    df_contratti=df_contratti.withColumn("ts_data_attivazione_fornitura",F.to_timestamp (df_contratti.data_attivazione_fornitura)).\
        withColumn ("ts_data_cessazione_fornitura",F.to_timestamp (df_contratti.data_cessazione_fornitura)).\
        drop("key_punti_di_fornitura","data_cessazione_fornitura","codice_contratto","key_contratti",
            "canale_di_vendita","anno_prima_attivazione_fornitura")

    cd = F.current_timestamp()
    df_contratti=df_contratti.filter (df_contratti.ts_data_cessazione_fornitura >= cd).\
            filter (df_contratti.ts_data_attivazione_fornitura <= cd)

    df_prodotti = dyf_prodotti.toDF()
    df_prodotti=df_prodotti.withColumn("ts_data_inizio_validita",F.to_timestamp (df_prodotti.data_inizio_validita)).\
        withColumn ("ts_data_fine_validita",F.to_timestamp (df_prodotti.data_fine_validita)).\
        drop ('data_inizio_validita','data_fine_validita','key_prodotti')
    df_prodotti = df_prodotti.withColumnRenamed("nome_prodotto",
                                                "nome_commerciale")
    df_prodotti=df_prodotti.withColumn("ELE", (df_prodotti.f0+ df_prodotti.f1+ df_prodotti.f2+ df_prodotti.f3)/4).\
        drop('f0','f1','f2','f3')

    df_tariffe_contratti=df_contratti.join (df_prodotti,"nome_commerciale").\
        withColumn ("realGas",F.when (F.col('vettore')=='GAS',df_prodotti.gas).otherwise (None)).\
        withColumn ("realEle",F.when (F.col('vettore')=='ELE',df_prodotti.ELE).otherwise (None)).\
        drop ('gas','ELE')

    df_tariffe_soggetti=df_tariffe_contratti.groupby (df_tariffe_contratti.key_soggetti).\
        agg(F.min("data_attivazione_fornitura"),F.mean("realGas"),F.mean("realEle"))

    df_tariffe_soggetti=df_tariffe_soggetti.withColumnRenamed("min(data_attivazione_fornitura)","data_attivazione_fornitura").\
        withColumnRenamed("avg(realGas)","media_GAS").\
        withColumnRenamed("avg(realEle)","media_ELE")

    ## Compute the churn index and the preferred contact channel
    df_contratti = dyf_contratti.toDF()
    df_soggetti_canale_vendita=df_contratti.groupBy (df_contratti.key_soggetti,df_contratti.canale_di_vendita).\
            agg (F.count(df_contratti.canale_di_vendita))

    windowSpec=Window.partitionBy("key_soggetti").\
            orderBy(F.col("count(canale_di_vendita)").desc())

    df_soggetti_canale_vendita=df_soggetti_canale_vendita.\
        withColumn("rank",F.row_number().over (windowSpec)).filter (F.col("rank")==1)

    df_soggetti_canale_vendita = df_soggetti_canale_vendita.drop(
        'count(canale_di_vendita)', 'rank')
    df_contratti_churn = df_contratti.withColumn(
        "hadChurn",
        F.when(F.col('data_cessazione_fornitura') < cd, 1).otherwise(0))
    df_contratti_churn = df_contratti_churn.groupBy("key_soggetti").agg(
        F.sum("hadChurn"))
    df_contratti_churn=df_contratti_churn.\
        withColumn("Churn",F.when (F.col('sum(hadChurn)')>=1,1).otherwise (0)).\
        drop("sum(hadChurn)")

    ### Unpivot the regions table
    df_contratti = dyf_contratti.toDF()
    df_fornitura = dyf_punti_di_fornitura.toDF()
    df_fonitura_per_contratto=df_contratti.join(df_fornitura,"key_punti_di_fornitura").\
        groupBy ("key_soggetti","regione").\
        pivot("regione").\
        agg (F.count("key_punti_di_fornitura"))

    ### Prepare the output
    output=deduplica_soggetti.join (df_contratti_per_soggetto,"key_soggetti").\
        join(df_debito_medio_per_cliente,"key_soggetti").\
        join(df_tariffe_soggetti,"key_soggetti").\
        join(df_contratti_churn,"key_soggetti").\
        join (df_soggetti_canale_vendita,"key_soggetti").\
        join (df_fonitura_per_contratto,"key_soggetti")

    dyf_output = DynamicFrame.fromDF(output, glueContext, "output")
    return (DynamicFrameCollection({"CustomTransform0": dyf_output},
                                   glueContext))
Example #20
    def read_spark_df_from_msexchange_data_store(self, **args):
        url = args["hbase_url"]

        r = requests.get(url)

        # Converting the API response to JSON
        try:
            d = r.json()
        except:
            print("Invalid URL")
            d = []

        # Checking for data availability
        if len(d) == 0:
            print(
                "There are no events to process. Please enter a different search criteria in the url."
            )

        # Converting API data into Spark Dataframe
        print("Reading the data from profiler...")
        spark = SparkSession.builder.appName(
            'mseapi').enableHiveSupport().getOrCreate()
        sc = spark.sparkContext
        tsRDD = sc.parallelize(d)
        df_mail = spark.read.option('multiline', "true").json(tsRDD)
        total_evt_count = df_mail.count()
        print("Total number of records: " + str(total_evt_count))

        if total_evt_count > 0:
            mail_len = f.udf(lambda s: len(s), LongType())
            mail_sum = f.udf(lambda s: sum(s), LongType())
            # mail_mean  = f.udf(lambda s: round(mean(s),4), FloatType())
            # mail_stdev = f.udf(lambda s: round(stdev(s),4), FloatType())


            df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
                            .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
                            .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
                            .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
                            .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
                                                            .cast(ArrayType(IntegerType())).alias("email_size"),
                                                    f.sum("ext_sndrs").alias("ext_sndrs"))\
                            .withColumn("no_of_emails", mail_len("email_size"))\
                            .withColumn("tot_email_size", mail_sum("email_size"))\
                            .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
                            .drop("email_size")
            #.withColumn("email_size_mean", mail_mean("email_size"))\
            #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"]))>0)\
            #                 .withColumn("check", f.when(f.instr(df_mail["mail_size"],',') == 1,f.substring_index(df_mail["mail_size"],',',-1)).otherwise(df_mail["mail_size"]))\
            #                 .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                 .withColumn("mail_size", f.regexp_replace('check', ' ', ''))\
            #                 .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                 .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                         f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                 .withColumn("no_of_emails", mail_len("email_size"))\
            #                 .withColumn("tot_email_size", mail_sum("email_size"))\
            #                 .withColumn("avg_email_size", f.round(f.col("tot_email_size")/ f.col("no_of_emails"),4))\
            #                 .drop("email_size")
            #                 #.withColumn("email_size_mean", mail_mean("email_size"))\
            #                 #.withColumn("email_size_stdev", f.when(mail_len("email_size") > 1,mail_stdev("email_size")))\

            # df_mail_grp = df_mail.withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType()))\
            #                     .withColumn("mail_size", f.regexp_replace('mail_size', ' ', ''))\
            #                     .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")),',')
            #                                                     .cast(ArrayType(IntegerType())).alias("email_size"),
            #                                             f.sum("ext_sndrs").alias("ext_sndrs"))\
            #                     .withColumn("no_of_emails", mail_len("email_size"))\
            #                     .withColumn("tot_email_size", mail_sum("email_size"))\
            #                     .withColumn("email_size_mean", mail_mean("email_size"))\
            #                     .withColumn("email_size_stdev", mail_stdev("email_size"))\
            #                     .drop("email_size")
            df_mail_grp.show(3)
            return df_mail_grp

        else:
            schema = StructType([])
            sqlContext = SQLContext(sc)
            sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
            return sdf
df.select(lit(5), lit("five"), lit(5.0))


# COMMAND ----------

from pyspark.sql.functions import col
df.where(col("InvoiceNo") != 536365)\
  .select("InvoiceNo", "Description")\
  .show(5, False)


# COMMAND ----------

from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()


# COMMAND ----------

from pyspark.sql.functions import instr
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
  .where("isExpensive")\
  .select("unitPrice", "isExpensive").show(5)


# COMMAND ----------
Example #22
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
#replace the given characters with other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

#extract the color name
from pyspark.sql.functions import regexp_extract

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str,
                   1).alias("color_clean")).show(6)

#check whether the data exists
#instr
from pyspark.sql.functions import instr
containBlack = instr(col("Description"), "BLACK") >= 1
df.withColumn("HasSimpleColor",containBlack)\
  .where("HasSimpleColor")\
  .select("Description").show(15,False)

#when the number of arguments changes dynamically
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
            .cast("boolean")\
            .alias("is_" + color_string)

Example #24
                                    rank().over(spec)).withColumn(
                                        "dense_rank_val",
                                        dense_rank().over(spec)).show()

# COMMAND ----------

from pyspark.sql.functions import col, substring, substring_index, instr, split, concat_ws, repeat
from pyspark.sql.types import StringType
#substring
#orders_new_col.show()
func_df = orders_new_col.select(
    'order_status',
    substring('order_status', 1, 2).alias("sub"),
    substring_index('order_status', "E", -3).alias("sub_ind")).select(
        "*",
        instr('sub_ind', 'E').alias("instr_val"),
        split('order_status', "_")[0].alias("split_val")).select(
            "*",
            concat_ws("|", "order_status", "sub").alias("concat_val"))
func_df.withColumn("repeat_val", repeat("instr_val", 3)).select(
    "*",
    concat_ws("|", *func_df.columns).alias("conc_ws")).show(truncate=False)
#orders_new_col.select(substring_index('order_status', "_", 2)).show()
#list_1 = ["col_1", "col_2"]
#df_1 = spark.createDataFrame(list_1, StringType())
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------

#Date
from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit