Example #1
from pyspark import SparkContext

from pyspark.sql import SparkSession, types

sparkSess = SparkSession.builder.appName('post_history').getOrCreate()
sc = sparkSess.sparkContext
bdschema = types.StructType([
    types.StructField('index', types.IntegerType()),
    types.StructField('id', types.IntegerType()),
    types.StructField('creation_date', types.StringType()),
    types.StructField('post_id', types.IntegerType()),
    types.StructField('post_history_type_id', types.IntegerType()),
    types.StructField('user_id', types.IntegerType())
])

sbad = sparkSess.read.format("s3selectCSV").schema(bdschema).options(
    header="true").options(delimiter="|").options(
        quote='\"').load("s3://bigdata-4/post_history.csv").select(
            "index", "id", "creation_date", "post_id", "post_history_type_id",
            "user_id")

sbad.write.mode("append").parquet("s3://bigdata-4/post_history_new/")
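
# A quick sanity check (not part of the original snippet): read the converted
# Parquet data back with the same session and compare row counts with the CSV.
check_df = sparkSess.read.parquet("s3://bigdata-4/post_history_new/")
print(check_df.count(), sbad.count())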
    def transform(self, sources: dict) -> DataFrame:
        """
        Fact Invoice records and attributes from dataA Sources
        """

        rpttax = self.read_source(source=sources['rpt_tax'])
        inv = self.invoice_dataframe(sources['dataB_urbcrb_invoice'])
        cer = self.read_source(source=sources['currency_exchange_master'])
        cer = fixCurExchangeToAvg(self, cer)
        rsc = self.read_source(source=sources['recycle_standard_cost'])
        rmr = self.read_source(source=sources['recycle_mill_rebates'])
        exclplnt = self.read_source(source=sources['rpt_exclplnt'])
        exclbillto = self.read_source(source=sources['rpt_exclbillto'])

        salesmanexc = self.read_source(source=sources['rpt_salesmanexc'])
        salesmanexc = self.addColPrefix(salesmanexc, "salesmanexc")

        otm_shipment = self.read_source(source=sources['shipment'])
        otm_shipment_cost = self.read_source(source=sources['shipment_cost'])
        otm_shipment_refnum = self.read_source(
            source=sources['shipment_refnum'])
        otm_shipment_status = self.read_source(
            source=sources['shipment_status'])
        otm_shipment_stop = self.read_source(source=sources['shipment_stop'])
        otm_shipment_stop_remark = self.read_source(
            source=sources['shipment_stop_remark'])

        freight_rate_val = self.read_source(source=sources['rate_validation'])
        freight_rate_plant_mapping = self.read_source(
            source=sources['plant_mapping'])
        freight_rate_slr = self.read_source(
            source=sources['supplemental_lane_rates'])

        dmat = self.read_source(source=sources['dim_material'])
        dmat = self.addColPrefix(dmat, "dmat")

        df_otm_freight = self.dataB_process_otm(otm_shipment,
                                                otm_shipment_cost,
                                                otm_shipment_refnum,
                                                otm_shipment_status,
                                                otm_shipment_stop,
                                                otm_shipment_stop_remark, inv)

        df_freight_rate_estimates = self.dataB_process_std_freight_rates(
            freight_rate_val, freight_rate_plant_mapping)

        df_freight_rate_estimates_slr = self.dataB_process_std_freight_rates_slr(
            freight_rate_slr)

        rmr = self.addColPrefix(rmr, "rmr")
        rsc = self.addColPrefix(rsc, "rsc")
        cer = self.addColPrefix(cer, "cer")
        rpttax = self.addColPrefix(rpttax, "rpttax")
        df_freight_rate_estimates = self.addColPrefix(
            df_freight_rate_estimates, "fre")
        df_freight_rate_estimates_slr = self.addColPrefix(
            df_freight_rate_estimates_slr, "fre_slr")

        df = inv.select(
            'invoice_date', 'allowed_disc', 'bill_to', 'bus_unit',
            'bus_unit_name', 'channel', 'sq_ft', 'line', 'caliper',
            'charge_desc1', 'currency_code', 'curr_conv', 'extended_amount',
            'bill_to_name', 'gen_ledg', 'grade', 'grade_desc', 'invoice',
            'iptmeta_corrupt_record', 'iptmeta_extract_dttm', 'jde_bill_to',
            'jde_ship_to', 'lbs', 'qty', 'length', 'order_format', 'plant',
            'plant_name', 'salesman', 'salesman_name', 'substrate', 'ship_to',
            'ship_to_outs_cvtr', 'width', 'price_uom_desc', 'bol_number',
            'ship_to_city', 'ship_to_ship_to', 'qty_uom_desc', 'end_cust_desc',
            'form', 'trans_mode', 'rept_month', 'rept_year')

        df = df.join(salesmanexc,
                     [df.salesman == salesmanexc.salesmanexc_salesman],
                     'left_outer')
        df = df.withColumn(
            'sales_representative',
            F.coalesce(df.salesmanexc_salesman_name_override, df.salesman_name,
                       F.lit(MISSING_DESC)))
        df = df.withColumnRenamed('salesman', 'sales_rep_id')

        df = df.withColumn("billing_system", F.lit('dataB'))
        # The caliper needs to be calculated before creating the material id
        # because the caliper is used to build the material id.
        df = df.withColumn("caliper", F.col("caliper") * 1000.0)
        # remove records with caliper values >= 200
        df = self.dataB_filter_caliper(df)
        df = self.dataB_material_id(df)
        df = dataB_sale_form(df)

        df = df.withColumn("inv_date",
                           F.col('invoice_date').cast(T.DateType()))
        df = df.withColumn("inv_month", F.month(F.col('inv_date')))
        df = df.withColumn("inv_year", F.year(F.col('inv_date')))
        df = df.withColumn("invoice_period",
                           F.date_format(F.col("invoice_date"),
                                         "MMyyyy"))  #Change format to MMyyyy
        df = df.withColumn("invoice_period", (df.invoice_period.cast(
            T.StringType()))[0:6])  # invoice_period max length is 6

        rpttax = rpttax.withColumn("rpttax_plant",
                                   F.col('rpttax_plant').cast(T.IntegerType()))
        rpttax = rpttax.withColumnRenamed("rpttax_grade", "rpttax_grade_code")

        df = df.join(rsc, [
            df.plant == rsc.rsc_plant, df.grade == rsc.rsc_grade_code,
            df.caliper == rsc.rsc_caliper, df.sale_form == rsc.rsc_ship_form
        ], 'left_outer')
        df = df.join(rpttax, [
            df.grade == rpttax.rpttax_grade_code, df.plant
            == rpttax.rpttax_plant
        ], 'left_outer')
        df = df.join(rmr, [
            df.plant == rmr.rmr_plant_id, df.invoice == rmr.rmr_invoice_id,
            df.line == rmr.rmr_invoice_line_number
        ], 'left_outer')
        df = df.join(cer, [
            cer.cer_currency_code_from == df.currency_code, cer.cer_cur_year
            == df.inv_year, cer.cer_cur_month == df.inv_month
        ], 'left_outer')
        # Join for OTM to use estimates only
        df = df.join(df_otm_freight,
                     [df.bol_number == df_otm_freight.bol_number_join],
                     'left_outer')
        # This code joins the freight rates but includes OTM checks so only
        # records without OTM matches are given values
        df = df.join(df_freight_rate_estimates, [
            df.plant == df_freight_rate_estimates.fre_plant,
            F.lower(F.trim(df.ship_to_city)) == F.lower(
                F.trim(df_freight_rate_estimates.fre_dcity)),
            F.lower(F.trim(df.ship_to_ship_to)) == F.lower(
                F.trim(df_freight_rate_estimates.fre_dstate)),
            df_otm_freight.freight_rate_per_ton.isNull()
        ], 'left_outer')
        # This code joins the freight rates supplemental lanes but includes OTM checks
        # so only records that don't match OTM and the Freight Rates files are given
        # values
        df = df.join(df_freight_rate_estimates_slr, [
            df.plant == df_freight_rate_estimates_slr.fre_slr_plant,
            F.lower(F.trim(df.ship_to_city)) == F.lower(
                F.trim(df_freight_rate_estimates_slr.fre_slr_dcity)),
            F.lower(F.trim(df.ship_to_ship_to)) == F.lower(
                F.trim(df_freight_rate_estimates_slr.fre_slr_dstate)),
            df_otm_freight.freight_rate_per_ton.isNull(),
            df_freight_rate_estimates.fre_estimate_freight_rate_per_ton.isNull(
            )
        ], 'left_outer')
        # Select that includes OTM calculations
        df = df.select(
            df.allowed_disc, df.bill_to, df.bus_unit, df.bus_unit_name,
            df.charge_desc1, df.caliper, df.channel, df.width, df.length,
            df.currency_code, df.curr_conv, df.lbs, df.qty, df.material_id,
            df.sales_rep_id, df.sales_representative, df.billing_system,
            df.gen_ledg, df.extended_amount, df.grade, df.line, df.invoice,
            df.invoice_date, df.invoice_period, df.plant, df.plant_name,
            df.bill_to_name, df.ship_to_city, df.ship_to_ship_to,
            df.end_cust_desc, df.rept_month, df.rept_year, df.jde_ship_to,
            df.jde_bill_to, df.ship_to, df.ship_to_outs_cvtr, df.sq_ft,
            df.bol_number, df.trans_mode, rsc.rsc_msf, rmr.rmr_rebate_amount,
            rpttax.rpttax_end_market, rpttax.rpttax_grade_code,
            rpttax.rpttax_plant, rpttax.rpttax_product_family,
            rpttax.rpttax_product_group, rpttax.rpttax_product_name,
            rpttax.rpttax_substrate, cer.cer_conversion_rate_multiplier,
            df.price_uom_desc, df.qty_uom_desc,
            df_otm_freight.freight_rate_per_ton,
            df_freight_rate_estimates.fre_estimate_freight_rate_per_ton,
            df_freight_rate_estimates_slr.fre_slr_estimate_freight_rate_per_ton
        )

        df = self.dataB_filter_plant(df, exclplnt)
        df = self.dataB_filter_billtoname_jdebillto(df, exclbillto)

        df = df.withColumn(
            'actual_tons',
            F.coalesce((F.col('lbs') / 2000.0), F.lit(MISSING_NUMBER)))
        df = df.withColumn(
            'fx_conversion_to_usd',
            F.coalesce(
                F.when((df.currency_code == 'USD') | (df.currency_code == '')
                       | df.currency_code.isNull(), 1).otherwise(
                           df.cer_conversion_rate_multiplier.cast(
                               T.DoubleType())), F.lit(MISSING_NUMBER)))
        df = dataB_adjust_currency_fields(df)
        df = self.dataB_claims(df)
        df = self.dataB_discounts(df)

        # The following code block includes the OTM calculations for actual rates

        # Determine the approach for calculating the Freight Invoice value
        # and fill out the flag.  Includes CPU filtering.
        df = df.withColumn(
            "freight_invoice_calc",
            F.when(
                F.lower(F.trim(df.trans_mode)) == F.lit('cpu'),
                F.lit('actual_cpu')).when(df.freight_rate_per_ton.isNotNull(
                ), F.lit('actual')).when(
                    df.freight_rate_per_ton.isNull()
                    & df.fre_estimate_freight_rate_per_ton.isNotNull(),
                    F.lit('estimate')).when(
                        df.freight_rate_per_ton.isNull()
                        & df.fre_estimate_freight_rate_per_ton.isNull()
                        & df.fre_slr_estimate_freight_rate_per_ton.isNotNull(),
                        F.lit('estimate_slr')).otherwise(
                            F.lit(NOT_APPLICABLE_CODE)))

        # Using the flag fill in the freight_invoice value.
        df = df.withColumn(
            "freight_invoice",
            F.when(
                df.freight_invoice_calc == F.lit('actual_cpu'), F.lit(0)).when(
                    df.freight_invoice_calc == F.lit('actual'),
                    df.freight_rate_per_ton * df.actual_tons).when(
                        df.freight_invoice_calc == F.lit('estimate'),
                        df.fre_estimate_freight_rate_per_ton *
                        df.actual_tons).when(
                            df.freight_invoice_calc == F.lit('estimate_slr'),
                            df.fre_slr_estimate_freight_rate_per_ton *
                            df.actual_tons).otherwise(F.lit(MISSING_NUMBER)))

        df = self.dataB_freight_upcharge(df)
        df = self.dataB_gross_price(df)
        df = df.withColumn('report_month', F.lpad(df.rept_month, 2, '0'))
        df = df.withColumnRenamed('rept_year', 'report_year')
        df = df.withColumn("other_deductions", F.lit(0))
        df = self.dataB_rebates(df)
        df = df.withColumn("service_allowances", F.lit(0))
        df = df.withColumn(
            'net_price',
            F.coalesce(
                (F.col('gross_price') + F.col('discounts') + F.col('rebates') +
                 F.col('claims') + F.col('freight_upcharge') +
                 F.col('other_deductions') + F.col('service_allowances')),
                F.lit(MISSING_NUMBER)))

        df = df.withColumn('cp_channel', F.lit(0))
        df = df.withColumn('cp_mode', F.lit(0))
        df = df.withColumn('cp_sales_region', F.lit(0))

        df = df.withColumnRenamed('invoice', 'invoice_number')
        df = df.withColumnRenamed('line', 'invoice_line_number')
        df = df.withColumnRenamed('plant_name', 'invoice_location')

        df = self.dataB_invoiced_currency(df)
        df = self.dataB_sale_type(df)
        df = self.dataB_ship_from_loc_number(df)

        df = df.withColumn("invoice_dim_location_id",
                           F.expr(hash_columns(['plant'])))
        df = df.withColumn("ship_from_dim_location_id",
                           F.expr(hash_columns(['ship_from_loc_number'])))
        df = df.withColumnRenamed('rpttax_end_market', 'end_market')
        df = prime_enrich(df, quality_class_column=None)
        df = df.withColumn('sales_order_number', F.lit('0'))
        df = df.withColumn(
            "ship1_dim_material_id",
            F.expr(
                hash_columns(['billing_system', 'material_id', 'end_market'])))
        df = df.withColumn(
            'ship_dim_customer_id',
            F.expr(
                hash_columns(
                    ['billing_system', 'jde_ship_to', 'end_cust_desc'])))
        df = df.withColumn(
            'sold_dim_customer_id',
            F.expr(
                hash_columns(
                    ['billing_system', 'jde_bill_to', 'end_cust_desc'])))
        df = df.withColumn('brand_dim_customer_id', F.lit(MISSING_STRING_ID))

        # Joining the processed material dimension to retrieve the calculated nominal_basis_weight
        # value to be used to calculate nominal_tons.
        df = df.join(dmat,
                     [df.ship1_dim_material_id == dmat.dmat_dim_material_id],
                     'left_outer')
        df = self.dataB_msf(df)
        df = df.withColumn(
            'nominal_tons',
            F.coalesce(((df.dmat_nominal_basis_weight * df.msf) / 2000.0),
                       F.lit(MISSING_NUMBER)))

        df = df.withColumn(
            'subset',
            F.coalesce(
                F.when(df.rpttax_product_group.isNotNull(),
                       df.rpttax_product_group), F.lit(MISSING_DESC)))
        df = df.withColumn(
            'commercial_print_channel',
            F.coalesce(F.when(df.channel.isNotNull(), df.channel),
                       F.lit(MISSING_DESC)))
        df = df.withColumn(
            'invoice_location_number',
            F.coalesce(F.when(df.plant.isNotNull(), df.plant),
                       F.lit(MISSING_NUMBER)))

        df = df.withColumn("invoice_source_type", F.lit(NOT_APPLICABLE_CODE))
        df = df.withColumn("invoice_line_code", F.lit(NOT_APPLICABLE_CODE))
        df = df.withColumn('iptmeta_source_system', F.lit('dataB'))
        df = self.dataB_product_sold_flag(df)
        df = df.withColumn("commercial_print_mode", F.lit(NOT_APPLICABLE_DESC))
        df = df.withColumn("commercial_print_region",
                           F.lit(NOT_APPLICABLE_DESC))
        df = df.withColumnRenamed("qty", "invoice_volume")
        df = df.withColumnRenamed("qty_uom_desc", "invoice_uom_id")
        df = df.withColumn(
            'standard_cost',
            F.coalesce(
                F.when(df.rsc_msf.isNotNull(), df.rsc_msf) * df.msf,
                F.lit(MISSING_NUMBER)))
        df = df.withColumn(
            'standard_gross_margin',
            F.coalesce(
                (df.net_price - (df.standard_cost + df.freight_invoice)),
                F.lit(MISSING_NUMBER)))
        df = df.withColumn('invoice_line_desc_1', F.lit(NOT_APPLICABLE_CODE))

        df = df.select(
            df.billing_system, df.invoice_number, df.invoice_line_number,
            df.invoice_period, df.invoice_source_type, df.invoice_line_code,
            df.iptmeta_source_system, df.product_sold_flag,
            df.commercial_print_channel, df.commercial_print_mode,
            df.fx_conversion_to_usd, df.grade, df.invoice_date,
            df.ship_from_dim_location_id, df.invoiced_currency,
            df.ship1_dim_material_id, df.prime, df.sales_order_number,
            df.sale_type, df.sales_representative, df.ship_dim_customer_id,
            df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset,
            df.actual_tons, df.claims, df.discounts, df.freight_invoice,
            df.freight_invoice_calc, df.freight_upcharge, df.gross_price,
            df.msf, df.net_price, df.nominal_tons, df.other_deductions,
            df.rebates, df.service_allowances, df.standard_cost,
            df.standard_gross_margin, df.invoice_dim_location_id,
            df.commercial_print_region, df.invoice_volume, df.invoice_uom_id,
            df.bol_number, df.report_month, df.report_year, df.sales_rep_id,
            df.invoice_line_desc_1).distinct()

        return df
Example #3

def _to_stype(tpe) -> X:
    if _is_col(tpe):
        inner = as_spark_type(_get_col_inner(tpe))
        return _Column(inner)
    inner = as_spark_type(tpe)
    if inner is None:
        return _Unknown(tpe)
    else:
        return _Scalar(inner)


# First element of the list is the python base type
_base = {
    types.StringType(): [str, 'str', 'string'],
    types.BinaryType(): [bytes],
    types.ByteType(): [np.int8, 'int8', 'byte'],
    types.ShortType(): [np.int16, 'int16', 'short'],
    types.IntegerType(): [int, 'int', np.int],
    types.LongType(): [np.int64, 'int64', 'long', 'bigint'],
    types.FloatType(): [float, 'float', np.float],
    types.DoubleType(): [np.float64, 'float64', 'double'],
    types.TimestampType(): [np.datetime64],
    types.DateType(): [datetime.date],
    types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
}


def _build_type_dict():
    return dict([(other_type, spark_type) for (spark_type, l) in _base.items()
                 for other_type in l])  # closing clause reconstructed; the snippet is cut off here
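

# Illustrative lookup (not in the original): invert _base via _build_type_dict()
# and map a Python type back to its Spark SQL type.
_type_to_spark = _build_type_dict()
assert _type_to_spark[int] == types.IntegerType()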
#!/usr/bin/env python3
"""Script to convert `pg_dump` directory data into parquet data. This script
also performs transformations to make the resulting aggregates easier to query
within Spark and BigQuery."""

import click
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# This schema is an intermediate schema that is used during the conversion.
METADATA_SCHEMA = T.StructType([
    T.StructField("aggregate_type", T.StringType(), False),
    T.StructField("ds_nodash", T.StringType(), False),
    T.StructField("table_id", T.IntegerType(), False),
])

DIMENSION_SCHEMA = T.StructType([
    T.StructField("os", T.StringType()),
    T.StructField("child", T.StringType()),
    T.StructField("label", T.StringType()),
    T.StructField("metric", T.StringType()),
    T.StructField("osVersion", T.StringType()),
    T.StructField("application", T.StringType()),
    T.StructField("architecture", T.StringType()),
])

AGGREGATE_SCHEMA = T.StringType()


@click.command("pg_dump_to_parquet")
def main(inputs, output):
    comments_schema = types.StructType([
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StructType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    averages = comments.groupby('subreddit').agg(
        functions.avg(comments['score']))
    #averages.explain()
    averages.write.csv(output, mode='overwrite')
import csv
import io
import zipfile

import requests
import pandas as pd
from urllib.request import *
from pyspark.sql import SparkSession, types

import getCodeSets as codesets
spark = SparkSession.builder.master("local[*]").config(
    "spark.executor.memory",
    "70g").config("spark.driver.memory", "50g").config(
        "spark.memory.offHeap.enabled",
        True).config("spark.memory.offHeap.size", "32g").config(
            "spark.driver.maxResultSize",
            "10g").appName("Load Labour Force Data").getOrCreate()
#conf = SparkConf().setAppName('reddit etl')
#sc = SparkContext(conf=conf)

immigration_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('In_migrants', types.IntegerType(), True),
    types.StructField('Out_migrants', types.IntegerType(), True),
])
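

# Illustrative only (not shown in the original snippet): the schema above would
# typically be applied when reading the source CSV, e.g.
# migration = spark.read.csv('in_out_migration.csv', schema=immigration_schema, header=True)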


# dtype={"REF_DATE": str, "GEO": str, "DGUID":str , "Labour force characteristics":str, "Sex":str, "Age group":str, \
#"Statistics":str, "Data type":str, "UOM":str, "UOM_ID":int, "SCALAR_FACTOR":str, "SCALAR_ID":int, "VECTOR":str, "COORDINATE":str, "VALUE":str, "STATUS":str, \
#"SYMBOL":str, "TERMINATE":str, "DECIMALS":int}
def download_extract_zip(url):
    """
    Download a ZIP file and extract its contents in memory
    yields (filename, file-like object) pairs
    """
    response = requests.get(url)
    # The rest of this function is truncated in the source snippet; the lines
    # below follow the docstring: open the archive in memory and yield
    # (filename, file-like object) pairs.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        for entry in archive.infolist():
            with archive.open(entry) as file_obj:
                yield entry.filename, file_obj
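

# Hypothetical usage of the generator above (the URL is a placeholder, not from
# the original):
# for filename, file_obj in download_extract_zip('https://example.com/data.zip'):
#     print(filename)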
Example #7
        StandardScaler(inputCol="vec_tweet_count", outputCol="ss_tweet_count")
    ]
    assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
    pipeline = Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                        tweetvect + ss + assembler)
    return pipeline


if __name__ == "__main__":
    # create a SparkContext, checking whether one already exists
    try:
        sc = ps.SparkContext()
        sc.setLogLevel("ERROR")
        sqlContext = ps.sql.SQLContext(sc)
        print('Created a SparkContext')
    except ValueError:
        warnings.warn('SparkContext already exists in this scope')
    print('Retrieving Data from {}'.format(inputdir + "twitter_data.parquet"))
    df = sqlContext.read.parquet(inputdir + "twitter_data.parquet")
    reg_replaceUdf = f.udf(pre_processing, t.StringType())
    df = df.withColumn('tweet', reg_replaceUdf(df.text))
    print('Get Feature Vectors')
    pipeline = build_pipeline()
    pipelineFit = pipeline.fit(df)
    df = pipelineFit.transform(df)
    select_list = ["date_col", "features", "stock_price_col"]
    df = df.select([column for column in df.columns if column in select_list])
    print("Write to Parquet")
    df.write.parquet(outputdir + "processed_twitter_pyspark")
    sc.stop()
Example #8
import nltk
from nltk import pos_tag
from pyspark.sql import functions, types


def py_morphy(tokens):
    from nltk.corpus import wordnet as wn
    nltk.data.path.append('/home/dxiang/nltk_data')
    if not isinstance(tokens, list):
        tokens = [tokens]
    modified_tokens = []
    for token in tokens:
        modified_token = wn.morphy(token)
        if modified_token is None:
            continue
        modified_tokens.append(modified_token)
    return modified_tokens


udf_morphy = functions.udf(py_morphy,
                           returnType=types.ArrayType(types.StringType()))
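

# Illustrative usage (not part of the original): apply the UDF to a column of
# token arrays; 'tokens_df' and its 'tokens' column are assumed names.
# lemmas_df = tokens_df.withColumn('lemmas', udf_morphy(tokens_df['tokens']))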


def classify_tokens(list_tokens):
    from nltk.corpus import wordnet as wn
    nltk.data.path.append('/home/dxiang/nltk_data')
    if not isinstance(list_tokens, list):
        list_tokens = [list_tokens]
    list_token = []
    for token in list_tokens:
        tag = wn.synsets(token)[0].pos(
        )  # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a/JJ', 's', 'r', 'n', 'v'
        if tag == 'n' and pos_tag([token])[0][1] == 'NN':
            noun = wn.synsets(token)[0]
            list_hypernyms = get_parent_classes(noun)
            if token == 'food' or token == 'drink' or 'food' in list_hypernyms or 'animal' in list_hypernyms or 'fruit' in list_hypernyms or 'alcohol' in list_hypernyms or 'beverage' in list_hypernyms:
Example #9
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

wiki_schema = types.StructType([
    types.StructField('lang', types.StringType()),
    types.StructField('page', types.StringType()),
    types.StructField('times_requested', types.LongType()),
    types.StructField('bytes', types.LongType())
])


def get_date(string):
    file_name = string[string.rfind('/'):-1]
    date = file_name[file_name.find('-') + 1:file_name.rfind('-') + 3]
    return date


udf = functions.UserDefinedFunction(lambda x: get_date(x), types.StringType())


def main(in_directory, out_directory):
    wiki_data = spark.read.csv(in_directory, sep=" ",
                               schema=wiki_schema).withColumn(
                                   'filename', functions.input_file_name())
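    # The rest of main() is truncated in this example; a typical next step
    # (assumed, not from the source) would apply the udf defined above:
    # wiki_data = wiki_data.withColumn('date', udf(wiki_data['filename']))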
Example #10
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from collections import Counter
from string import punctuation
from pyspark.sql import types as tp

sid = SentimentIntensityAnalyzer()

#if you've downloaded the medium version use
#nlp = spacy.load("en_core_web_md")

#if you've downloaded the largest version use
nlp = spacy.load("en_core_web_lg")

get_twitch_schema = tp.StructType([
    tp.StructField(name='username', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='timestamp', dataType=tp.LongType(), nullable=True),
    tp.StructField(name='mex', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='engagement', dataType=tp.FloatType(), nullable=True),
    tp.StructField(name='source', dataType=tp.StringType(), nullable=True)
])


def get_sentiment(text):
    value = sid.polarity_scores(text)
    value = value['compound']
    return value
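

# Quick illustration (not in the original): VADER's compound score falls in
# [-1, 1], so this bound always holds for any text.
assert -1.0 <= get_sentiment('I love this stream!') <= 1.0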


def get_keyword(text):
    result = []
Example #11
    def list(self, provider, path=None, **kwargs):
        df_schema = T.StructType([
            T.StructField('name', T.StringType(), True),
            T.StructField('type', T.StringType(), True)
        ])

        df_empty = self.context.createDataFrame(data=(), schema=df_schema)

        md = Resource(path, provider, **kwargs)

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = md['url']
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    if os.path.isfile(fullpath):
                        obj_type = 'FILE'
                    elif os.path.isdir(fullpath):
                        obj_type = 'DIRECTORY'
                    elif os.path.islink(fullpath):
                        obj_type = 'LINK'
                    elif os.path.ismount(fullpath):
                        obj_type = 'MOUNT'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = f
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['service'] in ['hdfs', 's3a']:
                sc = self.context._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

                parsed = urnparse(md['url'])
                if md['service'] == 's3a':
                    path = parsed.path.split('/')
                    url = 's3a://' + path[0]
                    path = '/' + '/'.join(path[1:]) if len(path) > 1 else '/'

                if md['service'] == 'hdfs':
                    host_port = f"{parsed.host}:{parsed.port}" if parsed.port else parsed.hosts
                    url = f'hdfs://{host_port}'
                    path = '/' + parsed.path

                try:
                    fs = FileSystem.get(URI(url),
                                        sc._jsc.hadoopConfiguration())
                    obj = fs.listStatus(Path(path))
                except:
                    logging.error(f'An error occurred accessing {url}{path}')
                    obj = []

                lst = []
                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type = 'FILE'
                    elif obj[i].isDirectory():
                        obj_type = 'DIRECTORY'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['format'] == 'jdbc':
                # remove options from database, if any

                database = md["database"].split('?')[0]
                schema = md['schema']
                table = md['table']

                if database and table:
                    try:
                        obj = self.context.read \
                        .format('jdbc') \
                        .option('url', md['url']) \
                        .option("dbtable", table) \
                        .option("driver", md['driver']) \
                        .option("user", md['user']) \
                        .option('password', md['password']) \
                        .load()
                        info = [(i.name, i.dataType.simpleString())
                                for i in obj.schema]
                    except:
                        info = []

                    if info:
                        return self.context.createDataFrame(
                            info, ['name', 'type'])

                if md['service'] == 'mssql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM INFORMATION_SCHEMA.TABLES
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'oracle':
                    query = f"""
                            ( SELECT table_name, table_type
                             FROM all_tables
                             WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'mysql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'postgres':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema = '{schema}'
                            ) as query
                            """
                else:
                    # vanilla query ... for other databases
                    query = f"""
                                ( SELECT table_name, table_type
                                  FROM information_schema.tables
                                ) as query
                                """

                obj = self.context.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                lst = []
                for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect():
                    lst.append((x.TABLE_NAME, x.TABLE_TYPE))

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            else:
                logging.error({
                    'md':
                    md,
                    'error_msg':
                    f'List resource on service "{md["service"]}" not implemented'
                })
                return df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return df_empty
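
# Hypothetical call (names assumed; 'engine' is an instance of the class this
# method belongs to): list the objects available under a configured provider.
# listing_df = engine.list(provider='local_data')
# listing_df.show()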
Example #12
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

comments_schema = types.StructType([
    types.StructField('archived', types.BooleanType()),
    types.StructField('author', types.StringType()),
    types.StructField('author_flair_css_class', types.StringType()),
    types.StructField('author_flair_text', types.StringType()),
    types.StructField('body', types.StringType()),
    types.StructField('controversiality', types.LongType()),
    types.StructField('created_utc', types.StringType()),
    types.StructField('distinguished', types.StringType()),
    types.StructField('downs', types.LongType()),
    types.StructField('edited', types.StringType()),
    types.StructField('gilded', types.LongType()),
    types.StructField('id', types.StringType()),
    types.StructField('link_id', types.StringType()),
    types.StructField('name', types.StringType()),
    types.StructField('parent_id', types.StringType()),
    types.StructField('retrieved_on', types.LongType()),
    types.StructField('score', types.LongType()),
    types.StructField('score_hidden', types.BooleanType()),
    types.StructField('subreddit', types.StringType()),
    types.StructField('subreddit_id', types.StringType()),
    types.StructField('ups', types.LongType()),
Example #13
from pyspark.ml.feature import SQLTransformer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession, functions, types
import sys
import datetime
import numpy as np
import elevation_grid as eg
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

spark = SparkSession.builder.appName('example code').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.FloatType()),
])

DATASET = '/courses/732/tmax-test'
MODEL = 'weather-model'
DATE = datetime.date(2020, 2, 1)


def change():
    data = spark.read.csv(DATASET, schema=tmax_schema)
    data.createOrReplaceTempView('d')
Example #14
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql import types as tp

# NOTE: the snippet is truncated here; the builder chain below is a minimal
# reconstruction (the appName is assumed) so that spark_session is defined.
spark_session = SparkSession.builder \
                    .appName('parse_access_logs') \
                    .getOrCreate()

spark_session.sparkContext.addFile('parse_tool.py')

from parse_tool import parse_logs

# User logs collection
user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/")

parsed_logs = user_logs.map(parse_logs) \
                       .map(lambda parse_res : [
                          parse_res[0] + '_' + parse_res[7],
                          parse_res[3]
                       ])

schema = tp.StructType().add("user_id", tp.StringType())\
                        .add("request_id", tp.StringType())

user_log_df = spark_session.createDataFrame(parsed_logs, schema)

user_log_df_1 = user_log_df.alias("df_1")
user_log_df_2 = user_log_df.alias("df_2")

is_request_to_id = fn.udf(lambda line: line.startswith('/id'),
                          tp.BooleanType())

top_5 = user_log_df_1.groupBy(user_log_df.user_id) \
                     .count() \
                     .orderBy(fn.desc("count")) \
                     .limit(100) \
                     .join(user_log_df_2, user_log_df_1.user_id == user_log_df_2.user_id) \
Example #15
import numpy as np
from pyspark.sql import SparkSession, functions as func, types
import pyspark.mllib.stat as mllib_stat
import pyspark.mllib.linalg as mllib_linalg
import pyspark.mllib.feature as mllib_feature
import pyspark.mllib.regression as mllib_regression
import pyspark.mllib.evaluation as mllib_eval
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.tree import RandomForest


def infant_survival_mllib():
	spark = SparkSession.builder.appName('infant-survival-mllib').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.StringType()),
		('BIRTH_YEAR', types.IntegerType()),
		('BIRTH_MONTH', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('MOTHER_RACE_6CODE', types.StringType()),
		('MOTHER_EDUCATION', types.StringType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('FATHER_EDUCATION', types.StringType()),
		('MONTH_PRECARE_RECODE', types.StringType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_BMI_RECODE', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.StringType()),
		('DIABETES_GEST', types.StringType()),
		('HYP_TENS_PRE', types.StringType()),
		('HYP_TENS_GEST', types.StringType()),
		('PREV_BIRTH_PRETERM', types.StringType()),
		('NO_RISK', types.StringType()),
		('NO_INFECTIONS_REPORTED', types.StringType()),
		('LABOR_IND', types.StringType()),
		('LABOR_AUGM', types.StringType()),
		('STEROIDS', types.StringType()),
		('ANTIBIOTICS', types.StringType()),
		('ANESTHESIA', types.StringType()),
		('DELIV_METHOD_RECODE_COMB', types.StringType()),
		('ATTENDANT_BIRTH', types.StringType()),
		('APGAR_5', types.IntegerType()),
		('APGAR_5_RECODE', types.StringType()),
		('APGAR_10', types.IntegerType()),
		('APGAR_10_RECODE', types.StringType()),
		('INFANT_SEX', types.StringType()),
		('OBSTETRIC_GESTATION_WEEKS', types.IntegerType()),
		('INFANT_WEIGHT_GRAMS', types.IntegerType()),
		('INFANT_ASSIST_VENTI', types.StringType()),
		('INFANT_ASSIST_VENTI_6HRS', types.StringType()),
		('INFANT_NICU_ADMISSION', types.StringType()),
		('INFANT_SURFACANT', types.StringType()),
		('INFANT_ANTIBIOTICS', types.StringType()),
		('INFANT_SEIZURES', types.StringType()),
		('INFANT_NO_ABNORMALITIES', types.StringType()),
		('INFANT_ANCEPHALY', types.StringType()),
		('INFANT_MENINGOMYELOCELE', types.StringType()),
		('INFANT_LIMB_REDUCTION', types.StringType()),
		('INFANT_DOWN_SYNDROME', types.StringType()),
		('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', types.StringType()),
		('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', types.StringType()),
		('INFANT_BREASTFED', types.StringType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_train.csv.gz', header=True, schema=schema)

	selected_features = [
		'INFANT_ALIVE_AT_REPORT', 
		'BIRTH_PLACE', 
		'MOTHER_AGE_YEARS', 
		'FATHER_COMBINED_AGE', 
		'CIG_BEFORE', 
		'CIG_1_TRI', 
		'CIG_2_TRI', 
		'CIG_3_TRI', 
		'MOTHER_HEIGHT_IN', 
		'MOTHER_PRE_WEIGHT', 
		'MOTHER_DELIVERY_WEIGHT', 
		'MOTHER_WEIGHT_GAIN', 
		'DIABETES_PRE', 
		'DIABETES_GEST', 
		'HYP_TENS_PRE', 
		'HYP_TENS_GEST', 
		'PREV_BIRTH_PRETERM'
	]
	births_trimmed = births.select(selected_features)

	recode_dictionary = {'YNU': {'Y': 1, 'N': 0, 'U': 0}}  # Yes/No/Unknown.

	def recode(col, key):
		return recode_dictionary[key][col]

	def correct_cig(feat):
		return func.when(func.col(feat) != 99, func.col(feat)).otherwise(0)

	rec_integer = func.udf(recode, types.IntegerType())

	births_transformed = births_trimmed \
		.withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \
		.withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \
		.withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \
		.withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))

	cols = [(col.name, col.dataType) for col in births_trimmed.schema]
	YNU_cols = []
	for i, s in enumerate(cols):
		if s[1] == types.StringType():
			dis = births.select(s[0]).distinct().rdd.map(lambda row: row[0]).collect()
			if 'Y' in dis:
				YNU_cols.append(s[0])

	births.select(['INFANT_NICU_ADMISSION', 
		rec_integer('INFANT_NICU_ADMISSION', func.lit('YNU')).alias('INFANT_NICU_ADMISSION_RECODE')
	]).take(5)

	exprs_YNU = [rec_integer(x, func.lit('YNU')).alias(x) if x in YNU_cols else x for x in births_transformed.columns]
	births_transformed = births_transformed.select(exprs_YNU)
	births_transformed.select(YNU_cols[-5:]).show(5)

	# Calculate the descriptive statistics of the numeric features.
	numeric_cols = ['MOTHER_AGE_YEARS','FATHER_COMBINED_AGE',
		'CIG_BEFORE','CIG_1_TRI','CIG_2_TRI','CIG_3_TRI',
		'MOTHER_HEIGHT_IN','MOTHER_PRE_WEIGHT',
		'MOTHER_DELIVERY_WEIGHT','MOTHER_WEIGHT_GAIN'
	]
	numeric_rdd = births_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row])

	mllib_stats = mllib_stat.Statistics.colStats(numeric_rdd)

	for col, m, v in zip(numeric_cols,  mllib_stats.mean(), mllib_stats.variance()):
		print('{0}: \t{1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))

	# Calculate frequencies for the categorical variables.
	categorical_cols = [e for e in births_transformed.columns if e not in numeric_cols]
	categorical_rdd = births_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])

	for i, col in enumerate(categorical_cols):
		agg = categorical_rdd.groupBy(lambda row: row[i]).map(lambda row: (row[0], len(row[1])))
		print(col, sorted(agg.collect(), key=lambda el: el[1], reverse=True))

	# Correlation.
	corrs = mllib_stat.Statistics.corr(numeric_rdd)

	for i, el in enumerate(corrs > 0.5):
		correlated = [(numeric_cols[j], corrs[i][j]) for j, e in enumerate(el) if e == 1.0 and j != i]
		if len(correlated) > 0:
			for e in correlated:
				print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))

	# Drop most of highly correlated features.
	features_to_keep = [
		'INFANT_ALIVE_AT_REPORT', 
		'BIRTH_PLACE', 
		'MOTHER_AGE_YEARS', 
		'FATHER_COMBINED_AGE', 
		'CIG_1_TRI', 
		'MOTHER_HEIGHT_IN', 
		'MOTHER_PRE_WEIGHT', 
		'DIABETES_PRE', 
		'DIABETES_GEST', 
		'HYP_TENS_PRE', 
		'HYP_TENS_GEST', 
		'PREV_BIRTH_PRETERM'
	]
	births_transformed = births_transformed.select([e for e in features_to_keep])

	#--------------------
	# Statistical testing.

	# Run a Chi-square test to determine if there are significant differences for categorical variables.
	for cat in categorical_cols[1:]:
	    agg = births_transformed.groupby('INFANT_ALIVE_AT_REPORT').pivot(cat).count()
	    agg_rdd = agg.rdd.map(lambda row: (row[1:])).flatMap(lambda row: [0 if e == None else e for e in row]).collect()

	    row_length = len(agg.collect()[0]) - 1
	    agg = mllib_linalg.Matrices.dense(row_length, 2, agg_rdd)

	    test = mllib_stat.Statistics.chiSqTest(agg)
	    print(cat, round(test.pValue, 4))

	#--------------------
	# Machine learning.

	# Create an RDD of LabeledPoints.
	hashing = mllib_feature.HashingTF(7)

	births_hashed = births_transformed \
		.rdd \
		.map(lambda row: [list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE' else row[i] for i, col in enumerate(features_to_keep)]) \
		.map(lambda row: [[e] if type(e) == int else e for e in row]) \
		.map(lambda row: [item for sublist in row for item in sublist]) \
		.map(lambda row: mllib_regression.LabeledPoint(row[0], mllib_linalg.Vectors.dense(row[1:])))

	# Split into training and testing.
	births_train, births_test = births_hashed.randomSplit([0.6, 0.4])

	# Estimate a logistic regression model using a stochastic gradient descent (SGD) algorithm.
	LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

	# Predict the classes for our testing set.
	LR_results = (
		births_test.map(lambda row: row.label).zip(LR_Model.predict(births_test.map(lambda row: row.features)))
	).map(lambda row: (row[0], row[1] * 1.0))

	# Check how well or how bad our model performed.
	print('********************************************000')
	LR_evaluation = mllib_eval.BinaryClassificationMetrics(LR_results)
	print('********************************************001')
	print('Area under PR: {0:.2f}'.format(LR_evaluation.areaUnderPR))
	print('********************************************002')
	print('Area under ROC: {0:.2f}'.format(LR_evaluation.areaUnderROC))
	print('********************************************003')
	LR_evaluation.unpersist()

	# Select the most predictable features using a Chi-Square selector.
	selector = mllib_feature.ChiSqSelector(4).fit(births_train)

	topFeatures_train = (
		births_train.map(lambda row: row.label).zip(selector.transform(births_train.map(lambda row: row.features)))
	).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))

	topFeatures_test = (
		births_test.map(lambda row: row.label).zip(selector.transform(births_test.map(lambda row: row.features)))
	).map(lambda row: mllib_regression.LabeledPoint(row[0], row[1]))

	# Build a random forest model.
	RF_model = RandomForest.trainClassifier(data=topFeatures_train, numClasses=2, categoricalFeaturesInfo={}, numTrees=6, featureSubsetStrategy='all', seed=666)

	RF_results = (topFeatures_test.map(lambda row: row.label).zip(RF_model.predict(topFeatures_test.map(lambda row: row.features))))

	RF_evaluation = mllib_eval.BinaryClassificationMetrics(RF_results)

	print('Area under PR: {0:.2f}'.format(RF_evaluation.areaUnderPR))
	print('Area under ROC: {0:.2f}'.format(RF_evaluation.areaUnderROC))
	RF_evaluation.unpersist()

	# See how the logistic regression would perform with reduced number of features.
	LR_Model_2 = LogisticRegressionWithLBFGS.train(topFeatures_train, iterations=10)

	LR_results_2 = (
		topFeatures_test.map(lambda row: row.label).zip(LR_Model_2.predict(topFeatures_test.map(lambda row: row.features)))
	).map(lambda row: (row[0], row[1] * 1.0))

	LR_evaluation_2 = mllib_eval.BinaryClassificationMetrics(LR_results_2)

	print('Area under PR: {0:.2f}'.format(LR_evaluation_2.areaUnderPR))
	print('Area under ROC: {0:.2f}'.format(LR_evaluation_2.areaUnderROC))
	LR_evaluation_2.unpersist()
Example #16
def main(keyspace, table):
    # create dataframes for order, lineitem and part
    df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='yelp_business', keyspace=keyspace).load()
    df.cache()
    # save as Allbusinesses table
    city_review = df.select('city',
                            'review_count').groupby('city').sum().orderBy(
                                'sum(review_count)',
                                ascending=False).withColumnRenamed(
                                    'sum(review_count)', 'ttl_reviews/City')
    # set up search grid around regions in Las Vegas
    # the final cities we keep: Las Vegas, North Las Vegas, Henderson, Boulder City, las vegas
    lat, lon = 36.181271, -115.134132
    lat_range = 0.015
    lon_range = 0.015
    #DF = df.filter((df.city=='Las Vegas') | (df.city=='North Las Vegas')).select('latitude', 'longitude').orderBy('latitude')
    #DF.show()
    # save as champDF table
    DF = df.filter('latitude between {} and {}'.format(
        lat - lat_range,
        lat + lat_range)).filter('longitude between {} and {}'.format(
            lon - lon_range, lon + lon_range)).cache()
    las_vegas_df = DF.select('city',
                             'review_count').groupby('city').sum().orderBy(
                                 'sum(review_count)',
                                 ascending=False).withColumnRenamed(
                                     'sum(review_count)',
                                     'ttl_reviews/las_vegas')
    DF.cache()
    #Split categories
    cate_rdd = DF.select('categories', 'business_id').rdd.map(lambda x: x[:])
    # convert into a tuple of each category with one business_id
    categories = cate_rdd.flatMap(cate_tuple)
    # schemaString = "category business_id"
    # fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    # schema = StructType(fields)
    observation_schema = types.StructType([
        types.StructField('category', types.StringType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    # save as categoryBusiness table
    categoryDF = spark.createDataFrame(categories, observation_schema)
    categoryDF = categoryDF.withColumn("cate_count", lit(1))
    categoryDF.cache()
    categoryDF.createOrReplaceTempView("cate_restaurant")
    # Looking at all of the categories listed by frequency (each business can have multiple)
    df_cate_count = categoryDF.select(
        'category', 'cate_count').groupby('category').sum().orderBy(
            'sum(cate_count)',
            ascending=False).withColumnRenamed('sum(cate_count)', 'count')
    # filter business with categories as food or restaurants
    food_rest_df = spark.sql(
        "SELECT count(*) AS num_category_restaurants FROM cate_restaurant WHERE lower(category) LIKE '%food%' OR lower(category) LIKE '%restaurant%'"
    )
    # save table as foodbusiness
    food_rest_business = spark.sql(
        "SELECT count(category) AS num_category_restaurants, business_id FROM cate_restaurant WHERE lower(category) LIKE '%food%' OR lower(category) LIKE '%restaurant%' GROUP BY business_id"
    )
    # saved as businessFoodOnly table
    business_food_rest_df = DF.join(food_rest_business, "business_id", "right")
    business_food_rest_df.groupby('state').count()
    # convert each attributes with business_id
    attri = business_food_rest_df.select('attributes',
                                         'business_id').rdd.map(lambda x: x[:])
    attri_restaurant = attri.flatMap(lambda x: att_time_split(x))
    schema_1 = types.StructType([
        types.StructField('attributes', types.StringType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    attri_df = spark.createDataFrame(attri_restaurant, schema_1)
    # Extract dictionaries from attributes column
    attri_df2 = attri_df.rdd.map(lambda x: x[:]).flatMap(
        lambda x: dict_split(x))
    schema_2 = types.StructType([
        types.StructField('attribute', types.StringType(), True),
        types.StructField('attribute_value', types.StringType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    attri_df3 = spark.createDataFrame(attri_df2, schema_2)
    # saved as attributeFinal
    hours_rdd = business_food_rest_df.select('hours', 'business_id').rdd.map(
        lambda x: x[:]).flatMap(lambda x: att_time_split(x))
    schema_3 = types.StructType([
        types.StructField('hours', types.StringType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    # saves as hoursBusiness table
    hours_df = spark.createDataFrame(hours_rdd, schema_3)
    hours_df1 = hours_df.groupby('hours').count().orderBy('count',
                                                          ascending=False)
    # clean hours column
    hours_rdd1 = hours_df.rdd.map(lambda x: x[:]).flatMap(
        lambda x: hours_split(x))
    schema_5 = types.StructType([
        types.StructField('day', types.StringType(), True),
        types.StructField('opening_hour', types.FloatType(), True),
        types.StructField('closing_hour', types.FloatType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    # saved as openCloseBusiness table
    hours_df2 = spark.createDataFrame(hours_rdd1, schema_5)
    # most popular opening hours
    popular_hour_df = hours_df2.groupby('day', 'opening_hour').count().orderBy(
        'count', ascending=False)
    # Check-in dataset cleaning (saved as checkinAll table)
    df_checkin = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='yelp_checkin', keyspace=keyspace).load()
    df_checkin.cache()
    checkin_rdd = df_checkin.select('time', 'business_id').rdd.map(
        lambda x: x[:]).flatMap(lambda x: att_time_split(x))
    schema_5 = types.StructType([
        types.StructField('checkin', types.StringType(), True),
        types.StructField('business_id', types.StringType(), True)
    ])
    # saved as checkinCount table
    checkin_df = spark.createDataFrame(checkin_rdd, schema_5)
    # each business separated checkin hours count
    checkin_count_df = checkin_df.groupby('business_id').count().orderBy(
        'business_id',
        ascending=False).withColumnRenamed('count', 'num_checkin')
    # merge num of checkins to business df (saved as cleanBusiness table) 190 restaurants in total
    cleanBusinessDF = business_food_rest_df.join(
        checkin_count_df, 'business_id',
        'left').drop('hours', 'categories', 'attributes', 'type', 'is_open')
    df_review = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='yelp_review', keyspace=keyspace).load()
    review_lasvegas_DF = df_review.join(
        cleanBusinessDF, 'business_id',
        'right').drop(cleanBusinessDF['stars']).drop(
            'address', 'latitude', 'longitude', 'postal_code', 'review_count',
            'state', 'num_category_restaurants', 'num_checkin')
    if table == 'yelp_business_lasvegas':
        cleanBusinessDF.repartition(300).write.format(
            "org.apache.spark.sql.cassandra").options(
                table=table, keyspace=keyspace).save()
    elif table == 'yelp_review_lasvegas':
        review_lasvegas_DF.repartition(300).write.format(
            "org.apache.spark.sql.cassandra").options(
                table=table, keyspace=keyspace).save()
Example #17
    def get_cancellations(self):
        """get cancellation data, lowest dimension is at campaign level, so only
        the campaign id is parsed from label. The hotel id is not parsed because
        we are interested in which hotels that got booked in the end get cancelled,
        not those hotels that get clicked (parsing from the label for hotel id correspond
        to notion of last click attribution)

        returns: spark dataframe
        """
        def extract_aff_label(aff_name, info_type):
            """ function copied from the account_1stats, this is not ideal as this is a function
            embedded in another function
            # udf to obtain the cc, device and placement from the affiliate name
            # note that the cc from this table contains options like AOW and ROW
            # very likely there's no match here
            """
            try:
                if info_type == "cc":
                    return aff_name.split(".")[0].split("_")[1]
                elif info_type == "placement":
                    placement = aff_name.split(".")[1]
                    if placement == "LU":
                        return "localuniversal"
                    elif placement == "MR":
                        return "mapresults"
                    else:
                        return None
                elif info_type == "device":
                    device = aff_name.split(".")[2]
                    if device == "PC":
                        return "desktop"
                    elif device in ("Mobile", "Tablet"):
                        return device.lower()
                    else:
                        return None
                else:
                    return None
            except Exception:
                return None

        spark.udf.register("extract_aff_label",
                           extract_aff_label,
                           returnType=t.StringType())

        # return everything as StringType first,
        # will correct for this later on
        def extract_res_label(label):
            """function copied from the account_1stats, this is not ideal as this is a function
            embedded in another function, this is a udf to extract relevant information from label
            of reservations
            """
            temp = label.split("_")
            info_dict = {}
            for x in temp:
                data = x.split("-")
                if len(data) == 2:
                    info_dict[data[0]] = data[1]
                else:
                    if "mapresults" in x.lower():
                        info_dict["placement"] = "mapresults"
                    elif "localuniversal" in x.lower():
                        info_dict["placement"] = "localuniversal"

                    if "hotel-" in x.lower():
                        try:
                            info_dict["hotel_id"] = x.split("hotel-")[1]
                        except IndexError:
                            info_dict["hotel_id"] = None

            return info_dict

        spark.udf.register("extract_res_label",
                           extract_res_label,
                           returnType=t.MapType(t.StringType(),
                                                t.StringType()))

        aff_id = spark.table(self.affiliate_table)\
                     .where("partner_id = 423463")\
                     .selectExpr("id as affiliate_id"
                     ,"extract_aff_label(name,'cc') aff_cc"
                     ,"extract_aff_label(name,'placement') aff_placement"
                     ,"extract_aff_label(name,'device') aff_device")

        # get cancelled reservations between start_date and end_date
        cancelled_reservations = (
            spark.table(self.reservation_table).withColumn(
                "date_cancelled", f.expr("to_date(date_cancelled)")
            ).withColumnRenamed("id", "hotelreservation_id").where(
                "date_cancelled between '{start_date}' and '{end_date}'".
                format(start_date=self.start_date,
                       end_date=self.end_date)).join(
                           spark.table("fpa.device_class_lookup").select(
                               "hotelreservation_id", "device_class"),
                           on="hotelreservation_id",
                           how="inner").
            where("status not in ('fraudulent', 'test', 'unknown')").join(
                f.broadcast(aff_id), on="affiliate_id",
                how="inner").selectExpr("date_cancelled", "label",
                                        "upper(booker_cc1) booker_cc1",
                                        "hotelreservation_id", "hotel_id",
                                        "aff_cc", "roomnights",
                                        "commission_amount_euro"))

        # grab information from the label
        cancelled_reservations_label = (cancelled_reservations.withColumn(
            "label_map", f.expr("extract_res_label(label)")
        ).withColumn(
            "label_cid",
            f.expr(
                "cast(coalesce(label_map['cid'],get_cid(label_map['ucc'])) as int)"
            )).drop("label_map").cache())

        # keep only the coalesced campaign id and select the relevant columns
        can_res_cleaned = (cancelled_reservations_label.selectExpr(
            "hotelreservation_id", "to_date(date_cancelled) yyyy_mm_dd",
            "hotel_id",
            "coalesce(label_cid,get_cid(aff_cc),get_cid(booker_cc1),66) campaign_id",
            "commission_amount_euro cancelled_commission",
            "roomnights cancelled_roomnights", "1 cancellations"))

        # filter for relevant campaigns
        account_1_campaign = self.get_id_pos()
        can_res_cleaned = can_res_cleaned.join(account_1_campaign,
                                               on="campaign_id",
                                               how="inner")

        cancellations_agg = can_res_cleaned.groupBy(*self.agg_on)\
                    .agg(f.sum("cancelled_commission").alias("cancelled_commission")
                        ,f.sum("cancelled_roomnights").alias("cancelled_roomnights")
                        ,f.sum("cancellations").alias("cancellations"))

        return cancellations_agg
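Since extract_res_label is registered as a Spark SQL UDF, its parsing rules can be checked directly in SQL. The label below is a made-up illustration, not real data:

# Illustrative only: "key-value" tokens become map entries, bare tokens set the
# placement. Expected result: {ucc -> us, cid -> 12345, placement -> localuniversal}
spark.sql(
    "SELECT extract_res_label('ucc-us_cid-12345_localuniversal') AS label_map"
).show(truncate=False)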
Example #18
0
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types
spark = SparkSession.builder.appName('temp_range_sql').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(inputs, output):

    weather = spark.read.csv(inputs, schema=observation_schema)
    weather.createOrReplaceTempView('weather')

    filter_weather = spark.sql(
        "SELECT date, station, observation, value FROM weather WHERE qflag IS NULL"
    )
    filter_weather.createOrReplaceTempView('filter_weather')

    max_weather = spark.sql(
        "SELECT * FROM filter_weather WHERE observation = 'TMAX' ")
    max_weather.createOrReplaceTempView('max_weather')
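    # The original snippet stops here. A possible continuation (an assumption,
    # not part of the original): pair TMAX with TMIN readings, compute the daily
    # range per station (GHCN values are in tenths of a degree), and keep the
    # stations with the largest range on each date.
    min_weather = spark.sql(
        "SELECT date, station, value AS tmin FROM filter_weather WHERE observation = 'TMIN' ")
    min_weather.createOrReplaceTempView('min_weather')

    range_weather = spark.sql("""
        SELECT x.date, x.station, (x.value - n.tmin) / 10 AS t_range
        FROM max_weather x
        JOIN min_weather n ON x.date = n.date AND x.station = n.station
    """)
    range_weather.createOrReplaceTempView('range_weather')

    max_range = spark.sql("""
        SELECT r.date, r.station, r.t_range
        FROM range_weather r
        JOIN (SELECT date, MAX(t_range) AS max_range
              FROM range_weather GROUP BY date) m
          ON r.date = m.date AND r.t_range = m.max_range
        ORDER BY r.date, r.station
    """)
    max_range.write.csv(output, mode='overwrite')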
Example #19
0
def as_spark_type(tpe: Union[str, type, Dtype],
                  *,
                  raise_error: bool = True,
                  prefer_timestamp_ntz: bool = False) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
    if sys.version_info >= (3, 8) and LooseVersion(
            np.__version__) >= LooseVersion("1.21"):
        if (hasattr(tpe, "__origin__")
                and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
                and hasattr(tpe, "__args__")
                and len(tpe.__args__) > 1  # type: ignore[union-attr]
            ):
            # numpy.typing.NDArray
            return types.ArrayType(
                as_spark_type(
                    tpe.__args__[1].__args__[0],
                    raise_error=raise_error  # type: ignore[union-attr]
                ))

    if isinstance(tpe, np.dtype) and tpe == np.dtype("object"):
        pass
    # ArrayType
    elif tpe in (np.ndarray, ):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(
            tpe.__origin__,
            list  # type: ignore[union-attr]
    ):
        element_type = as_spark_type(
            tpe.__args__[0],
            raise_error=raise_error  # type: ignore[union-attr]
        )
        if element_type is None:
            return None
        return types.ArrayType(element_type)
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool_, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date, ):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal, ):
        # TODO: consider the precision & scale for the decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float_, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int64, "int", "int64", "long"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType or TimestampNTZType if timezone is not specified.
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampNTZType(
        ) if prefer_timestamp_ntz else types.TimestampType()

    # categorical types
    elif isinstance(tpe, CategoricalDtype) or (isinstance(tpe, str)
                                               and tpe == "category"):
        return types.LongType()

    # extension types
    elif extension_dtypes_available:
        # IntegralType
        if isinstance(tpe, Int8Dtype) or (isinstance(tpe, str)
                                          and tpe == "Int8"):
            return types.ByteType()
        elif isinstance(tpe, Int16Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int16"):
            return types.ShortType()
        elif isinstance(tpe, Int32Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int32"):
            return types.IntegerType()
        elif isinstance(tpe, Int64Dtype) or (isinstance(tpe, str)
                                             and tpe == "Int64"):
            return types.LongType()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(tpe, BooleanDtype) or (isinstance(tpe, str)
                                                 and tpe == "boolean"):
                return types.BooleanType()
            # StringType
            elif isinstance(tpe, StringDtype) or (isinstance(tpe, str)
                                                  and tpe == "string"):
                return types.StringType()

        if extension_float_dtypes_available:
            # FractionalType
            if isinstance(tpe, Float32Dtype) or (isinstance(tpe, str)
                                                 and tpe == "Float32"):
                return types.FloatType()
            elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str)
                                                   and tpe == "Float64"):
                return types.DoubleType()

    if raise_error:
        raise TypeError("Type %s was not understood." % tpe)
    else:
        return None
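A few illustrative calls, following the branches above (List from typing is assumed to be imported; the comments show what the corresponding branches return):

# Illustrative usage of as_spark_type (results follow from the branches above).
as_spark_type(int)                              # LongType()
as_spark_type(np.float32)                       # FloatType()
as_spark_type(datetime.date)                    # DateType()
as_spark_type(List[str])                        # ArrayType(StringType(), True)
as_spark_type("not-a-type", raise_error=False)  # None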
Example #20
0
        print("Usage: spark-submit tweetconsumer.py <hostname> <port> <topic>")
        print("eg. spark-submit  --packages org.apache.spark:spark-sql\
        -kafka-0-10_2.11:2.4.5 tweetconsumer.py  localhost 9092 twitter")
        sys.exit(1)

    host = sys.argv[1]
    port = sys.argv[2]
    topic = sys.argv[3]
    connect_string = host + ":" + port

    spark = SparkSession.builder.appName("TweetConsumer").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    schema = t.StructType() \
        .add("id", t.LongType()) \
        .add("full_text", t.StringType()) \
        .add("len", t.IntegerType()) \
        .add("in_reply_to_status_id", t.StringType()) \
        .add("date", t.StringType()) \
        .add("source", t.StringType()) \
        .add("likes", t.IntegerType()) \
        .add("retweet", t.IntegerType()) \
        .add("sent_by", t.StringType()) \
        .add("friend_of", t.StringType()) \
        .add("hash_tag", t.StringType()) \


    tweetsRawDF = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
Example #21
0
def read_sas_csv(path_raw_data, spark):
    """
    This function read sas file and csv, with functions in read_file1.py
    return dataframe
    """
    try:
        # df_immigration
        #print('_____df_imigration____')
        cols = [
            'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port',
            'i94mode', 'i94addr', 'i94bir', 'i94visa', 'dtadfile', 'gender',
            'airline', 'visatype'
        ]
        file = '18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'
        # TODO: redo with S3 and all the files (get_path_sas_folder parquet file)
        df_immigration = read_sas(spark, path_raw_data, file, cols)

        # df_temperature
        #print('_____df_temperature____')
        cols = ['AverageTemperature', 'City', 'Country']
        file = 'GlobalLandTemperaturesByCity.csv'
        delimiter = ','
        df_temperature = read_csv(spark, path_raw_data, file, cols, delimiter)

        # df_airport_code
        #print('_____df_airport_code____')
        file = 'airport-codes_csv.csv'
        cols = [
            'ident', 'type', 'name', 'iso_country', 'iso_region',
            'municipality', 'iata_code', 'local_code'
        ]
        delimiter = ','
        df_airport_code = read_csv(spark, path_raw_data, file, cols, delimiter)

        # df_global_airports
        #print('_____df_global_airports____')
        file = 'airports-extended.csv'
        cols = ['airport_ID', 'type', 'name', 'city', 'country', 'iata']
        delimiter = ','
        #header = False
        schema = T.StructType([
            T.StructField('airport_ID', T.IntegerType(), False),
            T.StructField('name', T.StringType(), False),
            T.StructField('city', T.StringType(), False),
            T.StructField('country', T.StringType(), False),
            T.StructField('iata', T.StringType(), False),
            T.StructField('icao', T.StringType(), False),
            T.StructField('latitude', T.StringType(), True),
            T.StructField('longitude', T.StringType(), True),
            T.StructField('altitude', T.IntegerType(), True),
            T.StructField('timezone', T.StringType(), True),
            T.StructField('dst', T.StringType(), True),
            T.StructField('tz_timezone', T.StringType(), True),
            T.StructField('type', T.StringType(), True),
            T.StructField('data_source', T.StringType(), True)
        ])
        df_global_airports = read_csv_global_airports(spark,
                                                      path_raw_data,
                                                      file,
                                                      cols,
                                                      delimiter,
                                                      schema,
                                                      header=False)

        # df_iso_country
        #print('_____df_iso_country____')
        file = 'wikipedia-iso-country-codes.csv'
        #cols = ['Country', 'Alpha_2','Alpha_3', 'Num_code', 'ISO_3166-2']
        #delimiter =','
        #file = 'wikipedia-iso-country-codes.csv'
        df_iso_country = read_csv_iso_country(spark, path_raw_data, file)

        # df_demograph
        #print('_____df_demograph____')
        file = 'us-cities-demographics.csv'
        cols = [
            'City', 'State', 'Median Age', 'Male Population',
            'Female Population', 'Total Population', 'Number of Veterans',
            'Foreign-born', 'Average Household Size', 'State Code', 'Race',
            'Count'
        ]
        delimiter = ';'
        df_demograph = read_csv(spark, path_raw_data, file, cols, delimiter)

        # df_indicator_dev
        #print('_____df_indicator_dev____')
        file = 'WDIData.csv'
        delimiter = ','
        header = False
        cols = [
            'Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
            '2015'
        ]
        df_indicator_dev = read_csv(spark, path_raw_data, file, cols,
                                    delimiter)
        return (df_immigration, df_temperature, df_airport_code,
                df_global_airports, df_iso_country, df_demograph,
                df_indicator_dev)
    except Exception as e:
        print("Unexpected error in read_sas_csv: %s" % e)
        sys.exit()
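The helpers read_sas, read_csv, read_csv_global_airports and read_csv_iso_country live in read_file1.py and are not shown. A minimal sketch of what read_csv might look like, inferred only from how it is called above (an assumption, not the original helper):

def read_csv(spark, path_raw_data, file, cols, delimiter):
    # Hypothetical sketch: read one CSV under path_raw_data and keep only the
    # requested columns.
    df = (spark.read
          .option("header", True)
          .option("delimiter", delimiter)
          .csv(path_raw_data + file))
    return df.select(*cols)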
Example #22
0
    def extract(self, catalog: Dict[str,
                                    Any]) -> Dict[str, pyspark.sql.DataFrame]:
        """Extracts the files for this job.
        The default implementation uses the inputs dict structure:
        - Files are loaded from the staging directory
        - Tables are loaded from the data-lake repository
        More complex jobs should override this method (e.g. VSAM files)
        Parameters
        ----------
        Returns
        -------
        dict
            a dict where the keys are aliases of the dataframe and the values are DataFrameReaders
        """
        self._logger.info("extract start")
        inputs: Dict[str, pyspark.sql.DataFrame] = {}
        for alias, properties in self.sources.items():
            #
            # load each source
            #
            if properties["type"] == "file":
                # get the entry from the catalog
                if properties["source"] not in catalog:
                    raise ValueError(
                        f"{properties['source']} not found in catalog")

                source: Dict[Any, Any] = catalog[properties["source"]]
                file_locations, all_files = common.utils.get_file_locations(
                    self._env["file_prefix"],
                    source["path"],
                    limit=properties.get("limit", 1),
                    sort=properties.get("sort", 'last_modified'),
                    ascending=properties.get("ascending", True))

                self._logger.debug("loading %s" % file_locations)
                self._processed_files.update(all_files)  # type: ignore
                # custom
                if source["format"] == "custom":
                    continue
                # parse text
                if (source["format"] == "txt"):
                    df_input_segments = []
                    for file_location in file_locations:
                        raw = self._spark.sparkContext.textFile(file_location)
                        footer_line = source.get("skip_footer_rows", 0)
                        if (int(source.get("skip_header_rows", 0)) > 0
                                or int(source.get("skip_footer_rows", 0)) > 0):
                            rdd = raw.zipWithIndex()\
                                .filter(lambda line_index: line_index[1] >= int(source.get("skip_header_rows", 0)))
                            if int(source.get("skip_footer_rows", 0)) > 0:
                                line_count = rdd.count() - 1
                                rdd = rdd\
                                    .filter(lambda line_index: line_index[1] <= line_count - footer_line)
                            # drop the zipWithIndex index and keep only the line text
                            rdd = rdd.map(lambda line_index: line_index[0])
                        else:
                            # no header/footer rows to skip: use the raw lines directly
                            rdd = raw

                        reader = rdd.map(lambda row: row.split(
                            source.get("delimiter", ",")))
                        # parse columns
                        positions = []
                        for column, metadata in source.get("columns",
                                                           {}).items():
                            pos = "_" + str(metadata["position"])
                            positions.append(pos)

                        reader_cols = reader.toDF().select(positions).collect()
                        fields = []
                        for column, coltype in source.get("columns",
                                                          {}).items():
                            if coltype == 'Integer':
                                Field = T.StructField(column, T.IntegerType())
                            elif coltype == 'Date':
                                Field = T.StructField(column, T.DateType())
                            elif coltype == 'Double':
                                Field = T.StructField(column, T.DoubleType())
                            else:
                                Field = T.StructField(column, T.StringType())
                            fields.append(Field)
                            schema = T.StructType(fields)
                        df_input_segment = self._spark.createDataFrame(
                            reader_cols, schema)
                        df_input_segments.append(df_input_segment)
                # parse CSV
                if (source["format"] == "csv"):
                    df_input_segments = []
                    for file_location in file_locations:
                        # TODO enhance conf to support all options
                        reader = self._spark.read\
                            .option("inferSchema", "true")\
                            .option("header", source.get("header",True))\
                            .option("quote", "\"")\
                            .option("escape", "\"")\
                            .option("multiLine", "true")\
                            .option("mode","DROPMALFORMED")\
                            .option("ignoreTrailingWhiteSpace", True)\
                            .option("ignoreLeadingWhiteSpace", True)\
                            .option("delimiter",source.get("delimiter",","))
                        if (int(source.get("skip_header_rows", 0)) > 0
                                or int(source.get("skip_footer_rows", 0)) > 0):
                            df_input_segment = common.parsers.read_csv_remove_header_footer(
                                self._spark, file_location, reader,
                                source.get("skip_header_rows", 0),
                                source.get("skip_footer_rows", 0))
                        else:
                            df_input_segment = reader.csv(file_location)
                        # if we didn't get a header, take it from the metadata and rename the columns
                        if not source.get("header", True):
                            df_input_segment = df_input_segment.toDF(
                                *source.get("columns", {}).keys())
                        # drop records that are completely null
                        df_input_segments.append(df_input_segment)

                elif source["format"] == "cobol":
                    df_input_segments = []
                    for file_location in file_locations:
                        df_input = common.parsers.read_cobol_file(
                            self._spark,
                            file_location,
                            copybook_location=source.get("copybook", None),
                            row_prefix=source.get(
                                "row_prefix", None
                            ),  # if multiline, specify start of new row
                            multiline=source.get(
                                "multiline", False
                            ),  # if has a row prefix this is multiline file
                            record_selector_field=source.get(
                                "record_selector_field", None),
                            record_types=source.get("record_types", None),
                            header_lines=source.get("skip_header_rows", 0),
                            footer_lines=source.get("skip_footer_rows", 0),
                            use_header=source.get("skip_header", False),
                            trim=source.get("trim", False))
                        df_input_segments.append(df_input)

                # reduce the input segments of multiple files to a single dataframe
                df_input = reduce(pyspark.sql.DataFrame.unionAll,
                                  df_input_segments)

                # parse any columns
                for column, metadata in source.get("columns", {}).items():
                    if metadata["type"] == "date":
                        df_input = df_input.withColumn(
                            column,
                            F.to_date(F.col(column).cast("string"),
                                      format=metadata["format"]))

            elif properties["type"] == "table":
                self._logger.debug(
                    f"checking for existing table => {properties['source']}")
                try:
                    # read from the data mart snapshot mirror files
                    df_input = common.utils.read_table_snapshot(
                        table_name=properties["source"],
                        env=self._env,
                        spark=self._spark)
                except FileNotFoundError:
                    # create an empty data frame with dummy schema
                    self._logger.debug(
                        'existing table not found, creating an empty one')

                    # first, we import the job to read its target mappings
                    # for this, we traverse all jobs to find the one that populates this target table
                    found = False
                    for importer, modname, ispkg in pkgutil.walk_packages(
                            path=jobs.__path__,
                            prefix='jobs.'):  # type: ignore
                        if not ispkg:
                            # find the job class
                            job_name = ".".join(
                                modname.split(".")[1:-1]
                            )  # take the middle part of the job package (without prefix or suffix)
                            job_module = importlib.import_module(
                                "jobs.%s.job" % job_name)
                            job_class = getattr(job_module, "Job")
                            job_target_table = getattr(job_class,
                                                       "target_table")

                            if job_target_table == properties["source"]:
                                found = True
                                break

                    if not found:
                        # there is no way to populate this table from source jobs
                        raise ValueError(
                            f"table parquet for {properties['source']} not found and we can't find a job to populate its schema"
                        )

                    job_target_mappings: List[Dict[str, Any]] = getattr(
                        job_class, "target_mappings")
                    # create schema for empty dataframe by reading the target mappings and business keys
                    metadata_columns = [
                        T.StructField("row_strt_dttm", T.TimestampType()),
                        T.StructField("row_stop_dttm", T.TimestampType()),
                        T.StructField("curr_row_flg", T.StringType())
                    ]
                    schema = T.StructType([
                        T.StructField(mapping["target"], T.StringType())
                        for mapping in job_target_mappings
                    ] + metadata_columns)
                    # add primary key
                    for colname, coltype in job_class.primary_key.items():
                        if colname not in map(lambda x: x["target"],
                                              job_target_mappings):
                            schema.add(colname, T.IntegerType())

                    df_input = self._spark.createDataFrame(
                        self._spark.sparkContext.emptyRDD(), schema)

            elif properties["type"] == "dimension":
                # this is an internal 'dimension' table. load as csv from metadata folder
                file_location = pkg_resources.resource_filename(
                    "metadata.dimension_tables",
                    f"{properties['source'].lower()}.csv")
                # load into an RDD
                reader = self._spark.read\
                    .option("inferSchema", "true")\
                    .option("header", True)\
                    .option("quote", "\"")\
                    .option("escape", "\"")\
                    .option("multiLine", "true")\
                    .option("ignoreTrailingWhiteSpace", True)\
                    .option("delimiter",",")
                df_input = reader.csv(file_location)

            inputs[alias] = df_input.alias(alias)

        self._logger.debug("extract done")

        return inputs
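For orientation, a hypothetical sources/catalog pair that this extract implementation would accept. Every name and key below is illustrative, inferred from the lookups in the method, not taken from a real job configuration:

# Hypothetical configuration shapes, inferred from the lookups in extract().
sources = {
    "invoices": {"type": "file", "source": "invoice_extract", "limit": 1},
    "dim_customer": {"type": "table", "source": "dim_customer"},
    "country_codes": {"type": "dimension", "source": "country_codes"},
}

catalog = {
    "invoice_extract": {
        "path": "landing/invoices/",
        "format": "csv",
        "delimiter": "|",
        "header": True,
        "skip_header_rows": 1,
        "skip_footer_rows": 0,
        "columns": {"invc_dt": {"position": 1, "type": "date", "format": "yyyyMMdd"}},
    },
}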
Example #23
0
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note: these are just names used in our internal schemas to build datasets *for internal tests*.
                # Further, changes in pandas internals around how datetimes are created mean that, to support
                # pandas pre-0.25, we need to explicitly specify when we want a timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )

        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))

        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {
                col: MSSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively,
        # so we build the rows directly instead of going through the pandas df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
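A minimal usage sketch; the data and schema values are illustrative only:

# Illustrative call: build a small Spark-backed dataset for a test case.
dataset = get_dataset(
    dataset_type="SparkDFDataset",
    data={"id": [1, 2, 3], "name": ["a", "b", None]},
    schemas={"spark": {"id": "IntegerType", "name": "StringType"}},
)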
Example #24
0
def infant_survival_ml():
	spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(births_train)
	test_model = model.transform(births_test)

	print(test_model.take(1))

	# Evaluate the performance of the model.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

	# Save the Pipeline definition.
	pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
	pipeline.write().overwrite().save(pipelinePath)

	# Load the Pipeline definition.
	loadedPipeline = Pipeline.load(pipelinePath)
	loadedPipeline.fit(births_train).transform(births_test).take(1)

	# Save the PipelineModel.
	modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
	model.write().overwrite().save(modelPath)

	# Load the PipelineModel.
	loadedPipelineModel = PipelineModel.load(modelPath)
	test_reloadedModel = loadedPipelineModel.transform(births_test)

	print(test_reloadedModel.take(1))
Example #25
0
def get_common_spark_testing_client(data_directory, connect):
    pytest.importorskip('pyspark')
    import pyspark.sql.types as pt
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session

    df_functional_alltypes = s.read.csv(
        path=str(data_directory / 'functional_alltypes.csv'),
        schema=pt.StructType(
            [
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]
        ),
        mode='FAILFAST',
        header=True,
    )
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean")
    )
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType(
            [
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('stint', pt.IntegerType(), True),
                pt.StructField('teamID', pt.StringType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('G', pt.IntegerType(), True),
                pt.StructField('AB', pt.DoubleType(), True),
                pt.StructField('R', pt.DoubleType(), True),
                pt.StructField('H', pt.DoubleType(), True),
                pt.StructField('X2B', pt.DoubleType(), True),
                pt.StructField('X3B', pt.DoubleType(), True),
                pt.StructField('HR', pt.DoubleType(), True),
                pt.StructField('RBI', pt.DoubleType(), True),
                pt.StructField('SB', pt.DoubleType(), True),
                pt.StructField('CS', pt.DoubleType(), True),
                pt.StructField('BB', pt.DoubleType(), True),
                pt.StructField('SO', pt.DoubleType(), True),
                pt.StructField('IBB', pt.DoubleType(), True),
                pt.StructField('HBP', pt.DoubleType(), True),
                pt.StructField('SH', pt.DoubleType(), True),
                pt.StructField('SF', pt.DoubleType(), True),
                pt.StructField('GIDP', pt.DoubleType(), True),
            ]
        ),
        header=True,
    )
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType(
            [
                pt.StructField('playerID', pt.StringType(), True),
                pt.StructField('awardID', pt.StringType(), True),
                pt.StructField('yearID', pt.IntegerType(), True),
                pt.StructField('lgID', pt.StringType(), True),
                pt.StructField('tie', pt.StringType(), True),
                pt.StructField('notes', pt.StringType(), True),
            ]
        ),
        header=True,
    )
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints']
    )
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame(
            {
                'a': np.arange(10, dtype=float),
                'b': [3.0, np.NaN] * 5,
                'key': list('ddeefffggh'),
            }
        )
    )
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [
            (float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
            for i in range(10)
        ],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame(
            {
                'a': np.arange(4, dtype=float).tolist()
                + np.random.rand(3).tolist(),
                'b': np.arange(4, dtype=float).tolist()
                + np.random.rand(3).tolist(),
                'key': list('ddeefff'),
            }
        )
    )
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
Example #26
0
def train_validation_splitting_ml():
	spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Select only the top five features.
	selector = ml_feature.ChiSqSelector(
		numTopFeatures=5,
		featuresCol=featuresCreator.getOutputCol(),
		outputCol='selectedFeatures',
		labelCol='INFANT_ALIVE_AT_REPORT'
	)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
	data_transformer = pipeline.fit(births_train)

	# Create LogisticRegression and Pipeline.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a TrainValidationSplit object.
	tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	# Fit our data to the model.
	tvsModel = tvs.fit(data_transformer.transform(births_train))
	data_test = data_transformer.transform(births_test)

	# Calculate results.
	results = tvsModel.transform(data_test)
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
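
Not part of the original example, but as a hedged follow-up: TrainValidationSplitModel keeps the refitted winner in its bestModel attribute, so lines like the following could be appended inside the function to inspect it (a sketch only):

	# Inspect the model that TrainValidationSplit selected (sketch, not in the original).
	best_model = tvsModel.bestModel
	print(best_model.coefficients)
	print(best_model.intercept)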
Example #27
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

spark = SparkSession.builder\
    .master("local[*]")\
    .appName("test.dataframe")\
    .getOrCreate()

# df = spark \
#   .readStream \
#   .format("kafka") \
#   .option("kafka.bootstrap.servers", "10.12.64.205:9092") \
#   .option("subscribe", "greetings") \
#   .load()
# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Step 1: load the data; by default it becomes a single string-typed column named value.
data = ['[{"a":1,"b":2},{"a":3,"b":4},{"a":5,"b":6},{"a":7,"b":8}]']
df = spark.createDataFrame(data, T.StringType())
df.printSchema()
df.show()

schema = T.ArrayType(
    T.StructType([
        T.StructField("a", T.IntegerType()),
        T.StructField("b", T.IntegerType())
    ]))

# Step 2: parse the column into an array type.
df = df.select(F.from_json(df["value"], schema).alias("json"))
df.printSchema()
df.show()

# Step 3: convert the column to a struct type.
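
The snippet breaks off at step 3. A plausible continuation, offered only as an assumption about what that step would do, is to explode the array into one struct per row and then flatten the struct fields into ordinary columns:

df = df.select(F.explode(df["json"]).alias("item"))
df.printSchema()
df.show()

# Flatten the struct into plain integer columns.
df.select("item.a", "item.b").show()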
Example #28
0
from pyspark.sql import SparkSession, types
from pyspark.ml import Pipeline
from pyspark.ml import classification as ml_classification
from pyspark.ml import evaluation as ml_eval
from pyspark.ml import feature as ml_feature
from pyspark.ml import tuning as tune


def hyper_parameter_optimization_ml():
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator])
	data_transformer = pipeline.fit(births_train)

	# Specify our model and the list of parameters we want to loop through.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create the CrossValidator that will do the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	cvModel = cv.fit(data_transformer.transform(births_train))

	# See if cvModel performed better than our previous model.
	data_test = data_transformer.transform(births_test)
	results = cvModel.transform(data_test)

	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

	# Parameters which the best model has.
	results = [
		([{param.name: value} for param, value in params.items()], metric)
		for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(results, key=lambda el: el[1], reverse=True)[0])
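
A hedged addition, not part of the original example: CrossValidatorModel also exposes the refitted best model directly via bestModel, so it can be applied to the test set without re-assembling the winning parameter map. Appended inside the function, that might look like:

	# Apply the best model found by the CrossValidator (sketch, not in the original).
	best_model = cvModel.bestModel
	best_model.transform(data_test) \
		.select('INFANT_ALIVE_AT_REPORT', 'probability', 'prediction') \
		.show(5)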
Example #29
0
import requests
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from zipfile import ZipFile
from pyspark.sql import SparkSession, functions, types
from io import *
import csv
import pandas as pd
from urllib.request import *
import getCodeSets as codesets
from pyspark.sql.functions import input_file_name

spark = SparkSession.builder.appName('Load Weather Data').getOrCreate()

weather_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('Year', types.StringType(), True),
    types.StructField('Month', types.StringType(), True),
    types.StructField('Mean_Max_Temp', types.StringType(), True),
    types.StructField('Mean_Max_Temp_Flag', types.StringType(), True),
    types.StructField('Mean_Min_Temp', types.StringType(), True),
    types.StructField('Mean_Min_Temp_Flag', types.StringType(), True),
    types.StructField('Mean_Temp', types.StringType(), True),
    types.StructField('Mean_Temp_Flag', types.StringType(), True),
    types.StructField('Extr_Max_Temp', types.StringType(), True),
    types.StructField('Extr_Max_Temp_Flag', types.StringType(), True),
    types.StructField('Extr_Min_Temp', types.StringType(), True),
    types.StructField('Extr_Min_Temp_Flag', types.StringType(), True),
    types.StructField('Total_Rain', types.StringType(), True),
    types.StructField('Total_Rain_Flag', types.StringType(), True),
    types.StructField('Total_Snow', types.StringType(), True),
Example #30
0
import subprocess

from pyspark.sql import types as T

# NOTE: this snippet assumes a module-level `spark` SparkSession created elsewhere.


def checkPath(path):
    proc = subprocess.Popen(['hadoop', 'fs', '-test', '-e', path])
    proc.communicate()
    if proc.returncode != 0:
        print('PATH DOES NOT EXIST')
        return False
    else:
        print('PATH EXISTS')
        return True


# Schema


infoSchema = T.StructType([
    T.StructField('vid', T.StringType(), nullable=False),
    T.StructField('liked', T.BooleanType(), nullable=False),
    T.StructField('viewed', T.BooleanType(), nullable=False),
    T.StructField('list', T.BooleanType(), nullable=False)
])

# Updaters


def likeVideo(uid, profile, vid):
    profile_path = f'hdfs:///home/users/{uid}/profiles/{profile}'
    # Check if path exists
    path_exists = checkPath(profile_path)
    if not path_exists:
        cols = ['vid', 'liked', 'viewed', 'list']
        df = spark.createDataFrame(