Example No. 1
 def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, bytes):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(F.concat(SF.lit(right),
                                          left.spark.column)))
     else:
         raise TypeError(
             "Concatenation can not be applied to %s and the given type." %
             self.pretty_name)
Example No. 2
def extract_field(df):
    text = 'bbbb'
    df = spark.createDataFrame([('A$B', 'a$b$c')], ['k', 's'])
    df.select(f.sha1(f.split(f.col('s'), r'\$')).alias('k')).withColumn(
        'k',
        f.when(
            f.col('k') == f"{text}$",
            f.concat(f.lit(f"{text}-"),
                     f.sha1(f.split(f.col('s'), r'\$').getItem(0)))).otherwise(
                         f.col('k'))).collect()
Example No. 3
def test_concat_list_with_lit(data_gen):
    array_lit = gen_scalar(data_gen)
    array_lit2 = gen_scalar(data_gen)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).select(
            f.concat(f.col('a'), f.col('b'),
                     f.lit(array_lit).cast(data_gen.data_type))))

    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).select(
            f.concat(
                f.lit(array_lit).cast(data_gen.data_type), f.col('a'),
                f.lit(array_lit2).cast(data_gen.data_type))))

    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).select(
            f.concat(
                f.lit(array_lit).cast(data_gen.data_type),
                f.lit(array_lit2).cast(data_gen.data_type))))
Example No. 4
 def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, str):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 F.concat(SF.lit(right), left.spark.column), field=left._internal.data_fields[0]
             ),
         )
     else:
         raise TypeError("Addition can not be applied to given types.")
Example No. 5
def test_concat():
    gen = mk_str_gen('.{0,5}')
    (s1, s2) = gen_scalars(gen, 2, force_no_nulls=True)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, gen).select(
            f.concat(f.col('a'), f.col('b')),
            f.concat(f.col('a'), f.col('b'), f.col('a')),
            f.concat(s1, f.col('b')), f.concat(f.col('a'), s2),
            f.concat(f.lit(None).cast('string'), f.col('b')),
            f.concat(f.col('a'),
                     f.lit(None).cast('string')),
            f.concat(f.lit(''), f.col('b')), f.concat(f.col('a'), f.lit(''))))
Example No. 6
def process_writeable_df(joined_df, date_format="yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"):
    """
    Prepares the dataframe for writing to mongo
    :param joined_df:
    :param date_format:
    :return:
    """
    df_with_parsed_dates = parse_dates(joined_df, date_format)
    df_with_id = df_with_parsed_dates.withColumn("id", f.concat(f.col('account_id'), f.lit("_"), f.col("unix_ts")))
    return df_with_id.na.drop()
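For reference, the id construction above can be reproduced on a toy DataFrame. The following is a minimal sketch of my own (not part of the original job), assuming an active SparkSession named spark:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("acct-1", "1700000000")], ["account_id", "unix_ts"])
# concat glues the account id and the epoch timestamp together with an underscore
demo.withColumn("id", f.concat(f.col("account_id"), f.lit("_"), f.col("unix_ts"))).show()
# id column: "acct-1_1700000000"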
Example No. 7
def get_hes_tables(hes_type_regex, database):
	"""return spark df of hes tables in database using supplied regex"""
	df_list_hes_tables = spark.sql(f"SHOW TABLES IN {database}")
	df_list_hes_tables = df_list_hes_tables.where(F.col('tableName').rlike(hes_type_regex+'(?=_{0,1}\d{4})'))

	table_path_col = F.concat(F.col('database'), F.lit("."),F.col('tableName'))

	df_list_hes_tables = df_list_hes_tables.select(table_path_col.alias('tablePath'),
												   extract_hes_year(hes_type_regex).alias('tableYear'))

	return df_list_hes_tables.orderBy(F.col('tableYear').asc())#.toPandas()['tablePath'].tolist()
Example No. 8
def pretty_output(context, path):
    temp_for_place = df_from_csv(path)
    pretty_result = temp_for_place.withColumn(
        "temperature_info",
        concat(col("full_address"), lit(", temp (Fahrenheit): "),
               col("temperature_f")),
    )
    pretty_result = pretty_result.select(col("temperature_info"))
    outpath = os.path.join(context.resources.savedir, "pretty_output.csv")
    df_to_csv(pretty_result, outpath)
    return outpath
Example No. 9
 def transformSalesData(self, dataframe):
     print("Inside the transformSalesData")
     try:
         dataframe = dataframe.withColumn(
             "UniqueKey",
             sf.concat(sf.col('invoiceno'), sf.lit('_'),
                       sf.col('customerid')))
         dataframe = dataframe.distinct()
         return dataframe
     except Exception as e:
         raise Exception("testRaghu") from e
Example No. 10
    def load(self, filter=None):
        self.annotate(str(self.location))
        unit_df = super().load(filter=lambda line: line[0] == 'H')

        unit_with_geocode = unit_df.withColumn(
            'geocode',
            concat(col('statefip'), col('county'), col('supdist'),
                   col('enumdist'))).select(
                       ['serial', 'geocode', 'gq', 'gqtype'])
        self.reader.tables['PersonData'].unit_with_geocode = unit_with_geocode
        return unit_df
Example No. 11
 def radd(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
     if isinstance(right, str):
         return cast(
             IndexOpsLike,
             left._with_new_scol(F.concat(
                 F.lit(right), left.spark.column)),  # TODO: dtype?
         )
     else:
         raise TypeError(
             "string addition can only be applied to string series or literals."
         )
Example No. 12
def clean_up_df(df):
    # we only want to select rows where the county has the highest population on record
    w = Window.partitionBy('county')
    df = df.withColumn('max_population', f.max('population').over(w))\
        .where(f.col('population') == f.col('max_population'))\
        .drop('max_population')

    # create an id
    df = df.withColumn('doc_id', f.concat('county', 'pollutant'))

    return df
Example No. 13
def test_doTransformation(setup):
    print("test_doTransformation Called")

    transformData = TransformSalesData()
    cleanData = transformData.doTransformation(setup)
    actual_result = cleanData.count()

    setup = setup.withColumn("UniqueKey",sf.concat(sf.col('invoiceno'),sf.lit('_'), sf.col('customerid')))
    setup = setup.distinct()
    expected_count = setup.count()
    assert actual_result == expected_count
Example No. 14
def __boss_report(df_0, df_1, df_7):
    """ BOSS revenue report """
    ## join conditions for the day-over-day and week-over-week comparisons
    condition_0 = (F.coalesce(F.col("t_0.channelName"), F.lit("123")) == F.coalesce(F.col("t_1.channelName"), F.lit("123")))
    condition_1 = (F.coalesce(F.col("t_0.packageName"), F.lit("123")) == F.coalesce(F.col("t_1.packageName"), F.lit("123")))
    condition_2 = (F.col("t_0.id_1") == F.col("t_1.id_1"))
    condition_3 = (F.coalesce(F.col("t_0.channelName"), F.lit("123")) == F.coalesce(F.col("t_7.channelName"), F.lit("123")))
    condition_4 = (F.coalesce(F.col("t_0.packageName"), F.lit("123")) == F.coalesce(F.col("t_7.packageName"), F.lit("123")))
    condition_5 = (F.col("t_0.id_1") == F.col("t_7.id_1"))
    ## day-over-day join condition
    conditions_0_1 = condition_0 & condition_1 & condition_2
    ## week-over-week join condition
    conditions_0_7 = condition_3 & condition_4 & condition_5
    ## final report
    report = (df_0.alias("t_0")
              .join(df_1.alias("t_1"), conditions_0_1, "left_outer")
              .join(df_7.alias("t_7"), conditions_0_7, "left_outer")
              .select(F.regexp_replace(F.lit(__str_dt_0), "-", "").cast("int").alias("date"),
                      F.col("t_0.channelName").alias("channelName"),
                      F.col("t_0.packageName").alias("packageName"),
                      F.col("t_0.id_1").alias("id_1"),
                      F.col("t_0.sellCount").alias("sellCount"),
                      F.concat(F.round((F.col("t_0.sellCount") / F.col("t_1.sellCount") - 1) * 100, 2), F.lit("%")).alias("sellCountCompareDay"),
                      F.concat(F.round((F.col("t_0.sellCount") / F.col("t_7.sellCount") - 1) * 100, 2), F.lit("%")).alias("sellCountCompareWeek"),
                      F.col("t_0.moneyCount").alias("moneyCount"),
                      F.concat(F.round((F.col("t_0.moneyCount") / F.col("t_1.moneyCount") - 1) * 100, 2), F.lit("%")).alias("moneyCountCompareDay"),
                      F.concat(F.round((F.col("t_0.moneyCount") / F.col("t_7.moneyCount") - 1) * 100, 2), F.lit("%")).alias("moneyCountCompareWeek")))
    return report
Example No. 15
def glean_3(invoice_df):
    invoices = invoice_df.withColumn('invoice_year',
                                     funcs.year('invoice_date'))
    invoices = invoices.withColumn('invoice_month',
                                   funcs.month('invoice_date'))
    invoices = invoices.withColumn('invoice_quarter',
                                   funcs.quarter('invoice_date'))
    invoices = invoices.groupBy([
        'canonical_vendor_id', 'invoice_year', 'invoice_month',
        'invoice_quarter'
    ]).agg({
        'total_amount': 'sum',
        'invoice_date': 'max'
    }).sort('canonical_vendor_id', 'invoice_year', 'invoice_month')
    days = lambda i: i * 86400

    w = (Window.partitionBy('canonical_vendor_id').orderBy(
        funcs.col('max(invoice_date)').cast('long')).rangeBetween(
            -days(365), 0))

    invoices = invoices.withColumn('rolling_average_12m',
                                   funcs.avg("sum(total_amount)").over(w))

    glean3 = invoices[
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 6)
         & (invoices['sum(total_amount)'] < 1000) &
         (invoices['sum(total_amount)'] >= 100)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 3)
         & (invoices['sum(total_amount)'] < 10000) &
         (invoices['sum(total_amount)'] >= 1000)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] *
          1.5) & (invoices['sum(total_amount)'] >= 10000))]

    glean3 = glean3.withColumn(
        'dollar_dif',
        funcs.col('sum(total_amount)') - funcs.col('rolling_average_12m'))
    glean3 = glean3.withColumn(
        'percent_dif',
        funcs.round(100 * funcs.col('dollar_dif') /
                    funcs.col('rolling_average_12m')))
    glean3 = glean3.withColumn('glean_location', funcs.lit('vendor'))
    glean3 = glean3.withColumn("glean_type",
                               funcs.lit('large_month_increase_mtd'))
    glean3 = glean3.withColumn(
        "glean_text",
        funcs.concat(funcs.lit('Monthly spend with '),
                     funcs.col('canonical_vendor_id'), funcs.lit(' is $'),
                     funcs.col('dollar_dif'), funcs.lit(' ('),
                     funcs.col('percent_dif'),
                     funcs.lit('%) higher than average')))

    glean3 = glean3.withColumn("invoice_id", funcs.lit('n/a'))
    glean3 = glean3.withColumn('glean_date', funcs.col('max(invoice_date)'))
    return glean3
Example No. 16
 def assign_sk(self, df: DataFrame, orderByCol: str):
     now = datetime.now()  # current date and time
     fmt = '%y%m%d%H'
     yymmddhh = now.strftime(fmt)
     df_with_row_num = df.withColumn(
         "row_num",
         row_number().over(Window.orderBy(col(orderByCol))))
     sk_df = df_with_row_num.select(
         concat(lit(yymmddhh), lpad(col("row_num"), 5,
                                    "0")).cast("long").alias("sys_sk"),
         col("*")).drop(col("row_num"))
     return sk_df
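To illustrate the surrogate-key format produced above (the current yymmddhh timestamp plus a zero-padded row number, cast to long), here is a small self-contained sketch of my own that applies the same expressions outside the class:

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, lpad, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("b",), ("a",)], ["name"])
yymmddhh = datetime.now().strftime('%y%m%d%H')  # e.g. '24060112'
sk_df = (df.withColumn("row_num", row_number().over(Window.orderBy(col("name"))))
           .select(concat(lit(yymmddhh), lpad(col("row_num"), 5, "0")).cast("long").alias("sys_sk"), col("*"))
           .drop("row_num"))
sk_df.show()  # sys_sk values look like 2406011200001, 2406011200002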
Example No. 17
 def generateAddressColumn(self):
     ''' Purpose : Create a new "Address" column by combining Place Name, County, City, State and Zip
         Argument : Class Object
         Output : Returns the dataframe with the new column
     '''
     self.org_df = self.org_df.withColumn(
         'Address',
         concat(col('Place Name'), lit(', '), col('County'), lit(', '),
                col('City'), lit(', '), col('State'), lit(', '),
                col('Zip')))
     return self.org_df
Example No. 18
def format_columns(experiments_df):
    experiments_df = experiments_df.withColumn(
        "weight",
        when(col("weight").like("%.%"),
             col("weight")).otherwise(concat(col("weight"), lit(".0"))),
    )

    date_columns = [
        "date_of_birth", "weight_date", "date_of_experiment", "time_point"
    ]

    for column in date_columns:
        experiments_df = experiments_df.withColumn(
            column,
            when(
                col(column).rlike(
                    "[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z"),
                col(column),
            ).otherwise(concat(col(column), lit("T00:00:00Z"))),
        )
    return experiments_df
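The when/rlike/otherwise + concat pattern used for the date columns above can be seen in isolation in the following sketch (mine, not from the source), which appends T00:00:00Z only to bare dates:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, when

spark = SparkSession.builder.getOrCreate()
dates = spark.createDataFrame([("2021-03-01",), ("2021-03-01T08:30:00Z",)], ["date_of_birth"])
normalized = dates.withColumn(
    "date_of_birth",
    when(
        col("date_of_birth").rlike("[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z"),
        col("date_of_birth"),
    ).otherwise(concat(col("date_of_birth"), lit("T00:00:00Z"))),
)
normalized.show(truncate=False)  # the bare date gains a T00:00:00Z suffix; the full timestamp is unchanged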
Example No. 19
    def dataB_material_id(df: DataFrame) -> DataFrame:
        '''
        Calculates the material id with a specific rule for the data
        frame supplied.
        '''

        data_frame = df.withColumn(
            'material_id',
            F.concat(F.col("grade"), F.lit("_"), F.col("caliper"), F.lit("_"),
                     F.format_number(F.col("width"), 3), F.lit("_"),
                     F.format_number(F.col("length"), 3)))
        return data_frame
Example No. 20
 def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, str):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 F.concat(left.spark.column, SF.lit(right)), field=left._internal.data_fields[0]
             ),
         )
     elif isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
         return column_op(F.concat)(left, right)
     else:
         raise TypeError("Addition can not be applied to given types.")
Example No. 21
def column_revalue(vcf):
    # the INFO values still need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
Example No. 22
 def add_embryo_life_stage_acc(self, specimen_df: DataFrame):
     """
     Adds life stage to embryo specimen dataframe.
     """
     efo_acc_udf = udf(self.resolve_embryo_life_stage, StringType())
     specimen_df = specimen_df.withColumn(
         "developmental_stage_acc", efo_acc_udf("_stage")
     )
     specimen_df = specimen_df.withColumn(
         "developmental_stage_name", concat(lit("embryonic day "), col("_stage"))
     )
     return specimen_df
Example No. 23
 def _transform(self, df):
     df = df \
         .withColumn('rating', col('rating').cast(IntegerType())) \
         .dropna(subset=('rating', 'first_5_sentences', 'last_5_sentences')) \
         .withColumn('rating_class', ratingToClassUdf(col("rating")).cast(IntegerType())) \
         .withColumn('sentences', concat(
             col('first_5_sentences'),
             lit(' '),
             col('last_5_sentences')
         )) \
         .select('link', 'rating', 'rating_class', 'sentences')
     return df
Example No. 24
    def __radd__(self, other):
        # Handle 'literal' + df['col']
        if not isinstance(self.spark.data_type, StringType) and isinstance(other, str):
            raise TypeError("string addition can only be applied to string series or literals.")

        if isinstance(self.spark.data_type, StringType):
            if isinstance(other, str):
                return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
            else:
                raise TypeError("string addition can only be applied to string series or literals.")
        else:
            return column_op(Column.__radd__)(self, other)
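The string branch of __radd__ boils down to prepending a literal with F.concat. A plain-DataFrame analogue (my own sketch, assuming a SparkSession named spark) looks like this:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
words = spark.createDataFrame([("cat",), ("dog",)], ["word"])
# literal first, column second, mirroring F.concat(F.lit(other), self.spark.column)
words.select(F.concat(F.lit("my_"), F.col("word")).alias("word")).show()  # my_cat, my_dog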
Example No. 25
 def prefix_3i_experiment_ids(self, dcc_df: DataFrame) -> DataFrame:
     """
     In order to avoid collisions between the experiment IDs coming from 3i and the ones we generate for the IMPC data,
     we add a prefix to the 3i experiment IDs.
     """
     return dcc_df.withColumn(
         "_experimentID",
         when(
             (dcc_df["_dataSource"] == "3i"),
             concat(lit("3i_"), dcc_df["_experimentID"]),
         ).otherwise(dcc_df["_experimentID"]),
     )
Example No. 26
def main():
    spark = SparkSession \
        .builder \
        .appName("PythonSparkStreamingKafka_RM_01") \
        .getOrCreate()
    # Create DataFrame representing the stream of input lines from connection to localhost:9999
    # Subscribe to 1 topic
    df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", "localhost:9092") \
      .option("subscribe", "cdr-data") \
      .load()

    #Kafka streams from source are as "key":"value"..etc.
    df.printSchema()

    #Select key:value and discard others
    t = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    # Split the value and make a dataframe with right column names
    t1 = t.select("value")
    t1.printSchema()
    split_col = functions.split(t['value'], ',')
    t1 = t1.withColumn('SourcePN', split_col.getItem(0))
    t1 = t1.withColumn('DestPN', split_col.getItem(1))
    t1 = t1.withColumn('Location', split_col.getItem(2))
    t1 = t1.withColumn('startT', split_col.getItem(3))
    t1 = t1.withColumn('endT', split_col.getItem(4))
    t1 = t1.withColumn('Type', split_col.getItem(5))
    t1 = t1.withColumn('Company', split_col.getItem(6))
    t1 = t1.drop('value')

    #Start Counting per second !!
    count_df = t1.groupBy("startT", "Location").count()
    count_df = count_df.withColumn(
        'key',
        functions.concat(functions.col("startT"), functions.lit(','), functions.col("Location")))
    #Kafka sink reads key:value.Make dataframe kafka stream sink compatible
    count_df = count_df.selectExpr("key", "count as value")

    #Send data to kafka sink
    query = count_df \
        .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
        .writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("topic", "my-topic") \
        .option("checkpointLocation", "/home/ubuntu/insight-project-arnab/test/") \
        .outputMode("complete") \
        .start()

    query.awaitTermination()
Example No. 27
def shortest_path(v_from, v_to, df, max_path_length=10):
    """
        v_from - source vertex
        v_to - target vertex
        df - Spark DataFrame with the edges of the graph
        max_path_length - maximum path length

        Returns: a pyspark.sql.DataFrame with a single column containing the paths that were found
    """
    temp_df = df.filter(df.follower_id == v_from)
    temp_df = temp_df.select(
        f.col('user_id').alias('last_neighbour'),
        f.col('follower_id').alias('path'))

    for i in range(max_path_length):
        if temp_df.filter(temp_df.last_neighbour.isin(v_to)).count() > 0:
            result_df = temp_df.filter(temp_df.last_neighbour.isin(v_to))\
                               .select(f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
            return result_df
        temp_df = temp_df.join(df, temp_df.last_neighbour==df.follower_id, how="inner",)\
                         .select(f.column('user_id').alias('last_neighbour'),
                                 f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
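A usage sketch of my own for the function above, assuming a SparkSession named spark and pyspark.sql.functions imported as f (as the function body requires):

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
# each row is one edge: follower_id -> user_id
edges = spark.createDataFrame([(1, 2), (2, 3), (3, 4)], ["follower_id", "user_id"])
paths = shortest_path(1, 4, edges, max_path_length=5)
paths.show(truncate=False)  # expected single row: path = 1,2,3,4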
Example No. 28
def do_join(dataframe, col1, col2, sep, newCol):
    if col1 not in dataframe.columns:
        logging.warning("no such column {}".format(col1))
        return dataframe
    if col2 not in dataframe.columns:
        logging.warning("no such column {}".format(col2))
        return dataframe
    logging.info("joining {0} and {1} into {2}".format(col1, col2, newCol))

    df1 = dataframe.withColumn(col1, concat(col1, lit(sep), col2))
    df2 = df1.withColumnRenamed(col1, newCol)
    df1 = df2.drop(col2)
    return df1
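A brief usage sketch (mine, assuming a SparkSession named spark and the same concat/lit imports the snippet relies on):

import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("Ada", "Lovelace")], ["first", "last"])
# joins "first" and "last" with a space and exposes the result as "full_name"
do_join(people, "first", "last", " ", "full_name").show()  # -> Ada Lovelace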
Example No. 29
  def vwap(self, frequency='m',volume_col = "volume", price_col = "price"):
        # set pre_vwap as self or enrich with the frequency
        pre_vwap = self.df
        if frequency == 'm':
            pre_vwap = self.df.withColumn("time_group", f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0'), f.lit(':'),
                                                               f.lpad(f.minute(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'H':
            pre_vwap = self.df.withColumn("time_group", f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'D':
            pre_vwap = self.df.withColumn("time_group", f.concat(f.lpad(f.dayofmonth(f.col(self.ts_col)), 2, '0')))

        group_cols = ['time_group']
        if self.partitionCols:
          group_cols.extend(self.partitionCols)
        vwapped = ( pre_vwap.withColumn("dllr_value", f.col(price_col) * f.col(volume_col))
                            .groupby(group_cols)
                            .agg( f.sum('dllr_value').alias("dllr_value"),
                                  f.sum(volume_col).alias(volume_col),
                                  f.max(price_col).alias("_".join(["max", price_col])) )
                            .withColumn("vwap", f.col("dllr_value") / f.col(volume_col)) )

        return TSDF( vwapped, self.ts_col, self.partitionCols )
Example No. 30
def process_zip_code_state_csv(zip_code_state_csv_arg):
    zip_code_states_df = spark.read.csv(zip_code_state_csv_arg, header=True)

    zip_code_states_df = zip_code_states_df \
        .withColumn('state', functions.lower(zip_code_states_df['state'])) \
        .withColumn('county', functions.lower(zip_code_states_df['county']))
    zip_code_states_df = zip_code_states_df.withColumn(
        'combine',
        functions.concat(zip_code_states_df['county'],
                         zip_code_states_df['state']))
    zip_code_states_df = zip_code_states_df.select('zip_code', 'county', 'state', 'combine') \
        .filter(zip_code_states_df['state'].isin(required_states_abbr))
    return zip_code_states_df
Example No. 31
def crs_time_to_iso(station_time):
  hour = station_time[0:2]
  minute = station_time[2:4]
  if int(hour) == 24:
    hour = "23"
    minute = "59"
  iso_time = "{hour}:{minute}:00".format(
    hour=hour,
    minute=minute
  )
  return iso_time

extract_time_udf = udf(crs_time_to_iso, StringType())

trimmed_flights = on_time_dataframe.select(
  "FlightNum",
  concat("FlightDate", lit("T"), extract_time_udf("CRSDepTime")).alias("CRSDepDatetime"),
  concat("FlightDate", lit("T"), extract_time_udf("CRSArrTime")).alias("CRSArrDatetime"),
  "FlightDate",
  "Origin",
  "Dest",
)

import iso8601
from datetime import timedelta
def increment_arrival_date(departure, arrival):
  """Handle overnight flights by incrementing the arrival date if a flight arrives earlier than it leaves"""
  d_dt = iso8601.parse_date(departure)
  a_dt = iso8601.parse_date(arrival)
  if a_dt.time() < d_dt.time():
    a_dt = a_dt + timedelta(days=1)
  return a_dt.isoformat()
Example No. 32
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, concat, concat_ws, regexp_replace

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: trove-load.py <input json> <output parquet>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Trove Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.json(sys.argv[1])
    df = raw.na.drop(subset=['id', 'fulltext']).dropDuplicates(['id'])
    df.select(concat(lit('trove/'), df.id).alias('id'),
              concat_ws('/', lit('trove'), df.titleId, df.date).alias('issue'),
              concat(lit('trove/'), df.titleId).alias('series'),
              df.date, df.firstPageId, df.firstPageSeq.cast('int').alias('seq'),
              df.heading.alias('title'), df.category,
              regexp_replace(regexp_replace(df.fulltext, '&', '&amp;'),
                             '<', '&lt;').alias('text'))\
      .write.save(sys.argv[2])

    sc.stop()
Example No. 33
def txtToPq_v2(inputFolder, pqFolder, pqFileName, searchString = "*.txt", append = True):
    """
    Read in all txt files in a folder, convert to parquet, and either append to an existing parquet or create a new one.
    This version is compatible with some of the v1.1 files inside s3://flight.price.11
    Main difference: leg1 is renamed to leg1
    @params:
        inputFolder   - Required  : input folder that contains json line txt files (Str)
        pqFolder      - Required  : folder to save the parquet files into (Str)
        pqFileName    - Required  : parquet file name (Str)
        append        - Optional  : append to existing parquet or create new parquet (Bool)
        searchString  - Optional  : search string that identifies all the json line text files (Str)
    """
    
    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))
    
    flightv1_1_2 = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                            .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))                    
                            .withColumn('depDate', to_date('depDate'))
                            .withColumn('searchDate', to_date('searchDate'))
                            .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
                            .withColumn('airline_code', flightv1_1.leg1.carrierSummary.airlineCodes.getItem(0))                   
                            .withColumn('airline_codes', flightv1_1.leg1.carrierSummary.airlineCodes)                    
                            .withColumn('airline_codes_leg2', flightv1_1.leg2.carrierSummary.airlineCodes)                    
                            .withColumn('departureTime', flightv1_1.leg1.departureTime.isoStr)
                            .withColumn('departureTime_leg2', flightv1_1.leg2.departureTime.isoStr)
                            .withColumn('arrivalTime', flightv1_1.leg1.arrivalTime.isoStr)
                            .withColumn('arrivalTime_leg2', flightv1_1.leg2.arrivalTime.isoStr)
        #                 .withColumn('check_bag_inc', flightv1_1.leg1.arrivalTime)
                            .withColumn('airlineName', flightv1_1.leg1.carrierSummary.airlineName)
                            .withColumn('airlineName_leg2', flightv1_1.leg2.carrierSummary.airlineName)
                            .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime', format=timeFmt))/60)                    
                        .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)                    
        #                     .withColumn('duration', flightv1_1.timeline1.getItem(1).duration)
                        .withColumn('airlineCode', flightv1_1.timeline1.getItem(0).carrier.airlineCode)
                        .withColumn('flightNumber', flightv1_1.timeline1.getItem(0).carrier.flightNumber.cast('string'))                
                        .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                        .drop('airlineCode', 'flightNumber')
                        .withColumn('plane', flightv1_1.timeline1.getItem(0).carrier.plane)                
                        .withColumn('stops', flightv1_1.leg1.stops.cast('byte'))                                
                        .withColumn('stops_leg2', flightv1_1.leg2.stops.cast('byte'))                

        #                 .withColumn('stop_list', flightv1_1.leg1.stop_list)# need to do more work                
                        .withColumn('stop_airport', take_all_level1_str(flightv1_1.leg1.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration', take_all_level1_str(flightv1_1.leg1.stop_list, lit('duration')))                                               

        #                 .withColumn('stop_list_leg2', flightv1_1.leg2.stop_list)               
                        .withColumn('stop_airport_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('duration')))                                               


                        .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.leg1.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))                
                        .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.leg2.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
                        .withColumn('fromCityAirportCode', flightv1_1.leg1.departureLocation.airportCode)                
                        .withColumn('toCityAirportCode', flightv1_1.leg1.arrivalLocation.airportCode)
                        .withColumn('fromCityAirportCode_leg2', flightv1_1.leg2.departureLocation.airportCode)
                        .withColumn('toCityAirportCode_leg2', flightv1_1.leg2.arrivalLocation.airportCode)

                        # carrier leg 1
                        .withColumn('carrierAirProviderId', flightv1_1.leg1.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName', flightv1_1.leg1.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass', flightv1_1.leg1.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop', flightv1_1.leg1.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival', flightv1_1.leg1.carrierSummary.nextDayArrival)

                        # carrier leg 2
                        .withColumn('carrierAirProviderId_leg2', flightv1_1.leg2.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.leg2.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass_leg2', flightv1_1.leg2.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop_leg2', flightv1_1.leg2.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival_leg2', flightv1_1.leg2.carrierSummary.nextDayArrival)

                        ### Leg 1
                        ## Leg 1 departure
        #                 .withColumn('timeline_departureAirport', take_all_airport(flightv1_1.timeline1, lit('departureAirport')))                               
                        .withColumn('timeline_departureAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime', take_all_level2_str(flightv1_1.timeline1, lit('departureTime'), lit('isoStr')))



                        ## Leg 1 arrival
                        .withColumn('timeline_arrivalAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime', take_all_level2_str(flightv1_1.timeline1, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance', take_all_level2_str(flightv1_1.timeline1, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane', take_all_level2_str(flightv1_1.timeline1, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName', take_all_level1_str(flightv1_1.timeline1, lit('brandedFareName')))                               

                        # type
                        .withColumn('timeline_type', take_all_level1_str(flightv1_1.timeline1, lit('type')))                               

                        ### Leg 2
                        ## Leg 2 departure
                        .withColumn('timeline_departureAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureTime'), lit('isoStr')))                


                        ## Leg 2 arrival
                        .withColumn('timeline_arrivalAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance_leg2', take_all_level2_str(flightv1_1.timeline2, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane_leg2', take_all_level2_str(flightv1_1.timeline2, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName_leg2', take_all_level1_str(flightv1_1.timeline2, lit('brandedFareName')))                           

                        # type
                        .withColumn('timeline_type_leg2', take_all_level1_str(flightv1_1.timeline2, lit('type')))                               

                        # create variables droppped from v1.0
                        .withColumn('span_days', lit(99))
                        .withColumn('power', lit(False))
                        .withColumn('video', lit(False))
                        .withColumn('wifi', lit(False))
                        .withColumn('stop_info', col('stop_airport')) #placeholder. can't figure out how to create struct literal


                        .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
                                'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                                'stayDays', 
                               'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
                                'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',                
                                'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2',
                                'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
                               'airline_code', 'airline_codes', 'airline_codes_leg2', 
                                'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2',
                               'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival',
                                'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2',
                                #'url',

                                ## leg 1
                                # departure
                                'timeline_departureAirport_cityState', 'timeline_departureAirport_city', 'timeline_departureAirport_code', 'timeline_departureAirport_localName', 
                                'timeline_departureAirport_longName', 'timeline_departureAirport_name',

                                'timeline_departureTime',

                                # arrival
                                'timeline_arrivalAirport_cityState', 'timeline_arrivalAirport_city', 'timeline_arrivalAirport_code', 'timeline_arrivalAirport_localName', 
                                'timeline_arrivalAirport_longName', 'timeline_arrivalAirport_name',

                                'timeline_arrivalTime',

                                'timeline_distance',
                                'timeline_plane',
                                'timeline_brandedFareName',
                                'timeline_type',

                                ## leg 2                        
                                # departure
                                'timeline_departureAirport_cityState_leg2', 'timeline_departureAirport_city_leg2', 'timeline_departureAirport_code_leg2', 'timeline_departureAirport_localName_leg2', 
                                'timeline_departureAirport_longName_leg2', 'timeline_departureAirport_name_leg2',

                                'timeline_departureTime_leg2',

                                # arrival
                                'timeline_arrivalAirport_cityState_leg2', 'timeline_arrivalAirport_city_leg2', 'timeline_arrivalAirport_code_leg2', 'timeline_arrivalAirport_localName_leg2', 
                                'timeline_arrivalAirport_longName_leg2', 'timeline_arrivalAirport_name_leg2',

                                'timeline_arrivalTime_leg2',

                                'timeline_distance_leg2',
                                'timeline_plane_leg2',
                                'timeline_brandedFareName_leg2',
                                'timeline_type_leg2',

                                # variables dropped from v1.0
                                'span_days', 'power', 'video', 'wifi', 'stop_info'
                               )                
                       )


    if append:
        flightv1_1_2.repartition(1).write.mode('append').parquet(os.path.join(pqFolder, pqFileName))        
    else:
        flightv1_1_2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))   
Example No. 34
# MAGIC 
# MAGIC Let's create a new DataFrame from `wordsDF` by performing an operation that adds an 's' to each word.  To do this, we'll call the [`select` DataFrame function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select) and pass in a column that has the recipe for adding an 's' to our existing column.  To generate this `Column` object you should use the [`concat` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.concat) found in the [`pyspark.sql.functions` module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).  Note that `concat` takes in two or more string columns and returns a single string column.  In order to pass in a constant or literal value like 's', you'll need to wrap that value with the [`lit` column function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.lit).
# MAGIC 
# MAGIC Please replace `<FILL IN>` with your solution.  After you have created `pluralDF` you can run the next cell which contains two tests.  If your implementation is correct it will print `1 test passed` for each test.
# MAGIC 
# MAGIC This is the general form that exercises will take.  Exercises will include an explanation of what is expected, followed by code cells where one cell will have one or more `<FILL IN>` sections.  The cell that needs to be modified will have `# TODO: Replace <FILL IN> with appropriate code` on its first line.  Once the `<FILL IN>` sections are updated and the code is run, the test cell can then be run to verify the correctness of your solution.  The last code cell before the next markdown section will contain the tests.
# MAGIC 
# MAGIC > Note:
# MAGIC > Make sure that the resulting DataFrame has one column which is named 'word'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

pluralDF = wordsDF.select(concat(wordsDF.word, lit('s')).alias('word'))
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC Let's create a new DataFrame from `wordsDF` by performing an operation that adds an 's' to each word.  To do this, we'll call the [`select` DataFrame function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select) and pass in a column that has the recipe for adding an 's' to our existing column.  To generate this `Column` object you should use the [`concat` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.concat) found in the [`pyspark.sql.functions` module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).  Note that `concat` takes in two or more string columns and returns a single string column.  In order to pass in a constant or literal value like 's', you'll need to wrap that value with the [`lit` column function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.lit).
# MAGIC 
# MAGIC Please replace `<FILL IN>` with your solution.  After you have created `pluralDF` you can run the next cell which contains two tests.  If your implementation is correct it will print `1 test passed` for each test.
# MAGIC 
# MAGIC This is the general form that exercises will take.  Exercises will include an explanation of what is expected, followed by code cells where one cell will have one or more `<FILL IN>` sections.  The cell that needs to be modified will have `# TODO: Replace <FILL IN> with appropriate code` on its first line.  Once the `<FILL IN>` sections are updated and the code is run, the test cell can then be run to verify the correctness of your solution.  The last code cell before the next markdown section will contain the tests.
# MAGIC 
# MAGIC > Note:
# MAGIC > Make sure that the resulting DataFrame has one column which is named 'word'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

pluralDF = wordsDF.select(concat(wordsDF.word,lit("s")).alias("word"))
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
Example No. 36
def applyModel(fileName, loadModelName, outlierPercentile = 100):

    sc = SparkContext( 'local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########

    data = sc.textFile(fileName)
    #extract header and remove it
    header = data.first()
    data = data.filter(lambda x:x !=header).cache()
    header = header.split('\t')
    #parse data
    data = data.map(lambda x : x.split('\t'))

    #########
    # prepare features
    #########

    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME",func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
         .withColumn("TIMESTAMP",func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
         .withColumn("GEOIP_LAT",func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG",func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT",func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH",func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT",func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH",func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
         )
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)
    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA", func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE", (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", 
            func.concat(
                func.col('ACCOUNTID'), 
                func.col('CAMPAIGNID'), 
                func.col('CREATIVEID'), 
                func.col('SDK')) )

    #df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))

    df = df.withColumn("COMBINEDEXTERNALID", 
            func.concat( 
                func.regexp_replace('EXTERNALADSERVER', 'null', ''), 
                func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''), 
                func.regexp_replace('EXTERNALSITEID', 'null', ''), 
                func.regexp_replace('EXTERNALSUPPLIERID', 'null', '') ))

    #df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))

    df = df.withColumn("PLATFORMCOMBINED", 
            func.concat( 
                func.regexp_replace('PLATFORM', 'null', ''), 
                func.regexp_replace('PLATFORMVERSION', 'null', '') ))

    #df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))

    df = df.withColumn("UA_OSCOMB", 
            func.concat( 
                func.regexp_replace('UA_OS', 'null', ''), 
                func.regexp_replace('UA_OSVERSION', 'null', '') ))

    #df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))
    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON', '[^,\d]', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', '^,', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', ',,', ',') )

    udf = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########   

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########   


    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    for i in range(len(featuresCat)):

        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])    

    featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]    

    features = featuresCat[:]
    features.append('TIMESTAMP')    
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')


    #########
    # Assemble features
    #########   


    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########   


    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()


    #########
    # Load trained model
    #########
    
    model = RandomForestModel.load(sc, loadModelName)
    
    print('Model loaded!')
    
    predictions = model.predict(lp.map(lambda x: x.features)).collect()
    
    return predictions
Example No. 37
  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
Example No. 38
    minute=minute
  )
  return iso_time

extract_time_udf = udf(station_time_to_iso, StringType())
hourly_weather_with_iso_time = hourly_weather_with_iso_date.withColumn(
  "ISOTime",
  extract_time_udf(trimmed_hourly_records.Time)
)

from pyspark.sql.functions import concat, lit
hourly_weather_with_iso_datetime = hourly_weather_with_iso_time.withColumn(
  "Datetime",
  concat(
    hourly_weather_with_iso_time.ISODate,
    lit("T"),
    hourly_weather_with_iso_time.ISOTime
  )
)

#
# Trim the final records, lose the original Date/Time fields and save
#
from pyspark.sql.functions import col
final_hourly_columns = ["WBAN", col("ISODate").alias("Date"), "Datetime", "SkyCondition",
                        "Visibility", "WeatherType", "DryBulbCelsius",
                        "WetBulbCelsius", "DewPointCelsius",
                        "RelativeHumidity", "WindSpeed", "WindDirection",
                        "ValueForWindCharacter", "StationPressure",
                        "SeaLevelPressure", "HourlyPrecip", "Altimeter"]
final_trimmed_hourly_records = hourly_weather_with_iso_datetime.select(
Example No. 39
less_ten = udf(lambda s: s < 10, BooleanType()) # A UDF is a special wrapper around a function, allowing the function to be used in a DataFrame query
lambdaDF = subDF.filter(less_ten(subDF.age))
lambdaDF.show()
lambdaDF.count()

# Let's further filter so that we only select even ages
even = udf(lambda s: s % 2 == 0, BooleanType())
evenDF = lambdaDF.filter(even(lambdaDF.age))
evenDF.show()
evenDF.count()

# neat way of writing code
from pyspark.sql.functions import *
(dataDF
 .filter(dataDF.age > 20)
 .select(concat(dataDF.first_name, lit(' '), dataDF.last_name), dataDF.occupation)
 .show(truncate=False)
 )

# Want to see the first 4 rows of a dataframe? Use the take() function
display(filteredDF.take(4)) # take is an action
# if you just want to see the first row you can use the first() function
display(filteredDF.take(1)) # or filteredDF.first()

# distinct() filters out duplicate rows, and it considers all columns
print(dataDF.distinct().count())

# distinct values in a column
unique_columnData_count = logsDF.select('column').distinct().count()

# dropDuplicates() is like distinct(), except that it allows us to specify the columns to compare.
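# For example, deduplicating on name only (a sketch, assuming dataDF keeps the
# first_name/last_name columns used above):
print(dataDF.dropDuplicates(['first_name', 'last_name']).count())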
Example #40
 logger = sc._jvm.org.apache.log4j
 logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)
 
 sqlContext = SQLContext(sc)
 u_employee_trans = udf( employee_trans )
 
 # Load the data
 url_String='jdbc:oracle:thin:apps/[email protected]:1524/TESTDEV'
 if (db_location=='PROD'):
     url_String='jdbc:oracle:thin:apps/[email protected]:1524/PROD'
     
 tblname="(select * from (select v.legal_entity_id as org_id, org.name as org_name, v.dept_id as dept_id, dept.name as dept_name, v.emp_id as emp_id, emp.emp_name, emp.employee_number as emp_number, v.sub_hours, f.ferial_name, v.leave_date from narl_leave_detail_info_v v, narl_leave_main m, narl_ferial_header f, narl_login_emp_info_hist_v emp, HR_ALL_ORGANIZATION_UNITS org, HR_ALL_ORGANIZATION_UNITS dept where v.leave_id=m.leave_id and m.ferial_code=f.ferial_code and v.emp_id=emp.employee_id and v.legal_entity_id=org.ORGANIZATION_ID and v.dept_id=dept.ORGANIZATION_ID and v.status in ('APPROVE','INPROCESS','PROCESSING','FREE') and TO_CHAR(v.leave_date,'YYYY')='%s') ORDER BY org_id, dept_id, emp_id) tmp" %strYear
 
 df= sqlContext.read.format('jdbc').options(url=url_String, dbtable=tblname).load() 
 # Values read from Oracle come back with all-uppercase column names
 df = df.select(df.ORG_ID.cast('int').alias('org_id'),df.ORG_NAME.alias('org_name'), df.DEPT_ID.cast('int').alias('dept_id'), df.DEPT_NAME.alias('dept_name'), df.EMP_ID.cast('int').alias('emp_id'), df.EMP_NAME.alias('emp_name'),df.EMP_NUMBER.alias('emp_number'),date_format(df.LEAVE_DATE, 'E').alias('name_day'),concat(lit('Day_'),date_format(df.LEAVE_DATE,'dd')).alias('day_month') ,df.FERIAL_NAME.alias('ferial_name'), df.SUB_HOURS.cast('int').alias('sub_hours'))
 
 df = df.withColumn( 'employee_num', u_employee_trans('emp_number') ).drop('emp_number')
 df = df.withColumnRenamed("employee_num", "emp_number")
 df.cache()
 
 #Load org by WEEK data--start
 print('start Load org by WEEK data>>', datetime.datetime.now())
 df_groupBy_org_name_day = df.select('org_id','org_name','name_day','ferial_name', 'sub_hours').groupBy('org_id','org_name','name_day').pivot("ferial_name",['特別休假','加班或假日出差轉補休','生理假','傷病假','婚假', '家庭照顧假','事假', '產檢假','陪產假','產假','喪假','國內公假','國外公假','公傷病假','安胎假']).sum('sub_hours')
 
 df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
 
 df_groupBy_org_name_day = df_groupBy_org_name_day.select('org_id','org_name','name_day', '特別休假','加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假', '事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假').groupBy('org_id','org_name').pivot("name_day", ['Mon', 'Tue', 'Wed','Thu', 'Fri', 'Sat','Sun']).sum('特別休假', '加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假','事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假')
 
 df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
 df_groupBy_org_name_day=df_groupBy_org_name_day.orderBy(df_groupBy_org_name_day.org_id)   
Example #41
# Check for nulls in features before using Spark ML
#
null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
print(list(cols_with_nulls))

#
# Add a Route variable to replace FlightNum
#
from pyspark.sql.functions import lit, concat

features_with_route = features.withColumn(
  'Route',
  concat(
    features.Origin,
    lit('-'),
    features.Dest
  )
)
features_with_route.select("Origin", "Dest", "Route").show(5)

#
# Categorize or 'bucketize' the arrival delay field using a DataFrame UDF
#
def bucketize_arr_delay(arr_delay):
  bucket = None
  if arr_delay <= -15.0:
    bucket = 0.0
  elif arr_delay > -15.0 and arr_delay <= 0.0:
    bucket = 1.0
  elif arr_delay > 0.0 and arr_delay <= 30.0:
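    bucket = 2.0
  # The source snippet breaks off after the elif above; the branches below are a
  # minimal sketch that follows the same bucketing pattern and may not match the
  # original author's exact cut-offs.
  elif arr_delay > 30.0:
    bucket = 3.0
  return bucket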
Example #42
    def pm_dashboard(self):
        if self.sccmDF is None:
            self.sccmDF = self.session.read.json('/user/jleaniz/sft_vuln_raw.json').cache()

        self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")

        self.sccmDF = self.sccmDF.filter('crit_X_cat="High"')
        df_most_vuln = self.sccmDF.select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
            .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
        df_ncsa_most_vuln = self.sccmDF.filter('Region_X="NCSA" and Zone_X="Corp"').select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
            .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
        df_emea_most_vuln = self.sccmDF.filter('Region_X="EMEA" and Zone_X="Corp"').select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
            .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
        df_apac_most_vuln = self.sccmDF.filter('Region_X="APAC" and Zone_X="Corp"').select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
        .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
        df_most_vuln_onbe = self.sccmDF.filter('Zone_X="ONBE"').select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
        .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
        df_most_vuln_corp = self.sccmDF.filter('Zone_X="Corp"').select('DisplayName0','Version0').groupBy('DisplayName0','Version0').count().orderBy(desc('count')).limit(10)\
        .select(concat(col("DisplayName0"),lit(" "),col("Version0")),"count").withColumnRenamed("concat(DisplayName0,  , Version0)", "software").collect()
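
        # The six near-identical queries above could share one helper; a sketch
        # (not part of the original class), using alias() on the concat column
        # instead of renaming Spark's auto-generated column name:
        def top_vulnerable_software(df, limit=10):
            return (df.select('DisplayName0', 'Version0')
                      .groupBy('DisplayName0', 'Version0').count()
                      .orderBy(desc('count')).limit(limit)
                      .select(concat(col("DisplayName0"), lit(" "), col("Version0")).alias("software"), "count")
                      .collect())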
        df_most_vuln_func = self.sccmDF.select('HostFn_X').groupBy('HostFn_X').count().orderBy(desc('count')).limit(10).collect()
        df_per_site_vuln = self.session.read.json('/user/jleaniz/pl_dashboard.json').collect()

        dataChart = []
        descriptionChart = [
            ('Region', "string"),
            ('Adobe-Flash', "number"),
            ('Adobe-Reader', 'number'),
            ('Chrome', 'number'),
            ('Firefox', 'number'),
            ('Java', 'number')
        ]

        mydict = df_per_site_vuln[0].asDict()

        dataChart.append(
        ["APAC", mydict['adobe-flash'].APAC, mydict['adobe-rdr'].APAC, mydict['chrome'].APAC,mydict['firefox'].APAC,mydict['java'].APAC]
        )
        dataChart.append(
        ["EMEA", mydict['adobe-flash'].EMEA, mydict['adobe-rdr'].EMEA, mydict['chrome'].EMEA,mydict['firefox'].EMEA,mydict['java'].EMEA]
        )
        dataChart.append(
        ["NCSA", mydict['adobe-flash'].NCSA, mydict['adobe-rdr'].NCSA, mydict['chrome'].NCSA,mydict['firefox'].NCSA,mydict['java'].NCSA]
        )

        data_tableChart = gviz_api.DataTable(descriptionChart)
        data_tableChart.LoadData(dataChart)
        json_per_site_vuln = data_tableChart.ToJSon()

        dataChart = []
        descriptionChart = {
            "software": ("string", "Software"),
            "hits": ("number", "Hosts")
        }
        for row in df_most_vuln:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart = gviz_api.DataTable(descriptionChart)
        data_tableChart.LoadData(dataChart)
        json_most_vuln = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        for row in df_ncsa_most_vuln:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart.LoadData(dataChart)
        json_most_vuln_ncsa = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        for row in df_emea_most_vuln:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart.LoadData(dataChart)
        json_most_vuln_emea = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        for row in df_apac_most_vuln:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart.LoadData(dataChart)
        json_most_vuln_apac = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        for row in df_most_vuln_onbe:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart.LoadData(dataChart)
        json_most_vuln_onbe = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        for row in df_most_vuln_corp:
            dataChart.append(
                {
                    "software": row.software,
                    "hits": int(row[1])
                }
            )
        data_tableChart.LoadData(dataChart)
        json_most_vuln_corp = data_tableChart.ToJSon(
            columns_order=("software","hits"),
            order_by="hits"
        )

        dataChart = []
        descriptionChart = {
            "function": ("string", "Host function"),
            "hits": ("number", "Hits")
        }
        for row in df_most_vuln_func:
            dataChart.append(
                {
                    "function": row.HostFn_X,
                    "hits": int(row[1])
                }
            )
        data_tableChart = gviz_api.DataTable(descriptionChart)
        data_tableChart.LoadData(dataChart)
        json_most_vuln_func = data_tableChart.ToJSon(
            columns_order=("function","hits"),
            order_by="hits"
        )
        return json_per_site_vuln, json_most_vuln,json_most_vuln_ncsa,json_most_vuln_emea,json_most_vuln_apac,json_most_vuln_onbe,json_most_vuln_corp,json_most_vuln_func
Example #43
# COMMAND ----------

bad_content_size_df = base_df.filter(~ base_df['value'].rlike(r'\d+$'))
bad_content_size_df.count()

# COMMAND ----------

# MAGIC %md
# MAGIC That's it! The count matches the number of rows in `bad_rows_df` exactly.
# MAGIC 
# MAGIC Let's take a look at some of the bad column values. Since it's possible that the rows end in extra white space, we'll tack a marker character onto the end of each line, to make it easier to see trailing white space.

# COMMAND ----------

from pyspark.sql.functions import lit, concat
bad_content_size_df.select(concat(bad_content_size_df['value'], lit('*'))).show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC Ah. The bad rows correspond to error results, where no content was sent back and the server emitted a "`-`" for the `content_size` field. Since we don't want to discard those rows from our analysis, let's map them to 0.

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2d) Fix the rows with null content\_size
# MAGIC 
# MAGIC The easiest solution is to replace the null values in `split_df` with 0. The DataFrame API provides a set of functions and fields specifically designed for working with null values, among them:
# MAGIC 
# MAGIC * [fillna()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna), which fills null values with specified non-null values.
# MAGIC * [na](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na), which returns a [DataFrameNaFunctions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions) object with many functions for operating on null columns.
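
# COMMAND ----------

# A minimal sketch of the fix described above (assuming split_df and its integer
# 'content_size' column from earlier in the notebook):
cleaned_df = split_df.na.fill({'content_size': 0})
cleaned_df.filter(cleaned_df['content_size'].isNull()).count()  # expect 0 after the fill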
Example #44
Aggregate time into one-minute intervals and join all the features


# assign time scale in order to aggregate data into it 
#time_interval = 60
#start_timestep = 1435708800 - 7200 # 2015-07-01 00:00:00 2 hours difference 
#data = (data
#        .withColumn('timestep', F.ceil((F.unix_timestamp('dt')-sc._jsc.startTime())/time_interval))
#        .drop('radar_id')
#        )
# assign each location to its grid cell index
track_grid_x = track_grid.withColumn('x_categories', F.ceil((F.col('position_x') - min_lon)/interval_lon))
data = track_grid_x.withColumn('y_categories', F.ceil((F.col('position_y') - min_lat)/interval_lat))
data = data.fillna(0).drop('radar_id')
data = data.withColumn("location_index",F.concat(data.y_categories,data.x_categories)).drop('x_categories').drop('y_categories')
# join the attribute features
data_count=data.groupBy('location_index', 'dt').count()
attribute=data.groupBy('location_index', 'dt').mean('position_x','position_y',
                                                     'velocity','airspeed',
                                                     'heading','heading_vertical',
                                                     'peak_mass','mass','mass_correction')
cond = [data_count.location_index == attribute.location_index, 
        data_count.dt == attribute.dt]
data_grid = (attribute.join(data_count, cond, 'inner')
                 .drop(attribute.dt)
                 .drop(attribute.location_index)
             )
data_grid = data_grid.orderBy('dt','location_index')
oldColumns = data_grid.schema.names
newColumns = ["position_x","position_y","velocity","airspeed","heading","heading_vertical",