Example #1
def time_series_dataframe(dataframe, time_field):
    # Parameters:
    #   dataframe: must be a Spark dataframe
    #   time_field: string value; it must match a single column name
    # About null values: this function does not consider null values in its calculations
    # Importing libraries and modules
    import pyspark.sql.functions as F
    # Drop null values, create a 'timestamp' column in timestamp format, drop the original
    # time_field, convert the Spark dataframe to pandas, set 'timestamp' as the index and sort it
    df = (dataframe.dropna(how='any', subset=time_field)
          .withColumn('timestamp', F.to_utc_timestamp(time_field, tz='UTC'))
          .drop(time_field)
          .toPandas()
          .set_index('timestamp')
          .sort_index())
    return df  # This function returns a pandas dataframe
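
# Hypothetical usage sketch (not part of the original example): assumes an active
# SparkSession named `spark`; the DataFrame and column names are illustrative only.
events_df = spark.createDataFrame(
    [("2021-01-02 10:00:00", 1.0), ("2021-01-01 09:30:00", 2.0)],
    ["event_time", "value"])
pdf = time_series_dataframe(events_df, "event_time")
print(pdf.head())  # a pandas DataFrame indexed and sorted by 'timestamp'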
Example #2
import pyspark.sql.functions as F


def to_timestamp(obj, column, tzone='UTC'):
    # Build a Column expression: cast `column` to timestamp if needed, then
    # convert it from `tzone` to UTC when a non-UTC source time zone is given.
    f = F.col(column)
    datecol_type = obj.select(column).dtypes[0][1]

    if datecol_type not in ['timestamp', 'date']:
        f = F.to_timestamp(f)

    if tzone != 'UTC':
        f = F.to_utc_timestamp(f, tzone)

    return f
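
# Hypothetical usage sketch (names are illustrative; assumes a SparkSession `spark`):
# the helper returns a Column expression, so it is applied with withColumn()/select().
events = spark.createDataFrame([("2021-03-01 12:00:00",)], ["created_at"])
events = events.withColumn("created_at_utc",
                           to_timestamp(events, "created_at", tzone="Europe/Berlin"))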
Example #3
from pyspark.sql import DataFrame
import pyspark.sql.functions as f


def clean_IOT_data(iot_df: DataFrame) -> DataFrame:
  '''Clean the IoT data.'''
  # convert the unix epoch (milliseconds) to a timestamp
  iot_df = iot_df.withColumn('time',
                             f.to_utc_timestamp(f.from_unixtime(f.col('timestamp')/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')
                            )
  # drop columns that are no longer needed
  iot_df = iot_df.drop(*['cca2', 'cn', 'timestamp', 'scale'])
  # derive calendar fields from the new timestamp
  iot_df = (iot_df.withColumn('Date', f.to_date('time'))
            .withColumn('Month', f.month('Date'))
            .withColumn('Year', f.year('Date'))
           )
  return iot_df
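
# Hypothetical usage sketch (assumes an active SparkSession named `spark`; the column
# set mirrors the fields referenced above, and the values are made up):
sample_iot_df = spark.createDataFrame(
    [(1607800000000, 'NL', 'Netherlands', 'Celsius', 21.5)],
    ['timestamp', 'cca2', 'cn', 'scale', 'temperature'])
clean_IOT_data(sample_iot_df).show(truncate=False)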
Example #4
    def local_to_utc(self, localTimeZone, subset=None):
        # Convert every TimestampType column (optionally restricted to `subset`)
        # from the given local time zone to UTC.
        if subset is not None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.name in subset)
        else:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType")

        for x in tsCol:
            self.inputDf = self.inputDf.withColumn(
                x.name, to_utc_timestamp(col(x.name), localTimeZone))
        return self.inputDf
Example #5
    def changeTimezone(self, fromTimezone, toTimezone, subset=None):
        # Shift every TimestampType column (optionally restricted to `subset`)
        # from `fromTimezone` to `toTimezone` by going through UTC.
        if subset is not None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.name in subset)
        else:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType")

        for x in tsCol:
            self.inputDf = self.inputDf.withColumn(
                x.name, to_utc_timestamp(col(x.name), fromTimezone))
            self.inputDf = self.inputDf.withColumn(
                x.name, from_utc_timestamp(col(x.name), toTimezone))
        return self.inputDf
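
# A standalone sketch of the same pattern, assuming a plain DataFrame instead of the
# wrapper class above (the function and argument names below are illustrative only):
from pyspark.sql.functions import col, from_utc_timestamp, to_utc_timestamp
from pyspark.sql.types import TimestampType


def change_timezone(df, from_tz, to_tz, subset=None):
    # Shift every TimestampType column (optionally restricted to `subset`)
    # from `from_tz` to `to_tz` by going through UTC.
    for field in df.schema:
        if isinstance(field.dataType, TimestampType) and (subset is None or field.name in subset):
            df = df.withColumn(field.name, to_utc_timestamp(col(field.name), from_tz))
            df = df.withColumn(field.name, from_utc_timestamp(col(field.name), to_tz))
    return df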
Example #6
    def clean_data(self, conversions, attribution):

        # Handle missing values
        attribution = attribution.dropna(how='any')
        conversions = conversions.dropna(how='any')
        Data = attribution.join(conversions, on=['Conv_ID'], how='inner')

        # Change conv_date to datetime format
        timeFmt = "yyyy-MM-dd"
        Data = Data.withColumn(
            'Conv_Date',
            to_utc_timestamp(
                unix_timestamp(col('Conv_Date'), timeFmt).cast('timestamp'),
                'UTC'))

        return Data
Example #7
    def test_generate_select_expression_for_TimestampMonth_spark2(
            self, input_value, value, spark_session, spark_context):
        source_key, name = "day_of_birth", "birthday"
        input_df = get_input_df(spark_session, spark_context, source_key,
                                input_value)
        input_df = input_df.withColumn(
            source_key,
            F.to_utc_timestamp(input_df["attributes"]["data"][source_key],
                               "yyyy-MM-dd"))
        result_column = custom_types._generate_select_expression_for_TimestampMonth(
            source_column=input_df[source_key], name=name)
        output_df = input_df.select(result_column)

        assert output_df.schema.fieldNames() == [name], "Renaming of column"
        assert output_df.schema[name].dataType.typeName(
        ) == "timestamp", "Casting of column"
        output_value = output_df.first()[name]
        if output_value:
            output_value = datetime.date.strftime(output_value,
                                                  format("%Y-%m-%d"))
        else:
            output_value = None
        assert output_value == value, "Processing of column value"
Example #8
                                      df_nycflights.minute)) 

df_nycflights.show()

df_nycflights = df_nycflights. \
                withColumn('timestamp',
                            concat_ws(' ',
                                      df_nycflights.date,
                                      df_nycflights.time)) 

df_nycflights.show()

'''Next we will convert the StringType column to TimestampType'''
df_nycflights = df_nycflights. \
                withColumn('timestamp',
                            to_utc_timestamp(df_nycflights.timestamp,'GMT'))

df_nycflights.show()

'''Next we make the column timestamp as first column'''
'''Note columns will return all column name'''
df_nycflights = df_nycflights. \
                select(df_nycflights.columns[-1:] + df_nycflights.columns[0:-1])

df_nycflights.show()


'''Now we drop the year, month, day, hour, minute, date and time columns, as we will re-create them from the timestamp column we just built (a sketch follows below)'''
df_nycflights = df_nycflights. \
                drop('year'). \
                drop('month'). \
                drop('day'). \
                drop('hour'). \
                drop('minute'). \
                drop('date'). \
                drop('time')
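
'''A sketch (not in the original snippet) of re-creating the dropped fields from the timestamp column, using standard pyspark.sql.functions'''
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

df_nycflights = df_nycflights. \
                withColumn('year', year('timestamp')). \
                withColumn('month', month('timestamp')). \
                withColumn('day', dayofmonth('timestamp')). \
                withColumn('hour', hour('timestamp')). \
                withColumn('minute', minute('timestamp'))

df_nycflights.show()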
Example #9
from random import randint
from pyspark.sql import functions as F

SCHEMA_VERSION = "1"
INPUT_FILE_NAME = "schema_v{nr}.json".format(nr=SCHEMA_VERSION)
DATE_COLUMNS_TO_CONVERT = [("attributes.birthday", "birthday")]

rdd_text = spark.sparkContext.textFile(INPUT_FILE_NAME)
rdd_text.coalesce(2).saveAsTextFile(
    "schema_v{nr}/textFiles".format(nr=SCHEMA_VERSION))

rdd_seq = rdd_text.map(lambda x:
                       (randint(1000, 100000), bytearray(x, "utf-8")))
rdd_seq.coalesce(2).saveAsSequenceFile(
    "schema_v{nr}/sequenceFiles".format(nr=SCHEMA_VERSION))

df = spark.read.json(rdd_text)
for input_col, output_col in DATE_COLUMNS_TO_CONVERT:
    df = df.withColumn(
        output_col,
        F.to_utc_timestamp(timestamp=F.to_timestamp(df[input_col]),
                           tz="Europe/Vienna"))
df.coalesce(2).write.parquet(
    "schema_v{nr}/parquetFiles".format(nr=SCHEMA_VERSION))
Example #10
def main():
    """
    Get observations near locations from the SmartMet Server.

    The data start time, end time and timestep are taken from the data
    itself. The dataset is assumed to be coherent in terms of time and
    locations, i.e. the timestep is assumed to be constant between the
    start and end times.
    """
    log1=logging.getLogger("driver")


    output_directory = 'gs://{}/hadoop/tmp/bigquery/pyspark_output'.format(bucket)
    output_files = output_directory + '/part-*'

    # The trains stations data in stations.json
    # The type of trains and their delay in gratu_a_b_2010-14.csv

    JSON_PATH="gs://trains-data/data/stations.json"
    CSV_PATH="gs://trains-data/data/full/gratu_a_b_2010-14.csv"

    train_stations_df = spark.read \
        .json(JSON_PATH)

    # parameters for weather data to be fetched from Smartmet server
    params, names = read_parameters('parameters_shorten.txt')

    # base URL for the surface data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?format=ascii&separator=,&producer=fmi&tz=local&timeformat=xml&timestep=60&numberofstations=5&maxdistance=100000&param={params}'.format(params=','.join(params))


    urlist= train_stations_df.rdd.flatMap(lambda x : ['%s#%s&latlons=%s,%s' % (x.stationShortCode,baseurl,x.latitude,x.longitude)]).repartition(16)


    data = urlist.map(read_from_URL)\
                 .filter(lambda x: x != -1)\
                 .flatMap(lambda x:x.splitlines())\
                 .map(lambda x: x.split(','))

    newColumns=names+["trainstation"]
    schemaString = ' '.join(str(x) for x in newColumns)

    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    station_weather_df = spark.createDataFrame(data, schema)
    station_weather_df = station_weather_df.withColumn("time", to_utc_timestamp(station_weather_df.time, "%Y-%m-%dT%H"))

    # calculate max_precipitation 3h and max_precipitation6h
    col="max_precipitation1h"

    # change the "no precipitation" value -1.0 to 0.0
    station_weather_df = station_weather_df.withColumn(col, f.when(station_weather_df[col] == -1.0, 0.0).otherwise(station_weather_df[col]))


    # using window functions to calculate the precipitation for the
    # previous 3 hours and 6 hours
    w3 = w.partitionBy("trainstation")\
          .orderBy(station_weather_df["time"])\
          .rowsBetween(-2,0)
        
    station_weather_df =station_weather_df.withColumn("max_precipitation3h",f.sum("max_precipitation1h").over(w3))


    w6 = w.partitionBy("trainstation")\
          .orderBy(station_weather_df["time"])\
          .rowsBetween(-5,0)
    
    station_weather_df =station_weather_df.withColumn("max_precipitation6h",f.sum("max_precipitation1h").over(w6))

    # making the surface observation dataframe

    cols = station_weather_df.columns  # list of all columns
    for col in cols:
        station_weather_df = station_weather_df.fillna({col:"-99"})
        station_weather_df = station_weather_df.withColumn(col, f.when(station_weather_df[col].isin("null", "nan", "NaN", "NULL"),"-99").otherwise(station_weather_df[col]))

    log1.info("Retrieved surface data")

   
   
    ## Get flash data

    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?param=time,peak_current&producer=flash&tz=local&timeformat=xml&format=ascii&separator=,'
    urlist= train_stations_df.rdd.flatMap(lambda x : ['%s#%s&latlon=%s,%s:30' % (x.stationShortCode,baseurl,x.latitude,x.longitude)])
   
    data = urlist.map(getFlash)\
            .filter(lambda x: x != -1)\
            .flatMap(lambda x:x.splitlines())\
            .map(lambda x: x.split(','))


    schemaString = 'time peakcurrent trainstation'

    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    flash_df =  spark.createDataFrame(data, schema)
    flash_df = flash_df.withColumn("time", to_utc_timestamp(flash_df.time, "%Y%m%dT%HMS"))

    # find the count of flashes in each hour
    extended = (flash_df
                .withColumn("date", f.col("time").cast("date"))
                .withColumn("hour", f.hour(f.col("time"))))

    flash_aggs = extended.groupBy("trainstation", "date", "hour").count()

    flash_aggs = flash_aggs.withColumn('time', f.concat(f.col("date"), f.lit("T"), f.col("hour")))

    flash =flash_aggs.withColumn('time',to_utc_timestamp(flash_aggs.time,"%Y-%m-%dT%H")).select("time", f.col("count").alias("flashcount"),"trainstation")

    log1.info("Retrieved flash data")
    
    # Combining surface and flash data

    cond = [flash.time == station_weather_df.time, flash.trainstation == station_weather_df.trainstation ]
    
    station_weather_flash_df = station_weather_df.alias('a').join(flash.alias('b'),cond, 'outer').select('a.*', 'b.flashcount').fillna({'flashcount':'0'}) 

    # Reading the train type and delay data
    df = spark.read \
            .csv(CSV_PATH)

    # combining the date and time columns and selecting the relevant columns
    df = df.withColumn('t', f.concat(f.col("_c0"), f.lit("T"), f.col("_c1"))).select("t","_c3", "_c4", "_c9", "_c7", "_c5")


    # converting the time to utc timestamp and adding 1 hour
    df = df.withColumn('t',to_utc_timestamp(df.t,"%Y-%m-%dT%H") + f.expr('INTERVAL 1 HOUR'))

    trains_df = df.select(f.col("t").alias("time"),f.col("_c3").alias("trainstation"), f.col("_c4").alias("train_type"), f.col("_c9").alias("train_count"), f.col("_c7").alias("total_delay"), f.col("_c5").alias("delay"))

    # Combining the weather data, both surface and flash, with
    # the train delay and type data
    cond = [trains_df.time == station_weather_flash_df.time, trains_df.trainstation == station_weather_flash_df.trainstation ]

    trains_station_weather_flash_delay_df = trains_df.join(station_weather_flash_df, cond).drop(station_weather_flash_df.time).drop(station_weather_flash_df.trainstation)

    log1.info("Created the dataframe with train delay and weather observations Finished!\n")

    # Saving the data to BigQuery

    (trains_station_weather_flash_delay_df
     .write.format('json').save(output_directory))

    # Shell out to bq CLI to perform BigQuery import.
    subprocess.check_call(
        'bq load --source_format NEWLINE_DELIMITED_JSON '
        '--replace '
        '--autodetect '
        '{dataset}.{table} {files}'.format(
            dataset=output_dataset, table=output_table, files=output_files
        ).split())

    # Manually clean up the staging_directories, otherwise BigQuery
    # files will remain indefinitely.
    output_path = spark._jvm.org.apache.hadoop.fs.Path(output_directory)
    output_path.getFileSystem(spark._jsc.hadoopConfiguration()).delete(
        output_path, True)

    
    elapsed_time = time.time() - start_time
    log1.info("Elapsed time to retrieve train delay and observation data and save to bq {:10.3f}".format(elapsed_time))
Example #11
# create spark session
my_spark = SparkSession\
    .builder\
    .appName("SpaceBar")\
    .config("spark.mongodb.input.uri", input_uri)\
    .config("spark.mongodb.output.uri", output_uri)\
    .config('spark.jars.packages','org.mongodb.spark:mongo-spark-connector_2.12:2.4.2')\
    .getOrCreate()

# read data from mongodb
df = my_spark.read.format('com.mongodb.spark.sql.DefaultSource').load()
df = df.withColumn('id' , monotonically_increasing_id())
df = df.select('id', 'lang', 'timestamp_ms', 'source')

# convert the epoch-millisecond timestamp to a UTC timestamp
df2 = df.withColumn('time', F.to_utc_timestamp(F.from_unixtime(F.col("timestamp_ms")/1000,'yyyy-MM-dd HH:mm:ss'),'UTC'))

# select tool from source
df3 = df2.withColumn('tool', when(col('source').contains('tweetdeck'), 'Tweetdeck')
                             .when(col('source').contains('mobile'), 'mobile')
                             .when(col('source').contains('android'), 'android')
                             .when(col('source').contains('iPhone'), 'iPhone')
                             .when(col('source').contains('Android'), 'android')
                             .when(col('source').contains('tweetbot'), 'Tweetbot')
                             .when(col('source').contains('iPad'), 'iPad')
                             .otherwise('Unknown'))
df4 = df3.select('time', 'timestamp_ms', 'lang',  'tool')
df4.show()

# export to csv
df4.toPandas().to_csv('data.csv')
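
# An equivalent way to build the same when() chain from a list of (substring, label)
# pairs; shown only as a hedged alternative to the explicit chain above:
from functools import reduce

tool_map = [('tweetdeck', 'Tweetdeck'), ('mobile', 'mobile'), ('android', 'android'),
            ('iPhone', 'iPhone'), ('Android', 'android'), ('tweetbot', 'Tweetbot'),
            ('iPad', 'iPad')]
tool_col = reduce(lambda acc, kv: acc.when(col('source').contains(kv[0]), kv[1]),
                  tool_map[1:],
                  when(col('source').contains(tool_map[0][0]), tool_map[0][1])).otherwise('Unknown')
# equivalent to the chain above:
# df3 = df2.withColumn('tool', tool_col)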
Example #12
    savings_accountsDF = spark.read.json('data/savings_accounts/*.json')
    savings_accountsDF.createOrReplaceTempView("savings_accountsView")
    spark.sql("select * from savings_accountsView").show()

    # create denormalized joined table view
    denormalized = spark.sql("""
        SELECT A.ts AS A_ts, A.id AS A_id, A.op AS A_op, A.data AS A_data, A.set AS A_set,
               B.ts AS B_ts, B.id AS B_id, B.op AS B_op, B.data AS B_data, B.set AS B_set,
               C.ts AS C_ts, C.id AS C_id, C.op AS C_op, C.data AS C_data, C.set AS C_set
        FROM accountsView A
        FULL JOIN cardsView B ON B.ts = A.ts
        FULL JOIN savings_accountsView C ON C.ts = A.ts
    """).cache()
    denormalized.createOrReplaceTempView("denormalizedView")
    spark.sql("select * from denormalizedView").show(truncate=False)

    # convert millis to datetime
    denormalized = denormalized.withColumn(
        "A_datetime",
        functions.to_utc_timestamp(
            functions.from_unixtime(
                functions.col("A_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))
    denormalized = denormalized.withColumn(
        "B_datetime",
        functions.to_utc_timestamp(
            functions.from_unixtime(
                functions.col("B_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))
    denormalized = denormalized.withColumn(
        "C_datetime",
        functions.to_utc_timestamp(
            functions.from_unixtime(
                functions.col("C_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))
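
    # The three conversions above share one pattern; an equivalent loop over the
    # column prefixes is sketched here only as a compact alternative:
    # for prefix in ("A", "B", "C"):
    #     denormalized = denormalized.withColumn(
    #         prefix + "_datetime",
    #         functions.to_utc_timestamp(
    #             functions.from_unixtime(functions.col(prefix + "_ts") / 1000,
    #                                     'yyyy-MM-dd HH:mm:ss'), 'UTC'))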

    # get transactions
    denormalized.select(
        "B_datetime", "B_set.credit_used", "C_datetime",
Example #13
from pyspark.sql import functions as F


def to_gmt(df):

    return df.withColumn("timestamp",
                         F.to_utc_timestamp("timestamp", df.time_zone))
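
# Hypothetical usage sketch (assumes an active SparkSession named `spark` and
# Spark >= 2.4, where the time-zone argument of to_utc_timestamp may be a column;
# the values below are made up):
sample = spark.createDataFrame(
    [("2021-06-01 12:00:00", "Europe/Amsterdam")], ["timestamp", "time_zone"])
to_gmt(sample).show(truncate=False)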
Example #14
def my_count(ds):
    ds.agg(*[count(c).alias(c) for c in ds.columns]).show()


my_count(df)

df = df.dropna(how='any')
my_count(df)

# deal with dates
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, col

timeFmt = "MM/dd/yy HH:mm"
df = df.withColumn('NewInvoiceDate',
                   to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'UTC'))

# calculate total price
from pyspark.sql.functions import round

df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2))

# calculate the time difference
from pyspark.sql.functions import min, max, sum, datediff

date_max = df.select(max('NewInvoiceDate')).toPandas()
current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0][0])), \
                                          'yy-MM-dd HH:mm').cast('timestamp'), 'UTC')

# Calculate duration
df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))
Example #15
df.printSchema()

# Taking a look at the data

# In[17]:

df.show(5, truncate=False)

# Converting the [dropoff_datetime, pickup_datetime] columns from String to Timestamp.

# In[18]:

# Converted DataFrame (dfc)
dfc = df.withColumn(
    'dropoff_datetime',
    F.to_utc_timestamp('dropoff_datetime',
                       "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn(
                           'pickup_datetime',
                           F.to_utc_timestamp('pickup_datetime',
                                              "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
dfc.printSchema()

# Taking a look at the data, in particular the dateTime fields

# In[19]:

dfc.show(5, False)

# Creating a view called trip.

# In[20]:
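
# The original cell is cut off here; a minimal sketch of the step described by the
# comment above (registering the converted DataFrame as a view named 'trip'):
dfc.createOrReplaceTempView('trip')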
Example #16
            return word
        else:
            return "nonTag"

    ## Converting date string format
    def getDate(x):
        if x is not None:
            return str(
                datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y').replace(
                    tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))

    date_fn = udf(getDate, StringType())

    ## Converting datatype in spark dataframe
    df = df.withColumn("created_at",
                       to_utc_timestamp(date_fn("created_at"), "UTC"))

    words = df.select(
        explode(split(df.text, " ")).alias("word"),
        df.created_at.alias("timestamp"))
    extract_tags_udf = udf(extract_tags, StringType())
    resultDF = words.withColumn("tags", extract_tags_udf(words.word))\
                    .withColumn("timestamp", words.timestamp)

    windowedCounts = resultDF.where(resultDF.tags != "nonTag").groupBy(
        window(words.timestamp, window_length, sliding_interval),
        words.word).count().orderBy('window')

    query = windowedCounts.writeStream\
                      .outputMode("complete")\
                      .format("console")\
Example #17
        data = data.withColumn('label', col(labelCol))
        return data.select('features', 'label')
    elif indexCol and not labelCol:
        # for unsupervised learning
        return data.select(indexCol, 'features')
    elif not indexCol and not labelCol:
        # for unsupervised learning
        return data.select('features')


# deal with the InvoiceDate column
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col

timeFmt = "MM/dd/yy HH:mm"
ds = ds.withColumn('NewInvoiceDate',
                   to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'UTC'))

# analyze missing values counts
import pyspark.sql.functions as F

df.select([F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns])

# remove blank string
df = df.filter("colName != ''")

# calculate outlier bounds, then drop outliers
cols = numeric_features
bounds = {}
for col in cols:
    quantiles = df.approxQuantile(col,[0.25,0.75], 0.05)
    IQR = quantiles[1] - quantiles[0]
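    # The snippet is cut off here; a common continuation is the 1.5*IQR rule
    # (an assumption on our part, not part of the original example):
    bounds[col] = [quantiles[0] - 1.5 * IQR, quantiles[1] + 1.5 * IQR]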
Example #18
df_history = df_history.join(organizations_dropcol, df_history.encounters_provider == organizations_dropcol.organizations_id,how='left')
df_history = df_history.join(providers_dropcol, df_history.organizations_id == providers_dropcol.providers_organization,how='left')
## Calculate percent of Null Values
print ('Null values percentage in each column:')
print (null_perc(df_history))

## Dropping duplicate value columns
drop_column_list = ['procedures_patient','medications_patient','immunizations_encounter',
                       'conditions_encounter','procedures_encounter','medications_encounter','imaging_studies_encounter',
                       'providers_organization','medications_payer_coverage',
                       'conditions_patient','conditions_code','procedures_reasondescription','encounters_provider','organizations_revenue','medications_base_cost','medications_totalcost','procedures_reasoncode']
df_history = df_history.select([column for column in df_history.columns if column not in drop_column_list]) 

df_history = df_history.filter(df_history.encounters_encounterclass.isin('inpatient','ambulatory','urgentcare','outpatient','emergency'))
## Convert data type from string to timestamp
df_history = df_history.withColumn("encounters_stop", to_utc_timestamp("encounters_stop", "GMT"))
df_history = df_history.withColumn("encounters_start", to_utc_timestamp("encounters_start", "GMT"))
print('encounters time duration values')
df_history.show(truncate=100)


##Subtract encounters_start from encounters_stop to calculate the patient's admission duration (in seconds)
df_history = df_history.withColumn("admission_duration", unix_timestamp("encounters_stop")-unix_timestamp("encounters_start"))
#df_history = df_history.withColumn("admission_duration", col("encounters_stop")-col("encounters_start"))
df_history.printSchema()
print('admission duration values')
df_history.show(truncate=100)


##Replace empty strings with required values
df_history = df_history.replace('','NM','patient_marital')
Example #19
iot_df.filter(f.col('cca3').isNull() | f.col('device_id').isNull()).show() # no nulls for country or id

# COMMAND ----------

iot_df[['device_id']].distinct().count() # devices are the same as the number of rows, so we only have one observation per device

# COMMAND ----------

# MAGIC %md
# MAGIC Note the many different ways in which we can create the timestamp column. Although they look different, their essence is the same, namely, they specify an expression. **A column in Spark is just the specification of an expression for a computation to be performed for each row when an action is triggered**.

# COMMAND ----------

# for instance, the following expression specifies a column
f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')

# COMMAND ----------

iot_df = iot_df.withColumn('time', f.to_utc_timestamp(f.from_unixtime(f.col('timestamp')/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam'))

# these also work, and are wholly equivalent:
# iot_df.select(f.expr('*'), f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam').alias('time'))
# iot_df.select(f.expr('*'), f.expr("to_utc_timestamp(from_unixtime(timestamp/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam') AS time"))
# iot_df.selectExpr('*', "to_utc_timestamp(from_unixtime(timestamp/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam') AS time")

# HOWEVER, this does not work, as dataframes are immutable. While the expressions above specify a (lazy) transformation,
# the command below tries to make an assignment:
# iot_df['timestamp'] = f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')
# the right-hand side of the assignment is fine; it's the assignment itself that's the problem
Example #20
ax.set_xticklabels(df.columns, rotation='vertical',fontsize=7)
plt.title('Missing Value Occurrence')
plt.show()

## Delete rows with null value
from pyspark.sql.functions import count
def data_count(inp):
    inp.agg(*[count(i).alias(i) for i in inp.columns]).show()  
data_count(df) # raw data
df_new = df.dropna(how='any') 
data_count(df_new) # after we drop rows with null values

## Change datetime format
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col 
time_format = "MM/dd/yy HH:mm"
df_new = df_new.withColumn('NewInvoiceDate', to_utc_timestamp(unix_timestamp(col('InvoiceDate'),time_format).cast('timestamp'),'UTC'))  
df_new.show(5)

## Calculate total price and create the column
from pyspark.sql.functions import round
df_new = df_new.withColumn('TotalPrice', round( df.Quantity * df.UnitPrice, 2 ))

## Calculate time diff
from pyspark.sql.functions import mean, min, max, sum
date_max = df_new.select(max('NewInvoiceDate')).toPandas()

## Calculate duration
df_new = df_new.withColumn('Duration', datediff(lit(date_max.iloc[0][0]), 'NewInvoiceDate')) 
df_new.show(5)

## Calculate rfm
Example #21
 def _add_times(self, iot_df: DataFrame) -> DataFrame:
     # convert the unix epoch (milliseconds) to a timestamp
     iot_df = iot_df.withColumn('time',
                          f.to_utc_timestamp(f.from_unixtime(f.col('timestamp')/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')
                         )
     return iot_df
Example #22
                         trim(col("wind_direction")).cast("int").alias("wind_direction"))\
    .withColumn("ID", regexp_replace("ID", "^0+", ""))\
    .repartition(30, "ob_time")

if debug:
    wx_df.printSchema()



# Observation time in the rainfall data is recorded for rainfall over the previous hour.
# To offset that one-hour difference we use GMT+1 as the source time zone, so the times in the dataframe end up one hour later.
rain_df = rainhrly_df.select(trim(col("src_id")).alias("SRC_ID"),
                             trim(col("id_type")).alias("ID_T"),
                             trim(col("met_domain_name")).alias("MET_DOMA"),
                             trim(col("id")).alias("ID"),
                             to_utc_timestamp(
                                 trim(col("ob_end_time")), "GMT+1").alias("ob_time"),
                             trim(col("prcp_amt")).cast(
                                 "float").alias("prcp_amt"),
                             trim(col("prcp_dur")).cast("float").alias("prcp_dur"))\
    .withColumn("ID", regexp_replace("ID", "^0+", ""))\
    .filter("MET_DOMA = 'SREW'")\
    .repartition(30, "ob_time")

if debug:
    rain_df.printSchema()
#    rain_df.show()


combined_weather_df = wx_df.join(rain_df, ["SRC_ID", "ob_time"], "inner")\
                           .select(["ob_time", "air_temperature", "wind_speed", "wind_direction", "prcp_amt", "prcp_dur",
                                    year("ob_time").alias("year"),