from pyspark.sql.functions import monotonically_increasing_id


def index(self):
    """Add a unique (monotonically increasing) row ID to the DataGrid."""
    self.dataframe = self.dataframe.withColumn(
        'GridID', monotonically_increasing_id())
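# A minimal usage sketch (the DataGrid wrapper itself is assumed; only the
# Spark behavior is illustrated): monotonically_increasing_id() guarantees
# unique, increasing IDs but NOT consecutive ones -- the partition ID is
# stored in the upper 31 bits, so values jump between partitions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.getOrCreate()
demo = spark.range(4).repartition(2)
demo.withColumn('GridID', monotonically_increasing_id()).show()
# GridID may read 0, 1, 8589934592, 8589934593 rather than 0-3.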
# COMMAND ----------

# Cross-tabulate co-occurrence counts of the two columns
df.stat.crosstab("StockCode", "Quantity").show()

# COMMAND ----------

# Find frequently occurring items in each column
df.stat.freqItems(["StockCode", "Quantity"]).show()

# COMMAND ----------

from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

# COMMAND ----------

from pyspark.sql.functions import col, initcap  # col was missing from the original import
df.select(initcap(col("Description"))).show()

# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
          lower(col("Description")),
          upper(lower(col("Description")))).show(2)
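# COMMAND ----------

# Sketch on a tiny stand-in DataFrame (not the retail `df` above) showing how
# the case functions compose: upper(lower(...)) fully upper-cases the result
# regardless of the original casing.
demo = spark.createDataFrame([("White Hanging Heart",)], ["Description"])
demo.select(lower(col("Description")),
            upper(lower(col("Description")))).show(truncate=False)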
from pyspark.sql import Window
from pyspark.sql import functions as F


def attach_sequence_column(sdf, column_name):
    """Prepend a gap-free, zero-based sequence column to ``sdf``."""
    # scol_for is a project helper that resolves a column name to a Column.
    scols = [scol_for(sdf, column) for column in sdf.columns]
    sequential_index = (
        F.row_number()
        .over(Window.orderBy(F.monotonically_increasing_id()))
        .cast("long") - 1)
    return sdf.select(sequential_index.alias(column_name), *scols)
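# Usage sketch for attach_sequence_column, with a stand-in for the project
# helper scol_for (in Koalas it essentially resolves a name to sdf[name]):
from pyspark.sql import SparkSession


def scol_for(sdf, column_name):  # stand-in for the real helper
    return sdf[column_name]


spark = SparkSession.builder.getOrCreate()
attach_sequence_column(spark.range(3).toDF("value"), "__seq__").show()
# Unlike raw monotonically_increasing_id(), row_number() over that ordering
# yields a gap-free 0, 1, 2, ... index -- at the cost of an unpartitioned
# window, which pulls all rows into a single partition.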
def __init__(
    self,
    spark_frame: spark.DataFrame,
    index_map: Optional[Dict[str, Optional[Tuple]]],
    column_labels: Optional[List[Tuple]] = None,
    data_spark_columns: Optional[List[spark.Column]] = None,
    column_label_names: Optional[List[Optional[Tuple]]] = None,
) -> None:
    """
    Create a new internal immutable DataFrame to manage Spark DataFrame, column fields
    and index fields and names.

    :param spark_frame: Spark DataFrame to be managed.
    :param index_map: dictionary of string pairs
                      Each pair holds the index field name which exists in Spark fields,
                      and the index name.
    :param column_labels: list of tuples with the same length
                          The multi-level values in the tuples.
    :param data_spark_columns: list of Spark Column
                               Spark Columns to appear as columns. If this is None,
                               it is calculated from spark_frame.
    :param column_label_names: Names for each of the column index levels.

    See the examples below to see what each parameter means.

    >>> column_labels = pd.MultiIndex.from_tuples(
    ...     [('a', 'x'), ('a', 'y'), ('b', 'z')],
    ...     names=["column_labels_a", "column_labels_b"])
    >>> row_index = pd.MultiIndex.from_tuples(
    ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
    ...     names=["row_index_a", "row_index_b"])
    >>> kdf = ks.DataFrame(
    ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
    >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
    >>> kdf  # doctest: +NORMALIZE_WHITESPACE
    column_labels_a                  a  b
    column_labels_b                  y  z
    row_index_a row_index_b (a, x)
    foo         bar         1        2  3
                            4        5  6
    zoo         bar         7        8  9

    >>> internal = kdf._internal

    >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    +-----------------+-----------------+------+------+------+...
    |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
    +-----------------+-----------------+------+------+------+...
    |              foo|              bar|     1|     2|     3|...
    |              foo|              bar|     4|     5|     6|...
    |              zoo|              bar|     7|     8|     9|...
    +-----------------+-----------------+------+------+------+...

    >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
    OrderedDict([('__index_level_0__', ('row_index_a',)),
                 ('__index_level_1__', ('row_index_b',)),
                 ('(a, x)', ('a', 'x'))])

    >>> internal._column_labels
    [('a', 'y'), ('b', 'z')]

    >>> internal._data_spark_columns
    [Column<b'(a, y)'>, Column<b'(b, z)'>]

    >>> internal._column_label_names
    [('column_labels_a',), ('column_labels_b',)]
    """
    assert isinstance(spark_frame, spark.DataFrame)
    assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

    if index_map is None:
        assert not any(
            SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns
        ), (
            "Index columns should not appear in columns of the Spark DataFrame. Avoid "
            "index column names [%s]." % SPARK_INDEX_NAME_PATTERN)

        if data_spark_columns is not None:
            spark_frame = spark_frame.select(data_spark_columns)

        # Create default index.
        spark_frame = InternalFrame.attach_default_index(spark_frame)
        index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

        if data_spark_columns is not None:
            data_spark_columns = [
                scol_for(spark_frame, col)
                for col in spark_frame.columns
                if col != SPARK_DEFAULT_INDEX_NAME
            ]

    if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
        spark_frame = spark_frame.withColumn(
            NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id())

    assert isinstance(index_map, OrderedDict), index_map
    assert all(
        isinstance(index_field, str) and is_name_like_tuple(index_name, check_type=True)
        for index_field, index_name in index_map.items()), index_map
    assert data_spark_columns is None or all(
        isinstance(scol, spark.Column) for scol in data_spark_columns)

    self._sdf = spark_frame  # type: spark.DataFrame
    self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

    if data_spark_columns is None:
        index_columns = set(index_column for index_column in self._index_map)
        self._data_spark_columns = [
            scol_for(spark_frame, col)
            for col in spark_frame.columns
            if col not in index_columns and col not in HIDDEN_COLUMNS
        ]
    else:
        self._data_spark_columns = data_spark_columns

    if column_labels is None:
        self._column_labels = [
            (col,) for col in spark_frame.select(self._data_spark_columns).columns
        ]  # type: List[Tuple]
    else:
        assert len(column_labels) == len(self._data_spark_columns), (
            len(column_labels),
            len(self._data_spark_columns),
        )
        if len(column_labels) == 1:
            column_label = column_labels[0]
            assert is_name_like_tuple(column_label, check_type=True), column_label
        else:
            assert all(
                is_name_like_tuple(column_label, check_type=True)
                for column_label in column_labels), column_labels
            assert len(set(len(label) for label in column_labels)) <= 1, column_labels
        self._column_labels = column_labels

    if column_label_names is None:
        self._column_label_names = [None] * column_labels_level(
            self._column_labels)  # type: List[Optional[Tuple]]
    else:
        if len(self._column_labels) > 0:
            assert len(column_label_names) == column_labels_level(self._column_labels), (
                len(column_label_names),
                column_labels_level(self._column_labels),
            )
        else:
            assert len(column_label_names) > 0, len(column_label_names)
        assert all(
            is_name_like_tuple(column_label_name, check_type=True)
            for column_label_name in column_label_names), column_label_names
        self._column_label_names = column_label_names
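# Hedged construction sketch: with index_map=None, the constructor attaches the
# default sequence index plus the hidden natural-order column. The constant
# values "__index_level_0__" and "__natural_order__" are assumptions based on
# the Koalas internals shown above.
internal = InternalFrame(spark.range(3), index_map=None)
internal._sdf.columns
# ['__index_level_0__', 'id', '__natural_order__']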
import os
from datetime import datetime

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType


def process_log_data(spark, input_data, output_data):
    """Pull log data from the Udacity S3 bucket and write the users, time,
    and songplays tables to another bucket as parquet files.
    """
    input_song_data = input_data + 'song_data'
    input_log_data = input_data + 'log_data'

    # get filepath to log data file (local alternative)
    # log_data = get_files('data/log_data')

    # read log data file
    df = spark.read.json(input_log_data).drop_duplicates()
    df.printSchema()

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # USER TABLE
    # extract columns for users table
    df.createOrReplaceTempView("users_table")
    columns = ['userId', 'firstName', 'lastName', 'gender', 'level']

    # write users table to parquet files
    users_table = spark.sql("""
        SELECT userId, firstName, lastName, gender, level
        FROM users_table
    """).toDF(*columns)
    users_table.write.parquet(os.path.join(output_data, "users.parquet"), "overwrite")

    # TIME TABLE
    # create timestamp columns from the original epoch-millisecond column;
    # the integer-valued UDFs need an explicit return type, otherwise the
    # default StringType makes them return null
    get_start_time = udf(
        lambda x: datetime.fromtimestamp(x / 1000.0).strftime('%Y-%m-%d %H:%M:%S'))
    get_hour = udf(lambda x: datetime.fromtimestamp(x / 1000.0).hour, IntegerType())
    get_day = udf(lambda x: datetime.fromtimestamp(x / 1000.0).day, IntegerType())
    get_week = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime('%W'))
    get_month = udf(lambda x: datetime.fromtimestamp(x / 1000.0).month, IntegerType())
    get_year = udf(lambda x: datetime.fromtimestamp(x / 1000.0).year, IntegerType())
    get_weekday = udf(
        lambda x: datetime.fromtimestamp(x / 1000.0).strftime('%A'))

    df = df.withColumn('start_time', get_start_time(df['ts']))
    df = df.withColumn('hour', get_hour(df['ts']))
    df = df.withColumn('day', get_day(df['ts']))
    df = df.withColumn('week', get_week(df['ts']))
    df = df.withColumn('month', get_month(df['ts']))
    df = df.withColumn('year', get_year(df['ts']))
    df = df.withColumn('week_day', get_weekday(df['ts']))

    df.createOrReplaceTempView("time_table")
    df.printSchema()
    columns = ['start_time', 'hour', 'day', 'week', 'month', 'year', 'week_day']

    # extract columns to create time table
    time_table = spark.sql("""
        SELECT start_time, hour, day, week, month, year, week_day
        FROM time_table
    """).toDF(*columns)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "time.parquet"), "overwrite")

    # read in song data to use for songplays table
    # add autoincrement column
    df = df.withColumn('songplay_id', F.monotonically_increasing_id())
    df.createOrReplaceTempView("songplays_table")

    # get song df
    # song_data = get_files('data/song_data')
    song_data = input_song_data
    song_df = spark.read.json(song_data).drop_duplicates()
    song_df.createOrReplaceTempView("songs_table")

    columns = [
        'songplay_id', 'start_time', 'userId', 'level', 'sessionId',
        'location', 'userAgent', 'year', 'month', 'length',
        'song_id', 'artist_id', 'title', 'artist_name', 'duration'
    ]

    # extract columns to create songplays table
    songplays_table = spark.sql("""
        SELECT sp.songplay_id, sp.start_time, sp.userId, sp.level, sp.sessionId,
               sp.location, sp.userAgent, sp.year, sp.month, sp.length,
               s.song_id, s.artist_id, s.title, s.artist_name, s.duration
        FROM songplays_table AS sp
        JOIN songs_table AS s
          ON sp.song = s.title
         AND sp.artist = s.artist_name
         AND sp.length = s.duration
    """).toDF(*columns)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "songplays.parquet"), "overwrite")
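# Alternative sketch: the same time fields derived with built-in column
# functions instead of Python UDFs, which avoids per-row Python serialization.
# Here `df` stands for the filtered log-data frame from the function above;
# casting the epoch value to timestamp interprets it as seconds, and
# weekofyear() uses ISO week numbering, which can differ slightly from
# strftime('%W').
from pyspark.sql import functions as F

ts_col = (F.col('ts') / 1000).cast('timestamp')
df_times = (df
            .withColumn('start_time', F.date_format(ts_col, 'yyyy-MM-dd HH:mm:ss'))
            .withColumn('hour', F.hour(ts_col))
            .withColumn('day', F.dayofmonth(ts_col))
            .withColumn('week', F.weekofyear(ts_col))
            .withColumn('month', F.month(ts_col))
            .withColumn('year', F.year(ts_col))
            .withColumn('week_day', F.date_format(ts_col, 'EEEE')))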
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, BisectingKMeans
from pyspark.sql.functions import monotonically_increasing_id
import re

################################################################################################
#
#   Import Rawdata
#
################################################################################################

rawdata = spark.read.load("hdfs://sandbox.hortonworks.com:8020/tmp/airlines.csv",
                          format="csv", header=True)
rawdata = rawdata.fillna({'review': ''})                               # Replace nulls with blank string
rawdata = rawdata.withColumn("uid", monotonically_increasing_id())     # Create Unique ID
rawdata = rawdata.withColumn("year_month", rawdata.date.substr(1, 7))  # Generate YYYY-MM variable

# Show rawdata (as DataFrame)
rawdata.show(10)

# Print data types
for dtype in rawdata.dtypes:
    print(dtype)

target = rawdata.select(rawdata['rating'].cast(IntegerType()))
target.dtypes

################################################################################################
#
#   Text Pre-processing (consider using one or all of the following):