Example #1
File: datagrid.py  Project: ribonj/lsir
 def index(self):
     """
     Add a unique (monotonically increasing) row ID to the DataGrid
     """
     self.dataframe = self.dataframe.withColumn(
         'GridID', 
         monotonically_increasing_id())
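
A minimal, self-contained sketch of the pattern above (the SparkSession and toy data are assumptions, not part of the original project). Note that monotonically_increasing_id() guarantees unique, increasing IDs but not consecutive ones, since the partition ID is encoded in the upper bits of each 64-bit value:

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.getOrCreate()

# Toy DataFrame standing in for self.dataframe in the DataGrid example.
sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])

# Same pattern as DataGrid.index(): append a unique (non-consecutive) row ID.
sdf.withColumn("GridID", monotonically_increasing_id()).show()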

# COMMAND ----------

df.stat.crosstab("StockCode", "Quantity").show()


# COMMAND ----------

df.stat.freqItems(["StockCode", "Quantity"]).show()


# COMMAND ----------

from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap, col
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)
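

# COMMAND ----------

# A hedged, self-contained illustration of the three case functions above.
# Assumes the notebook's implicit `spark` session; the one-row "Description"
# value below is made up for demonstration purposes.
toy = spark.createDataFrame([("WHITE HANGING HEART T-LIGHT HOLDER",)], ["Description"])
toy.select(
    initcap(col("Description")),
    lower(col("Description")),
    upper(col("Description"))).show(truncate=False)
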
Example #3
 def attach_sequence_column(sdf, column_name):
     scols = [scol_for(sdf, column) for column in sdf.columns]
     sequential_index = (F.row_number().over(
         Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1)
     return sdf.select(sequential_index.alias(column_name), *scols)
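
Example #3 turns the sparse IDs from monotonically_increasing_id() into a consecutive, zero-based sequence by ranking them with row_number(). A standalone sketch of the same idea, written without the Koalas-internal scol_for helper (the toy data and column name are assumptions). Note that the unpartitioned Window moves all rows into a single partition, so this approach is best suited to small or moderate data:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("x",), ("y",), ("z",)], ["value"])

# row_number() over a global ordering of the sparse IDs yields 1, 2, 3, ...;
# subtracting 1 gives a zero-based sequence.
sequential_index = (F.row_number().over(
    Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1)
sdf.select(sequential_index.alias("sequence_col"), *[sdf[c] for c in sdf.columns]).show()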
Example #4
    def __init__(
        self,
        spark_frame: spark.DataFrame,
        index_map: Optional[Dict[str, Optional[Tuple]]],
        column_labels: Optional[List[Tuple]] = None,
        data_spark_columns: Optional[List[spark.Column]] = None,
        column_label_names: Optional[List[Optional[Tuple]]] = None,
    ) -> None:
        """
        Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
        index fields and names.

        :param spark_frame: Spark DataFrame to be managed.
        :param index_map: dictionary of string pairs
                           Each entry maps an index field name, which must exist among the
                           Spark fields, to the corresponding index name.
        :param column_labels: list of tuples with the same length
                              The multi-level values in the tuples.
        :param data_spark_columns: list of Spark Column
                                   Spark Columns to appear as columns. If this is None, it is
                                   calculated from spark_frame.
        :param column_label_names: Names for each of the column index levels.

        See the examples below for what each parameter means.

        >>> column_labels = pd.MultiIndex.from_tuples(
        ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
        >>> row_index = pd.MultiIndex.from_tuples(
        ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
        ...     names=["row_index_a", "row_index_b"])
        >>> kdf = ks.DataFrame(
        ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
        >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
        >>> kdf  # doctest: +NORMALIZE_WHITESPACE
        column_labels_a                  a  b
        column_labels_b                  y  z
        row_index_a row_index_b (a, x)
        foo         bar         1       2  3
                                4       5  6
        zoo         bar         7       8  9

        >>> internal = kdf._internal

        >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
        +-----------------+-----------------+------+------+------+...
        |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
        +-----------------+-----------------+------+------+------+...
        |              foo|              bar|     1|     2|     3|...
        |              foo|              bar|     4|     5|     6|...
        |              zoo|              bar|     7|     8|     9|...
        +-----------------+-----------------+------+------+------+...

        >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
        OrderedDict([('__index_level_0__', ('row_index_a',)),
         ('__index_level_1__', ('row_index_b',)),
         ('(a, x)', ('a', 'x'))])

        >>> internal._column_labels
        [('a', 'y'), ('b', 'z')]

        >>> internal._data_spark_columns
        [Column<b'(a, y)'>, Column<b'(b, z)'>]

        >>> internal._column_label_names
        [('column_labels_a',), ('column_labels_b',)]
        """

        assert isinstance(spark_frame, spark.DataFrame)
        assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

        if index_map is None:
            assert not any(
                SPARK_INDEX_NAME_PATTERN.match(name)
                for name in spark_frame.columns
            ), ("Index columns should not appear in columns of the Spark DataFrame. Avoid "
                "index column names [%s]." % SPARK_INDEX_NAME_PATTERN)

            if data_spark_columns is not None:
                spark_frame = spark_frame.select(data_spark_columns)

            # Create default index.
            spark_frame = InternalFrame.attach_default_index(spark_frame)
            index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

            if data_spark_columns is not None:
                data_spark_columns = [
                    scol_for(spark_frame, col) for col in spark_frame.columns
                    if col != SPARK_DEFAULT_INDEX_NAME
                ]

        if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
            spark_frame = spark_frame.withColumn(
                NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id())

        assert isinstance(index_map, OrderedDict), index_map
        assert all(
            isinstance(index_field, str)
            and is_name_like_tuple(index_name, check_type=True)
            for index_field, index_name in index_map.items()), index_map
        assert data_spark_columns is None or all(
            isinstance(scol, spark.Column) for scol in data_spark_columns)

        self._sdf = spark_frame  # type: spark.DataFrame
        self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

        if data_spark_columns is None:
            index_columns = set(index_column
                                for index_column in self._index_map)
            self._data_spark_columns = [
                scol_for(spark_frame, col) for col in spark_frame.columns
                if col not in index_columns and col not in HIDDEN_COLUMNS
            ]
        else:
            self._data_spark_columns = data_spark_columns

        if column_labels is None:
            self._column_labels = [
                (col, )
                for col in spark_frame.select(self._data_spark_columns).columns
            ]  # type: List[Tuple]
        else:
            assert len(column_labels) == len(self._data_spark_columns), (
                len(column_labels),
                len(self._data_spark_columns),
            )
            if len(column_labels) == 1:
                column_label = column_labels[0]
                assert is_name_like_tuple(column_label,
                                          check_type=True), column_label
            else:
                assert all(
                    is_name_like_tuple(column_label, check_type=True)
                    for column_label in column_labels), column_labels
                assert len(set(len(label)
                               for label in column_labels)) <= 1, column_labels
            self._column_labels = column_labels

        if column_label_names is None:
            self._column_label_names = [None] * column_labels_level(
                self._column_labels)  # type: List[Optional[Tuple]]
        else:
            if len(self._column_labels) > 0:
                assert len(column_label_names) == column_labels_level(
                    self._column_labels), (
                        len(column_label_names),
                        column_labels_level(self._column_labels),
                    )
            else:
                assert len(column_label_names) > 0, len(column_label_names)
            assert all(
                is_name_like_tuple(column_label_name, check_type=True) for
                column_label_name in column_label_names), column_label_names
            self._column_label_names = column_label_names
Example #5
File: etl.py  Project: dbdeep/Sparkify
def process_log_data(spark, input_data, output_data):
    """
    pulls log data from udacity s3 bucket, and writes to another bucket as a parquet file
    :rtype: object
    """
    input_song_data = input_data + 'song_data'
    input_log_data = input_data + 'log_data'

    # get filepath to log data file (local-file helper, unused in this S3 version)
    # log_data = get_files('data/log_data')

    # read log data file
    df = spark.read.json(input_log_data).drop_duplicates()
    df.printSchema()
    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # USER TABLE
    # extract columns for users table
    df.createOrReplaceTempView("users_table")
    columns = ['userId', 'firstName', 'lastName', 'gender', 'level']

    # write users table to parquet files
    users_table = spark.sql("""
    SELECT userId, firstName, lastName, gender, level
    FROM users_table
    """).toDF(*columns)

    users_table.write.parquet(os.path.join(output_data, "users.parquet"),
                              "overwrite")

    # TIME TABLE
    # create timestamp column from original timestamp column
    get_start_time = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime(
        '%Y-%m-%d %H:%M:%S'))
    get_hour = udf(lambda x: datetime.fromtimestamp(x / 1000.0).hour)
    get_day = udf(lambda x: datetime.fromtimestamp(x / 1000.0).day)
    get_week = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime('%W'))
    get_month = udf(lambda x: datetime.fromtimestamp(x / 1000.0).month)
    get_year = udf(lambda x: datetime.fromtimestamp(x / 1000.0).year)
    get_weekday = udf(
        lambda x: datetime.fromtimestamp(x / 1000.0).strftime('%A'))

    df = df.withColumn('start_time', get_start_time(df['ts']))
    df = df.withColumn('hour', get_hour(df['ts']))
    df = df.withColumn('day', get_day(df['ts']))
    df = df.withColumn('week', get_week(df['ts']))
    df = df.withColumn('month', get_month(df['ts']))
    df = df.withColumn('year', get_year(df['ts']))
    df = df.withColumn('week_day', get_weekday(df['ts']))

    df.createOrReplaceTempView("time_table")
    df.printSchema()

    columns = [
        'start_time', 'hour', 'day', 'week', 'month', 'year', 'week_day'
    ]

    # extract columns to create time table
    time_table = spark.sql("""
    SELECT start_time, hour, day, week, month, year, week_day
    FROM time_table
""").toDF(*columns)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "time.parquet"), "overwrite")

    # read in song data to use for songplays table

    # add autoincrement column
    df = df.withColumn('songplay_id', F.monotonically_increasing_id())
    df.createOrReplaceTempView("songplays_table")

    # get song df
    # song_data = get_files('data/song_data')
    song_data = input_song_data
    song_df = spark.read.json(song_data).drop_duplicates()
    song_df.createOrReplaceTempView("songs_table")

    columns = [
        'songplay_id', 'start_time', 'userId', 'level', 'sessionId',
        'location', 'userAgent', 'year', 'month', 'length', 'song_id',
        'artist_id', 'title', 'artist_name', 'duration'
    ]

    # extract columns to create songplays table
    songplays_table = spark.sql("""
            SELECT sp.songplay_id, sp.start_time, sp.userId, sp.level, sp.sessionId, sp.location, sp.userAgent, sp.year, 
            sp.month, sp.length, s.song_id, s.artist_id, s.title, s.artist_name, s.duration
            FROM songplays_table AS sp 
            JOIN songs_table AS s ON sp.song = s.title AND sp.artist = s.artist_name AND sp.length = s.duration
        """).toDF(*columns)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "songplays.parquet"), "overwrite")
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, BisectingKMeans
from pyspark.sql.functions import monotonically_increasing_id
import re

################################################################################################
#
#   Import Rawdata
#
################################################################################################

rawdata = spark.read.load("hdfs://sandbox.hortonworks.com:8020/tmp/airlines.csv", format="csv", header=True)
rawdata = rawdata.fillna({'review': ''})                               # Replace nulls with blank string
rawdata = rawdata.withColumn("uid", monotonically_increasing_id())     # Create Unique ID
rawdata = rawdata.withColumn("year_month", rawdata.date.substr(1,7))   # Generate YYYY-MM variable

# Show rawdata (as DataFrame)
rawdata.show(10)

# Print data types
for dtype in rawdata.dtypes:
    print(dtype)

target = rawdata.select(rawdata['rating'].cast(IntegerType()))
target.dtypes

################################################################################################
#
#   Text Pre-processing (consider using one or all of the following):