Example No. 1
    def test_np_spark_compat_frame(self):
        # Use randomly generated dataFrame
        pdf = pd.DataFrame(np.random.randint(-100,
                                             100,
                                             size=(np.random.randint(100), 2)),
                           columns=["a", "b"])
        pdf2 = pd.DataFrame(np.random.randint(-100,
                                              100,
                                              size=(len(pdf),
                                                    len(pdf.columns))),
                            columns=["a", "b"])
        kdf = ks.from_pandas(pdf)
        kdf2 = ks.from_pandas(pdf2)

        for np_name, spark_func in unary_np_spark_mappings.items():
            np_func = getattr(np, np_name)
            if np_name not in self.blacklist:
                try:
                    # unary ufunc
                    self.assert_eq(np_func(pdf), np_func(kdf), almost=True)
                except Exception as e:
                    raise AssertionError("Test in '%s' function was failed." %
                                         np_name) from e

        for np_name, spark_func in binary_np_spark_mappings.items():
            np_func = getattr(np, np_name)
            if np_name not in self.blacklist:
                try:
                    # binary ufunc
                    self.assert_eq(np_func(pdf, pdf),
                                   np_func(kdf, kdf),
                                   almost=True)
                    self.assert_eq(np_func(pdf, 1),
                                   np_func(kdf, 1),
                                   almost=True)
                except Exception as e:
                    raise AssertionError("Test in '%s' function was failed." %
                                         np_name) from e

        # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
        try:
            set_option("compute.ops_on_diff_frames", True)
            for np_name, spark_func in list(
                    binary_np_spark_mappings.items())[:5]:
                np_func = getattr(np, np_name)
                if np_name not in self.blacklist:
                    try:
                        # binary ufunc
                        self.assert_eq(
                            np_func(pdf, pdf2).sort_index(),
                            np_func(kdf, kdf2).sort_index(),
                            almost=True,
                        )

                    except Exception as e:
                        raise AssertionError(
                            "Test in '%s' function was failed." %
                            np_name) from e
        finally:
            reset_option("compute.ops_on_diff_frames")
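The try/finally around set_option/reset_option above is the pattern for temporarily enabling 'compute.ops_on_diff_frames'. Koalas also ships an option_context context manager that restores the option automatically; the sketch below is a minimal illustration with toy frames (the frame contents are assumptions, not taken from the test):

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]})
kdf2 = ks.DataFrame({'a': [10, 20, 30]})

# The option is enabled only inside the block and restored on exit,
# mirroring the try/finally around set_option/reset_option above.
with ks.option_context('compute.ops_on_diff_frames', True):
    result = (kdf['a'] + kdf2['a']).sort_index()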
Example No. 2
    def __init__(
        self,
        args,
        task_imls=None,
        input_format='dataframe',
        synonmys=['piano', 'rice', 'laptop'],
        output_pid_folder=False
    ):

        self.spark = spark_init(args.pid)
        if input_format == 'koalas':
            ks.set_option('compute.default_index_type', 'distributed')
        path_dict = {
            'review': args.review_filename,
            'product': args.product_filename,
            'product_processed': args.product_processed_filename,
            'ml_features_train': args.ml_features_train_filename,
            'ml_features_test': args.ml_features_test_filename
            }

        self.task_imls = task_imls
        self.tests = PA2Test(self.spark, args.test_results_root)
        if output_pid_folder:
            output_root = os.path.join(args.output_root, args.pid)
        else:
            output_root = args.output_root
        self.data_io = PA2Data(self.spark, path_dict, output_root, 
                               deploy=True, input_format=input_format)

        self.data_dict, self.count_dict = self.data_io.load_all(
            input_format=input_format, no_cache=True)
        self.task_names = TASK_NAMES
        self.synonmys = synonmys
Example No. 3
    def test_get_set_reset_option(self):
        self.assertEqual(ks.get_option('test.config'), 'default')

        ks.set_option('test.config', 'value')
        self.assertEqual(ks.get_option('test.config'), 'value')

        ks.reset_option('test.config')
        self.assertEqual(ks.get_option('test.config'), 'default')
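The 'test.config' key above is a fixture registered only for the test suite. The same get/set/reset cycle works with Koalas' built-in options; a minimal sketch using 'display.max_rows' (the value 100 is just illustrative):

import databricks.koalas as ks

ks.set_option('display.max_rows', 100)           # cap the number of rows shown in repr output
assert ks.get_option('display.max_rows') == 100
ks.reset_option('display.max_rows')              # back to the library default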
Example No. 4
    def test_unknown_option(self):
        with self.assertRaisesRegex(config.OptionError, 'No such option'):
            ks.get_option('unknown')

        with self.assertRaisesRegex(config.OptionError, "Available options"):
            ks.set_option('unknown', 'value')

        with self.assertRaisesRegex(config.OptionError, "test.config"):
            ks.reset_option('unknown')
Example No. 5
    def test_unknown_option(self):
        with self.assertRaisesRegex(config.OptionError, 'No such key'):
            ks.get_option('unknown')

        with self.assertRaisesRegex(config.OptionError, "No such key"):
            ks.set_option('unknown', 'value')

        with self.assertRaisesRegex(config.OptionError, "No such key"):
            ks.reset_option('unknows')
Example No. 6
    def test_get_set_reset_option_different_types(self):
        ks.set_option('test.config.list', [1, 2, 3, 4])
        self.assertEqual(ks.get_option('test.config.list'), [1, 2, 3, 4])
        ks.set_option('test.config.list', None)
        self.assertEqual(ks.get_option('test.config.list'), None)

        ks.set_option('test.config.float', None)
        self.assertEqual(ks.get_option('test.config.float'), None)
        ks.set_option('test.config.float', 5.0)
        self.assertEqual(ks.get_option('test.config.float'), 5.0)

        ks.set_option('test.config.int', 123)
        self.assertEqual(ks.get_option('test.config.int'), 123)
Example No. 7
    def test_different_types(self):
        with self.assertRaisesRegex(ValueError, "was <class 'int'>"):
            ks.set_option('test.config.list', 1)

        with self.assertRaisesRegex(ValueError, "however, expected types are"):
            ks.set_option('test.config.float', 'abc')

        with self.assertRaisesRegex(ValueError, "[<class 'int'>]"):
            ks.set_option('test.config.int', 'abc')

        with self.assertRaisesRegex(ValueError,
                                    "(<class 'int'>, <class 'NoneType'>)"):
            ks.set_option('test.config.int.none', 'abc')
Example No. 8
    def test_different_types(self):
        with self.assertRaisesRegex(
                TypeError, "The configuration value for 'test.config'"):
            ks.set_option('test.config', 1)

        with self.assertRaisesRegex(TypeError, "was <class 'int'>"):
            ks.set_option('test.config.list', 1)

        with self.assertRaisesRegex(TypeError,
                                    "however, <class 'float'> is expected."):
            ks.set_option('test.config.float', 'abc')

        with self.assertRaisesRegex(TypeError,
                                    "however, <class 'int'> is expected."):
            ks.set_option('test.config.int', 'abc')
Example No. 9
    def test_check_func(self):
        with self.assertRaisesRegex(ValueError, "bigger then 0"):
            ks.set_option('test.config.int', -1)
Example No. 10
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession
from datetime import datetime

ks.set_option('compute.default_index_type', 'distributed')

# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

## Load the dataset
df = ks.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

## Transform the dataset: 1 row per date/country
format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = ks.to_datetime(df['DateTime'], format=format)
df['DateTime'] = df['DateTime'].dt.normalize()

#### First change: allow operations on two different dataframes (ks.set_option('compute.ops_on_diff_frames', True))
#### or do a groupby on the column (behaviour slightly different from pandas, since the group_by column becomes an index and disappears from the projection)
# df = df.sort_values('DateTime').groupby(df['DateTime']).max()
df = df.sort_values('DateTime').groupby(['DateTime'], as_index=False).max()
# df = df.set_index(pd.DatetimeIndex(df['DateTime'])).drop(['DateTime'], axis=1)
df = df.set_index('DateTime')
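The first alternative mentioned in the comment above, operating on two different frames, requires 'compute.ops_on_diff_frames'. A minimal standalone sketch with toy data (the frames below are assumptions, not the YouGov dataset):

kdf_a = ks.DataFrame({'x': [1, 2, 3]})
kdf_b = ks.DataFrame({'x': [10, 20, 30]})

ks.set_option('compute.ops_on_diff_frames', True)    # allow column operations across two frames
combined = kdf_a['x'] + kdf_b['x']
ks.reset_option('compute.ops_on_diff_frames')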
Example No. 11
def process_log_data(spark, input_data, output_data):
    """process log_data to create users, time ,songsplay table"""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays
    df = log_kdf.filter(log_kdf.page == "NextSong")

    # extract columns for users table
    users_table = ks.sql(""" SELECT 
                           DISTINCT
                           userId,
                           firstName,
                           lastName,
                           gender,
                           level 
                           FROM {df}""")

    # write users table to parquet files
    (users_table.to_spark().write.parquet(f'{output_data}/users',
                                          mode="overwrite"))

    # create timestamp column from original timestamp column
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ns')

    # create datetime column from original timestamp column
    df['datetime'] = ks.to_datetime(df['ts'])

    # extract columns to create time table
    time_table = (ks.sql("""
            SELECT
            DISTINCT
           datetime as start_time,
           extract(day from datetime) as day,
           extract(week from datetime) as week,
           extract(month from datetime) as month,
           extract(year from datetime) as year,
           extract(hour from datetime) as hour
           from {df}
                        """))

    # to enable join on table
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday columns
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    (time_table.to_spark().write.partitionBy('year', 'month').parquet('time/'))

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'])

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (ks.sql(""" SELECT 
                             DISTINCT
                             row_number() over (ORDER BY e.userId) songplay_id,
                             e.ts AS start_time,
                             extract(month from e.ts) as month,
                             extract(year from e.ts) as year,
                             e.userId AS user_id,
                             e.level AS level,
                             s.song_id AS song_id,
                             s.artist_id AS artist_id,
                             e.sessionId as session_id,
                             e.location AS location,
                             e.userAgent AS user_agent
                             FROM {log_kdf} as e join {song_df} as s ON
                             (e.artist = s.artist_name AND 
                             e.song = s.title AND 
                             e.length= s.duration)
                             WHERE e.page='NextSong'

             """))

    # write songplays table to parquet files partitioned by year and month
    (songplays_table.to_spark().write.partitionBy("year", "month").parquet(
        f'{output_data}/songplayes', mode="overwrite"))