def test_read_csv_with_dtype(self):
    """ks.read_csv with dtype= (scalar and per-column dict) matches pandas."""
    with self.csv_file(self.csv_text) as fn:
        # Baseline: no dtype argument.
        self.assert_eq(ks.read_csv(fn), pd.read_csv(fn), almost=True)
        # Scalar dtype applied to every column.
        self.assert_eq(ks.read_csv(fn, dtype=str), pd.read_csv(fn, dtype=str))
        # Per-column dtype mapping.
        dtypes = {'amount': 'int64'}
        self.assert_eq(ks.read_csv(fn, dtype=dtypes),
                       pd.read_csv(fn, dtype=dtypes))
def get_gender_feature():
    """Compute, per creative_id, the share of clicks from each gender and
    write the pivoted distribution to ./data/gender_dist as a single CSV.

    Fixes over the original: locals were misnamed ``age_*`` inside a gender
    feature function, and a discarded ``head(10)`` debugging call was removed.
    """
    train_user = ks.read_csv("data/train_preliminary/user.csv")
    train_click_log = ks.read_csv("data/train_preliminary/click_log.csv")
    train_data = train_user.merge(train_click_log, on="user_id", how='inner')
    # Total clicks per (creative_id, gender); nvl() guards NULL click_times.
    sql = ''' select creative_id, gender, sum(nvl(click_times, 0)) click_times from {train_data} group by creative_id, gender '''
    gender_data = ks.sql(sql, train_data=train_data)
    gender_data.cache()
    # Each gender's fraction of a creative's total clicks; the unbounded
    # window sums click_times over all genders of the same creative_id.
    sql = ''' SELECT creative_id, gender, click_times / sum(click_times) OVER (PARTITION BY creative_id ORDER BY click_times DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) gender_dist FROM {gender_data} '''
    gender_dist_data = ks.sql(sql, gender_data=gender_data)
    gender_dist_data.cache()
    # One row per creative_id with a column per gender value (1 and 2).
    gender_dist_pivot = gender_dist_data.pivot(
        index='creative_id', columns='gender', values='gender_dist')
    gender_dist_pivot.columns = ['gender_' + str(ele) for ele in range(1, 3)]
    gender_dist_pivot = gender_dist_pivot.reset_index()
    # A creative with clicks from only one gender gets 0 for the other.
    gender_dist_pivot.fillna(0, inplace=True)
    gender_dist_pivot.to_csv('./data/gender_dist', num_files=1)
def test_read_csv_with_comment(self):
    """'#' comment lines behave like pandas; invalid comment args raise."""
    with self.csv_file(self.csv_text_with_comments) as fn:
        expected = pd.read_csv(fn, comment="#")
        actual = ks.read_csv(fn, comment="#")
        self.assertPandasAlmostEqual(expected, actual.toPandas())

        message = "Only length-1 comment characters supported"
        # Multi-char / empty strings only fail once the read is triggered.
        for bad in ("", "##"):
            self.assertRaisesRegex(
                ValueError, message,
                lambda c=bad: ks.read_csv(fn, comment=c).show())
        # Non-string values fail eagerly.
        for bad in (1, [1]):
            self.assertRaisesRegex(
                ValueError, message,
                lambda c=bad: ks.read_csv(fn, comment=c))
def get_test_data():
    """Build the merged test set: click log joined with ad metadata.

    Returns:
        koalas DataFrame of test clicks enriched with ad attributes,
        joined on ``creative_id``.
    """
    clicks = ks.read_csv("../data/test/click_log.csv")
    ads = ks.read_csv("../data/test/ad.csv")
    return clicks.merge(ads, on="creative_id")
def extract_data():
    """Read the raw disaster-response CSVs.

    Returns:
        tuple: ``(messages, categories)`` koalas DataFrames.
    """
    messages = ks.read_csv('../data/raw_data/disaster_messages.csv')
    categories = ks.read_csv('../data/raw_data/disaster_categories.csv')
    return messages, categories
def test_read_csv_with_squeeze(self):
    """squeeze=True matches pandas for one usecol (Series) and two (frame)."""
    with self.csv_file(self.csv_text) as fn:
        for cols in (["name"], ["name", "amount"]):
            expected = pd.read_csv(fn, squeeze=True, usecols=cols)
            actual = ks.read_csv(fn, squeeze=True, usecols=cols)
            self.assert_eq(expected, actual, almost=True)
def test_read_csv(self):
    """ks.read_csv agrees with pd.read_csv over header/names/usecols
    combinations, and raises pandas-compatible ValueErrors on bad args."""
    with self.csv_file(self.csv_text) as fn:

        def check(header='infer', names=None, usecols=None):
            # Compare koalas output to pandas for identical keyword args.
            expected = pd.read_csv(fn, header=header, names=names, usecols=usecols)
            actual = ks.read_csv(fn, header=header, names=names, usecols=usecols)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

        check()
        check(header=None)
        check(header=0)
        check(names=['n', 'a'])
        check(header=0, names=['n', 'a'])
        check(usecols=[1])
        check(usecols=[1, 0])
        check(usecols=['amount'])
        check(usecols=['amount', 'name'])
        check(usecols=[])
        # Duplicate usecols entries must be tolerated.
        check(usecols=[1, 1])
        check(usecols=['amount', 'amount'])
        check(names=['n', 'a'], usecols=['a'])

        # check with pyspark patch.
        expected = pd.read_csv(fn)
        actual = ks.read_csv(fn)
        self.assertPandasAlmostEqual(expected, actual.toPandas())

        # Invalid-argument paths: pandas-style error messages are required.
        self.assertRaisesRegex(ValueError, 'non-unique',
                               lambda: ks.read_csv(fn, names=['n', 'n']))
        self.assertRaisesRegex(
            ValueError, 'does not match the number.*3',
            lambda: ks.read_csv(fn, names=['n', 'a', 'b']))
        self.assertRaisesRegex(
            ValueError, 'does not match the number.*3',
            lambda: ks.read_csv(fn, header=0, names=['n', 'a', 'b']))
        self.assertRaisesRegex(ValueError, 'Usecols do not match.*3',
                               lambda: ks.read_csv(fn, usecols=[1, 3]))
        self.assertRaisesRegex(
            ValueError, 'Usecols do not match.*col',
            lambda: ks.read_csv(fn, usecols=['amount', 'col']))
        self.assertRaisesRegex(ValueError, 'Unknown header argument 1',
                               lambda: ks.read_csv(fn, header='1'))
        expected_error_message = (
            "'usecols' must either be list-like of all strings, "
            "all unicode, all integers or a callable.")
        self.assertRaisesRegex(
            ValueError, expected_error_message,
            lambda: ks.read_csv(fn, usecols=[1, 'amount']))

        # check with index_col
        expected = pd.read_csv(fn).set_index('name')
        actual = ks.read_csv(fn, index_col='name')
        self.assertPandasAlmostEqual(expected, actual.toPandas())
def get_train_data():
    """Build the merged training set: users x click log x ad metadata.

    Returns:
        koalas DataFrame from inner-joining users with their clicks on
        ``user_id``, then with ad attributes on ``creative_id``.
    """
    users = ks.read_csv("../data/train_preliminary/user.csv")
    clicks = ks.read_csv("../data/train_preliminary/click_log.csv")
    ads = ks.read_csv("../data/train_preliminary/ad.csv")
    user_clicks = users.merge(clicks, on="user_id", how='inner')
    return user_clicks.merge(ads, on="creative_id", how='inner')
def test_read_csv_with_comment(self):
    """'#' comment lines behave like pandas; invalid comment args raise."""
    with self.csv_file(self.csv_text_with_comments) as fn:
        expected = pd.read_csv(fn, comment='#')
        actual = koalas.read_csv(fn, comment='#')
        self.assertPandasAlmostEqual(expected, actual.toPandas())

        message = 'Only length-1 comment characters supported'
        # Multi-char / empty strings only fail once the read is triggered.
        for bad in ('', '##'):
            self.assertRaisesRegex(
                ValueError, message,
                lambda c=bad: koalas.read_csv(fn, comment=c).show())
        # Non-string values fail eagerly.
        for bad in (1, [1]):
            self.assertRaisesRegex(
                ValueError, message,
                lambda c=bad: koalas.read_csv(fn, comment=c))
def combine_feature(train=True):
    """Join user ids with word2vec, NN and stats features; write one CSV.

    Args:
        train: When True read the training user table; otherwise derive the
            distinct user ids from the test click log.
    """
    # NOTE(review): path prefixes are inconsistent ('data/...' vs './data/...')
    # and the stats part-file name is hard-coded — confirm against the pipeline.
    user_filename = 'data/train_preliminary/user.csv' if train else './data/test/click_log.csv'
    result_filename = './data/combine_feature' if train else './data/combine_feature_test'
    user_df = ks.read_csv(user_filename)
    if not train:
        # The click log has one row per click; keep unique users only.
        user_df = ks.sql('select distinct user_id from {user_df}', user_df=user_df)
    wv_feature = ks.read_csv('data/wv_features.csv')
    nn_feature = ks.read_csv('data/nn_features.csv')
    stats_data = ks.read_csv(
        "data/stats_features/part-00000-f6695da4-6d9f-4ba4-80b1-d370e636696b-c000.csv"
    )
    merged = user_df.merge(wv_feature, on='user_id')
    merged = merged.merge(nn_feature, on='user_id')
    all_features = merged.merge(stats_data, on='user_id')
    print(all_features.shape)
    all_features.to_csv(result_filename, num_files=1)
def check(header="infer", names=None, usecols=None, index_col=None):
    """Assert ks.read_csv matches pd.read_csv for the given keyword args."""
    kwargs = dict(header=header, names=names, usecols=usecols, index_col=index_col)
    expected = pd.read_csv(fn, **kwargs)
    actual = ks.read_csv(fn, **kwargs)
    self.assert_eq(expected, actual, almost=True)
def get_dataframe_koalas(name, apikey="WCXVE7BAD668SJHL"):
    """Fetch daily prices for ticker *name* from Alpha Vantage as koalas.

    Args:
        name: Stock ticker symbol to query.
        apikey: Alpha Vantage API key. NOTE(security): the default is the
            key that was hard-coded in the original source; pass your own
            key (or load it from configuration) rather than relying on it.

    Returns:
        koalas DataFrame indexed and sorted by "Date" (renamed from the
        API's "timestamp" column), with "open", "low", "high" and "volume"
        dropped — only the close price remains.
    """
    url = ("https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="
           + name + "&apikey=" + apikey + "&datatype=csv")
    df = ks.read_csv(url)
    df = df.rename(columns={"timestamp": "Date"})
    df = df.set_index(df["Date"])
    df = df.sort_index()
    df = df.drop(columns=["open", "low", "high", "volume"])
    return df
def check(header="infer", names=None, usecols=None):
    """Assert ks.read_csv matches pd.read_csv for the given keyword args."""
    kwargs = dict(header=header, names=names, usecols=usecols)
    expected = pd.read_csv(fn, **kwargs)
    actual = ks.read_csv(fn, **kwargs)
    self.assertPandasAlmostEqual(expected, actual.toPandas())
def get_ad_dict():
    """Build a deduplicated ad dictionary over train + test ad metadata.

    Concatenates both ad tables, drops exact duplicates, numbers rows
    within each (product_id, product_category, advertiser_id, industry)
    group, prints cardinalities, and writes the result to ../data/ad_info.
    """
    train_ad = ks.read_csv("../data/train_preliminary/ad.csv")
    test_ad = ks.read_csv("../data/test/ad.csv")
    combined = ks.concat([train_ad, test_ad], axis=0)
    ad_info = combined.drop_duplicates()
    # row_number() disambiguates creatives sharing the same ad attributes.
    ad_dict_sql = ''' select creative_id, product_id, product_category, advertiser_id, industry, row_number() over (partition by product_id, product_category,advertiser_id,industry order by 1 desc) ad_rn from {ad_info} '''
    ad_info = ks.sql(ad_dict_sql, ad_info=ad_info)
    print(ad_info.nunique())
    ad_info.to_csv('../data/ad_info', index=False, num_files=1)
def test_read_csv(self):
    """koalas.read_csv agrees with pd.read_csv over header/names/usecols
    combinations, and raises pandas-compatible ValueErrors on bad args."""
    with self.csv_file(self.csv_text) as fn:

        def check(header='infer', names=None, usecols=None):
            # Compare koalas output to pandas for identical keyword args.
            expected = pd.read_csv(fn, header=header, names=names, usecols=usecols)
            actual = koalas.read_csv(fn, header=header, names=names, usecols=usecols)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

        check()
        check(header=None)
        check(header=0)
        check(names=['n', 'a'])
        check(header=0, names=['n', 'a'])
        check(usecols=[1])
        check(usecols=[1, 0])
        check(usecols=['amount'])
        check(usecols=['amount', 'name'])
        # Callable usecols only exists in pandas >= 0.20.
        if LooseVersion("0.20.0") <= LooseVersion(pd.__version__):
            check(usecols=lambda x: x == 'amount')
        check(usecols=[])
        # Duplicate usecols entries must be tolerated.
        check(usecols=[1, 1])
        check(usecols=['amount', 'amount'])
        if LooseVersion("0.20.0") <= LooseVersion(pd.__version__):
            check(usecols=lambda x: x == 'a')
        check(names=['n', 'a'], usecols=['a'])

        # check with pyspark patch.
        expected = pd.read_csv(fn)
        actual = koalas.read_csv(fn)
        self.assertPandasAlmostEqual(expected, actual.toPandas())

        # Invalid-argument paths: pandas-style error messages are required.
        self.assertRaisesRegex(
            ValueError, 'non-unique',
            lambda: koalas.read_csv(fn, names=['n', 'n']))
        self.assertRaisesRegex(
            ValueError, 'Names do not match.*3',
            lambda: koalas.read_csv(fn, names=['n', 'a', 'b']))
        self.assertRaisesRegex(
            ValueError, 'Names do not match.*3',
            lambda: koalas.read_csv(fn, header=0, names=['n', 'a', 'b']))
        self.assertRaisesRegex(ValueError, 'Usecols do not match.*3',
                               lambda: koalas.read_csv(fn, usecols=[1, 3]))
        self.assertRaisesRegex(
            ValueError, 'Usecols do not match.*col',
            lambda: koalas.read_csv(fn, usecols=['amount', 'col']))
def loadData(self, path=None):
    """Load the dataset CSV and shuffle it as part of preprocessing.

    Args:
        path: Optional directory containing ``self.dataset_file_name``;
            when None the bare file name is used (relative to the CWD).

    Returns:
        The shuffled DataFrame — koalas when ``self.use_koalas`` is truthy,
        otherwise pandas. Also stored on ``self.dataset``.
    """
    # Fix: original tested ``(path != None)``; identity comparison with
    # None is the idiomatic (and PEP 8-mandated) form.
    if path is not None:
        path = os.path.join(path, self.dataset_file_name)
    else:
        path = self.dataset_file_name
    reader = ks.read_csv if self.use_koalas else pd.read_csv
    dataset = reader(path)
    # frac=1.0 keeps every row but returns them in random order (shuffle).
    self.dataset = dataset.sample(frac=1.0)
    return self.dataset
def test_read_with_spark_schema(self):
    """A Spark DDL string for names= yields the same columns as pandas."""
    with self.csv_file(self.csv_text_2) as fn:
        ddl_schema = "A string, B string, C long, D long, E long"
        actual = ks.read_csv(fn, names=ddl_schema)
        expected = pd.read_csv(fn, names=["A", "B", "C", "D", "E"])
        self.assertEqual(repr(expected), repr(actual))
def test_read_csv(self):
    """ks.read_csv agrees with pd.read_csv over header/names/usecols
    combinations, and raises pandas-compatible ValueErrors on bad args."""
    with self.csv_file(self.csv_text) as fn:

        def check(header="infer", names=None, usecols=None):
            # Compare koalas output to pandas for identical keyword args.
            expected = pd.read_csv(fn, header=header, names=names, usecols=usecols)
            actual = ks.read_csv(fn, header=header, names=names, usecols=usecols)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

        check()
        check(header=None)
        check(header=0)
        check(names=["n", "a"])
        check(header=0, names=["n", "a"])
        check(usecols=[1])
        check(usecols=[1, 0])
        check(usecols=["amount"])
        check(usecols=["amount", "name"])
        check(usecols=[])
        # Duplicate usecols entries must be tolerated.
        check(usecols=[1, 1])
        check(usecols=["amount", "amount"])
        check(names=["n", "a"], usecols=["a"])

        # check with pyspark patch.
        expected = pd.read_csv(fn)
        actual = ks.read_csv(fn)
        self.assertPandasAlmostEqual(expected, actual.toPandas())

        # Invalid-argument paths: pandas-style error messages are required.
        self.assertRaisesRegex(ValueError, "non-unique",
                               lambda: ks.read_csv(fn, names=["n", "n"]))
        self.assertRaisesRegex(
            ValueError,
            "does not match the number.*3",
            lambda: ks.read_csv(fn, names=["n", "a", "b"]),
        )
        self.assertRaisesRegex(
            ValueError,
            "does not match the number.*3",
            lambda: ks.read_csv(fn, header=0, names=["n", "a", "b"]),
        )
        self.assertRaisesRegex(ValueError, "Usecols do not match.*3",
                               lambda: ks.read_csv(fn, usecols=[1, 3]))
        self.assertRaisesRegex(
            ValueError,
            "Usecols do not match.*col",
            lambda: ks.read_csv(fn, usecols=["amount", "col"]),
        )
        self.assertRaisesRegex(ValueError, "Unknown header argument 1",
                               lambda: ks.read_csv(fn, header="1"))
        expected_error_message = (
            "'usecols' must either be list-like of all strings, "
            "all unicode, all integers or a callable.")
        self.assertRaisesRegex(
            ValueError, expected_error_message,
            lambda: ks.read_csv(fn, usecols=[1, "amount"]))

        # check with index_col
        expected = pd.read_csv(fn).set_index("name")
        actual = ks.read_csv(fn, index_col="name")
        self.assertPandasAlmostEqual(expected, actual.toPandas())
def __str__(self):
    # Delegates to the wrapped value's string form.
    # NOTE(review): the enclosing class (presumably ``A``, used below) is
    # outside this chunk — confirm this method belongs to it.
    return str(self.value)

# Demo: arithmetic on the wrapper class defined above.
print((A(1) + 1) * 2)

# Read user.csv with plain PySpark, supplying an explicit DDL schema.
spark = SparkSession.builder.master("local").getOrCreate()
df = spark.read.format('csv')\
    .option('header', True)\
    .option('sep', ';')\
    .schema("id INT, name STRING, surname STRING, age INT ")\
    .load('user.csv')

# Same file via koalas with a pandas-style API.
import databricks.koalas as ks
df = ks.read_csv('user.csv', sep=";", header=0)
print(df[df['age'] > 30])
(df.to_spark())
# NOTE(review): ``toPandas`` is the Spark-style name; result is discarded.
df.toPandas()
#Pandas to koalas
ks.from_pandas(df)
#Spark to koalas
ks.DataFrame(df.to_spark())

# Equivalent schema built programmatically with Spark SQL types.
from pyspark.sql.types import *
schema = StructType(
    [StructField('id', IntegerType()),
     StructField('name', StringType()),
     StructField('surname', StringType()),
     # age is declared non-nullable.
     StructField('age', IntegerType(), False)])
def test_read_csv_with_parse_dates(self):
    """parse_dates is unsupported and must raise ValueError."""
    with self.assertRaisesRegex(ValueError, 'parse_dates'):
        koalas.read_csv('path', parse_dates=True)
def test_read_csv_with_mangle_dupe_cols(self):
    """mangle_dupe_cols=False is unsupported and must raise ValueError."""
    with self.assertRaisesRegex(ValueError, 'mangle_dupe_cols'):
        koalas.read_csv('path', mangle_dupe_cols=False)
def test_read_with_spark_schema(self):
    """A Spark DDL string for names= yields the same columns as pandas."""
    with self.csv_file(self.csv_text_2) as fn:
        ddl_schema = "A string, B string, C long, D long, E long"
        actual = koalas.read_csv(fn, names=ddl_schema)
        expected = pd.read_csv(fn, names=['A', 'B', 'C', 'D', 'E'])
        self.assertEqual(repr(expected), repr(actual))
# %% import numpy as np import pandas as pd # %% [markdown] # ### Read CSV File. # %% # location of data data_path = os.path.join("data", "nyc_restaurant_inspection_results_sample1.csv") # %% # import to kolas df df = ks.read_csv(data_path) # %% # import to pandas df pddf = pd.read_csv(data_path) # %% [markdown] # ### Memory usage # %% print("koalas memory usage is {m} bytes.".format(m=sys.getsizeof(df))) print("pandas memory usage is {m:.2f} kilobytes.".format( m=sys.getsizeof(pddf) / 10**3)) # %% [markdown] # ## Selecting Rows and Columns