def to_pandas(id: int, columns: list = None, samples: list = None): """ - After unzipping `gzip.open()`, bytesio still needed to be read into PyArrow before being read into Pandas. - All methods return all columns by default if they receive None: `pc.read_csv(read_options.column_names)`, `pa.read_table()`, `pd.read_csv(uscols)`, `pd.read_parquet(columns)` """ d = Dataset.get_by_id(id) is_compressed = d.is_compressed ff = d.file_format # When user provides only 1 column and forgets to [] it (e.g. the label column). if type(columns) == str: columns = [columns] data = d.data bytesio_data = io.BytesIO(data) if (ff == 'csv') or (ff == 'tsv'): # `pc.ReadOptions.column_names` verifies the existence of the names, does not filter for them. if is_compressed: bytesio_csv = gzip.open(bytesio_data) if ff == 'tsv': parse_opt = pc.ParseOptions(delimiter='\t') tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt) else: tbl = pc.read_csv(bytesio_csv) df = tbl.to_pandas() if columns is not None: df = df.filter(columns) else: if ff == 'tsv': df = pd.read_csv(bytesio_data, sep='\t', usecols=columns) else: df = pd.read_csv(bytesio_data, usecols=columns) elif ff == 'parquet': if is_compressed: bytesio_parquet = gzip.open(bytesio_data) tbl = pq.read_table(bytesio_parquet, columns=columns) df = tbl.to_pandas() else: df = pd.read_parquet(bytesio_data, columns=columns) if samples is not None: df = df.iloc[samples] d_dtype = d.dtype if d_dtype is not None: if (type(d_dtype) == dict): if columns is None: columns = d.columns # need to prune out the excluded columns from the dtype dict d_dtype_cols = list(d_dtype.keys()) for col in d_dtype_cols: if col not in columns: del d_dtype[col] df = df.astype(d_dtype) return df
def load_model_movies_mtx(): genome_dim = read_csv('genome-tags.csv').num_rows genome_table = read_csv('genome-scores.csv') movie_ids = genome_table['movieId'].to_numpy() scores = genome_table['relevance'].to_numpy() model_movies_mtx = {} for i in range(0, len(scores), genome_dim): model_movies_mtx[movie_ids[i]] = scores[i:i + genome_dim] return model_movies_mtx, genome_dim
def test_import_table_long_csv(self): pa_table = csv.read_csv(self.csv_file) new_table = self.session.import_table(pa_table) pa_table2 = new_table.snapshot() self.assertEqual(pa_table2, pa_table) df = pa_table2.to_pandas() self.assertEquals(1000, len(df.index))
def parse_green_taxi_csv(fobj): """ Parse a binary file object of cleaned "green taxi" CSV data as returned by the "read_green_taxi_csv" function, and return a PyArrow table. """ convert_options = ConvertOptions( column_types=SCHEMA, false_values=['N'], null_values=[''], timestamp_parsers=['%Y-%m-%d %H:%M:%S'], true_values=['Y'], ) parse_options = ParseOptions(quote_char=False) read_options = ReadOptions( column_names=SCHEMA.names, encoding=ENCODING, ) return read_csv( fobj, convert_options=convert_options, parse_options=parse_options, read_options=read_options, )
def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs, header): # pragma: no cover """Use a Ray task to read a chunk of a CSV into a pyarrow Table. Note: Ray functions are not detected by codecov (thus pragma: no cover) Args: fname: The filename of the file to open. num_splits: The number of splits (partitions) to separate the DataFrame into. start: The start byte offset. end: The end byte offset. kwargs: The kwargs for the pyarrow `read_csv` function. header: The header of the file. Returns: A list containing the split pyarrow Tables and the the number of rows of the tables as the last element. This is used to determine the total length of the DataFrame to build a default Index. """ bio = open(fname, "rb") # The header line for the CSV file first_line = bio.readline() bio.seek(start) to_read = header + first_line + bio.read(end - start) bio.close() table = csv.read_csv(BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)) chunksize = get_default_chunksize(table.num_columns, num_splits) chunks = [ pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)]) for i in range(num_splits) ] return chunks + [ table.num_rows, pandas.Series([t.to_pandas_dtype() for t in table.schema.types], index=table.schema.names), ]
def pa_read_csv( input_file: Union[IO, str], schema: Union[pa.Schema, None] = None, expect_full_schema: bool = True, **kwargs, ): """Read a csv file into an Arrow table. Args: input_file (Union[IO, str]): the CSV you want to read. string, path or file-like object. schema (pyarrow.Schema): pyarrow Schema with the expected columns wanted. If unset pyarrow will infer datatypes. expect_full_schema (bool, optional): if True, pyarrow reader will expect the input schema to have fields for every col in the input file. If False, then will only cast columns that are listed in the schema, leaving all other columns to their default type on read. **kwargs (optional): Additional kwargs are passed to pyarrow.csv.read_csv Returns: pyarrow.Table: the csv file in pyarrow format. """ if schema: schema = _get_arrow_schema(schema) pa_csv_table = csv.read_csv(input_file=input_file, **kwargs) if schema: pa_csv_table = cast_arrow_table_to_schema( pa_csv_table, schema=schema, expect_full_schema=expect_full_schema) return pa_csv_table
def import_taxi_records(dh_session: Session) -> Table: # download the CSV data and read it into a pyarrow table and prepare it for uploading into DH csv_file_name = download_csv( url= "https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2020-12.csv" ) pa_table = csv.read_csv(csv_file_name) # drop unwanted columns unwanted_columns = [ "tpep_pickup_datetime", "tpep_dropoff_datetime", "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID" ] pa_table = pa_table.drop(unwanted_columns) # drop any column with a unsupported data type for column, column_name in zip(pa_table.columns, pa_table.column_names): if not is_deephaven_compatible(column.type): print( f"drop column: {column_name} because of unsupported data type {column.type}" ) pa_table = pa_table.drop([column_name]) # upload the pyarrow table to the Deephaven server return dh_session.import_table(pa_table)
def feature_split(): train_df = csv.read_csv('./new_train_normal.csv').to_pandas() print('train dataset feature split') cid_features = [ 'id', 'click', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21' ] banner_features = ['id', 'click', 'banner_pos'] app_features = ['id', 'click', 'app_id', 'app_domain', 'app_category'] site_features = ['id', 'click', 'site_id', 'site_domain', 'site_category'] device_features = [ 'id', 'click', 'device_id', 'device_ip', 'device_type', 'device_conn_type' ] day_features = ['id', 'click', 'day', 'date'] preprocess.feature_split(train_df, cid_features) preprocess.feature_split(train_df, banner_features) preprocess.feature_split(train_df, app_features) preprocess.feature_split(train_df, site_features) preprocess.feature_split(train_df, device_features) preprocess.feature_split(train_df, day_features) print("train feature split compeleted.")
def test_count_by(self): pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) num_distinct_a = test_table.select_distinct(column_names=["a"]).size result_table = test_table.count_by(count_column="b", column_names=["a"]) self.assertEqual(result_table.size, num_distinct_a)
def test_time_parsing(): target = { 'timestamp': [ datetime.datetime(2010, 4, 1, 0, 0), datetime.datetime(2010, 4, 1, 0, 30), datetime.datetime(2010, 4, 1, 1, 0) ], 'symbol': ['USD/JPY', 'USD/JPY', 'USD/JPY'], 'high': [93.526, 93.475, 93.421], 'low': [93.361, 93.352, 93.326], 'open': [93.518, 93.385, 93.391], 'close': [93.382, 93.391, 93.384], 'spread': [0.005, 0.006, 0.006], 'volume': [3049, 2251, 1577] } fp = io.BytesIO( b'timestamp,symbol,high,low,open,close,spread,volume\n' b'2010-04-01 00:00:00,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049\n' b'2010-04-01 00:30:00,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251\n' b'2010-04-01 01:00:00,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577\n' ) fp.seek(0) table = csv.read_csv(fp) assert table engine.importArrowTable("time_parsing", table) assert bool(engine.closed) == False cursor = engine.executeDML("select * from time_parsing") assert cursor batch = cursor.getArrowRecordBatch() assert batch assert batch.to_pydict() == target
def test_usecols_csv(): target = { 'a': [1, 2, 3, 4, 5, 6], 'b': [2, 3, 4, 5, 6, 7], 'c': [3, 4, 5, 6, 7, 8], 'd': [4, 5, 6, 7, 8, 9], 'e': ['5', '6', '7', '8', '9', '0'] } fp = io.BytesIO( b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0' ) fp.seek(0) table = csv.read_csv(fp, convert_options=csv.ConvertOptions( column_types={ 'a': pa.int32(), 'b': pa.int64(), 'c': pa.int64(), 'd': pa.int64(), 'e': pa.string(), })) assert table engine.importArrowTable("usecols", table) assert bool(engine.closed) == False cursor = engine.executeDML("select * from usecols") assert cursor batch = cursor.getArrowRecordBatch() assert batch assert batch.to_pydict() == target
def test_pyarrow_statistics(): table = pv.read_csv("./data/pets/pets1.csv") pq.write_table(table, "./tmp/pyarrow_out/pets1.parquet") parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet") stats = parquet_file.metadata.row_group(0).column(1).statistics assert stats.min == 1 assert stats.max == 9
def _read_table_arrow(self, source: tp.BinaryIO, schema: pa.Schema) -> pa.Table: try: read_options = pa_csv.ReadOptions() read_options.encoding = 'utf-8' read_options.use_threads = False parse_options = pa_csv.ParseOptions() parse_options.newlines_in_values = True convert_options = pa_csv.ConvertOptions() convert_options.include_columns = schema.names convert_options.column_types = { n: t for (n, t) in zip(schema.names, schema.types) } convert_options.strings_can_be_null = True convert_options.quoted_strings_can_be_null = False return pa_csv.read_csv(source, read_options, parse_options, convert_options) except pa.ArrowInvalid as e: err = f"CSV file decoding failed, content is garbled" self._log.exception(err) raise _ex.EDataCorruption(err) from e except pa.ArrowKeyError as e: err = f"CSV file decoding failed, one or more columns is missing" self._log.error(err) self._log.exception(str(e)) raise _ex.EDataCorruption(err) from e
def test_head_tail(self): ops = [Table.head, Table.tail] pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) for op in ops: result_table = op(test_table, num_rows=50) self.assertEqual(result_table.size, 50)
def test_select_distinct(self): pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) unique_table = test_table.select_distinct(cols=["a"]) self.assertLessEqual(unique_table.size, pa_table.num_rows) unique_table = test_table.select_distinct(cols=[]) self.assertLessEqual(unique_table.size, pa_table.num_rows)
def read_feats(file_name): attrs = csv.read_csv( file_name, read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True), parse_options=pyarrow.csv.ParseOptions(delimiter=' ')) num_cols = len(attrs.columns) return np.stack([attrs.columns[i].to_numpy() for i in range(num_cols)], 1)
def test_group(self): pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) grouped_table = test_table.group_by(by=["a", "c"]) self.assertLessEqual(grouped_table.size, test_table.size) grouped_table = test_table.group_by() self.assertLessEqual(grouped_table.size, 1)
def make(self, key): key = (CompassFile * CompassTable & key).fetch1() print('populating for {file}'.format(**key)) tab = csv.read_csv(key['file']) u_names = list(tab[key['partition_col']].unique()) rows = [dict(procedure=n, type=key['type']) for n in u_names] self.insert(rows, skip_duplicates=True)
def test_head_tail_by(self): ops = [Table.head_by, Table.tail_by] pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) for op in ops: result_table = op(test_table, num_rows=1, by=["a"]) self.assertLessEqual(result_table.size, test_table.size)
def parse(self, **kwargs): import pyarrow as pa import pyarrow.csv as csv fname = kwargs.pop("fname", None) num_splits = kwargs.pop("num_splits", None) start = kwargs.pop("start", None) end = kwargs.pop("end", None) header = kwargs.pop("header", None) bio = open(fname, "rb") # The header line for the CSV file first_line = bio.readline() bio.seek(start) to_read = header + first_line + bio.read(end - start) bio.close() table = csv.read_csv(BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)) chunksize = get_default_chunksize(table.num_columns, num_splits) chunks = [ pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)]) for i in range(num_splits) ] return chunks + [ table.num_rows, pandas.Series( [t.to_pandas_dtype() for t in table.schema.types], index=table.schema.names, ), ]
def format_to_parquet(src_file): if not src_file.endswith('.csv'): logging.error( "Can only accept source files in CSV format, for the moment") return table = pv.read_csv(src_file) pq.write_table(table, src_file.replace('.csv', '.parquet'))
def test_csv_fillna(): target = { 'CRIM': [0.00632], 'ZN': [18.0], 'INDUS': [2.31], 'CHAS': [0.0], 'NOX': [0.538], 'RM': [6.575], 'AGE': [65.2], 'DIS': [4.09], 'RAD': [1.0], 'TAX': [296.0], 'PTRATIO': [15.3], 'B': [396.9], 'LSTAT': [4.98], 'PRICE': [24.0] } fp = io.BytesIO( b',CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE\n' b'0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0\n' ) fp.seek(0) table = csv.read_csv(fp) assert table engine.importArrowTable("csv_fillna", table) assert bool(engine.closed) == False cursor = engine.executeDML( "select CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE from csv_fillna" ) assert cursor batch = cursor.getArrowRecordBatch() assert batch assert batch.to_pydict() == target
def multi_calc(path_list): for j in range(len(path_list)): train_df = csv.read_csv(path_list[j]).to_pandas() # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False) train_df = train_df.sample(frac=0.2, replace=True).reset_index() not_click = train_df['click'] == 0 not_clk_df = train_df[not_click] not_clk_df = not_clk_df.sample(frac=0.2, replace=True) is_click = train_df['click'] == 1 is_clk_df = train_df[is_click] print(len(not_clk_df)) print(len(is_clk_df)) train_df = pd.concat([not_clk_df, is_clk_df]) train_df = train_df.reset_index() train_df = train_df.drop(['level_0'], axis=1) print(f'{path_list[j]} : traing dataset preprocessing....') n = 10000 list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)] targets = ['site_id','site_domain','site_category', 'app_id','app_domain','app_category', 'device_id','device_ip','device_model'] # 첫번째 청크 처리 # pre_df = drop_cols(list_df[0]) pre_df = list_df[0].copy() for k in range(len(targets)): pre_df = hashing_info(pre_df, targets[k]) # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k]) pre_df = split_hour(pre_df) list_df[0] = pre_df # 남은 청크들 처리 + concat for idx in tqdm(range(1, len(list_df))): # pre_df = drop_cols(list_df[idx]) pre_df = list_df[idx].copy() for k in range(len(targets)): pre_df = hashing_info(pre_df, targets[k]) pre_df = split_hour(pre_df) list_df[0] = pd.concat([list_df[0], pre_df]) train_df = list_df[0].reset_index() train_df = train_df.drop(['level_0'], axis=1) train_df = train_df.drop(['index'], axis=1) # name = './train_preprocess/new_prepros_' + str(k) + '.csv' name = path_list[j].replace('./split/split_train_', './train_preprocess/new_prepros_') print(name) train_df.to_csv(name, index=False)
def read_stores(self): s_cols = [ 'retailer_code', 'parent_code', 'fips_state_code', 'fips_county_code', 'dma_code', 'store_zip3' ] # To reduce space -- update with dictionary arrays later store_convert = { 'year': pa.uint16(), 'dma_code': pa.uint16(), 'retailer_code': pa.uint16(), 'parent_code': pa.uint16(), 'store_zip3': pa.uint16(), 'fips_county_code': pa.uint16(), 'fips_state_code': pa.uint8() } # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later. tmp = pa.concat_tables([ csv.read_csv( x, parse_options=csv.ParseOptions(delimiter='\t'), convert_options=csv.ConvertOptions(column_types=store_convert)) for x in self.stores_dict.values() ]).to_pandas().rename(columns={'year': 'panel_year'}) # some columns have blanks --fill with zero to avoid converting to floats(!) tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0) # use the compressed types my_dict = { key: value for (key, value) in type_dict.items() if key in tmp.columns } self.stores_df = tmp.astype(my_dict) return
def run_preprocess(num): pd.set_option('display.max_columns', None) path = './split/split_train_' + str(num) + '.csv' train_df = csv.read_csv(path).to_pandas() # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False) start = time.time() train_df = train_df.sample(frac=0.2, replace=True).reset_index() print("time :", time.time() - start) not_click = train_df['click'] == 0 not_clk_df = train_df[not_click] not_clk_df = not_clk_df.sample(frac=0.2, replace=True) is_click = train_df['click'] == 1 is_clk_df = train_df[is_click] print(len(not_clk_df)) print(len(is_clk_df)) train_df = pd.concat([not_clk_df, is_clk_df]) train_df = train_df.reset_index() train_df = train_df.drop(['level_0'], axis=1) print(f'{path} : traing dataset preprocessing....') n = 10000 list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)] targets = ['site_id','site_domain','site_category', 'app_id','app_domain','app_category', 'device_id','device_ip','device_model'] # 첫번째 청크 처리 # pre_df = drop_cols(list_df[0]) pre_df = list_df[0].copy() for k in range(len(targets)): pre_df = hashing_info(pre_df, targets[k]) # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k]) pre_df = split_hour(pre_df) list_df[0] = pre_df # 남은 청크들 처리 + concat for idx in tqdm(range(1, len(list_df))): # pre_df = drop_cols(list_df[idx]) pre_df = list_df[idx].copy() for k in range(len(targets)): pre_df = hashing_info(pre_df, targets[k]) pre_df = split_hour(pre_df[idx]) list_df[0] = pd.concat([list_df[0], pre_df]) train_df = list_df[0].reset_index() train_df = train_df.drop(['level_0'], axis=1) train_df = train_df.drop(['index'], axis=1) name = './train_preprocess/new_prepros_' + str(num) + '.csv'
def test_sort(self): pa_table = csv.read_csv(self.csv_file) test_table = self.session.import_table(pa_table) sorted_table = test_table.sort(order_by=["a", "b"], order=[SortDirection.DESCENDING]) df = sorted_table.snapshot().to_pandas() self.assertTrue(df.iloc[:, 0].is_monotonic_decreasing)
def test_drop_columns(self): pa_table = csv.read_csv(self.csv_file) table1 = self.session.import_table(pa_table) column_names = [] for f in table1.schema: column_names.append(f.name) table2 = table1.drop_columns(cols=column_names[:-1]) self.assertEquals(1, len(table2.schema))
def test_create_data_table_then_update(self): pa_table = csv.read_csv(self.csv_file) new_table = self.session.import_table(pa_table).update( formulas=['Sum = a + b + c + d']) pa_table2 = new_table.snapshot() df = pa_table2.to_pandas() self.assertEquals(df.shape[1], 6) self.assertEquals(1000, len(df.index))
def test_bind_table(self): pa_table = csv.read_csv(self.csv_file) user_table = self.session.import_table(pa_table) variable_name = "t" self.session.bind_table(variable_name, user_table) server_script = '''t2 = t.update("col1 = i*i")''' self.session.run_script(server_script) self.assertIn('t2', self.session.tables)
def test_close(self): pa_table = csv.read_csv(self.csv_file) table = self.session.import_table(pa_table) table.close() self.assertTrue(table.is_closed) table = self.session.empty_table(10) table.close() self.assertTrue(table.is_closed)
def test_random_csv(self): csv, expected = make_random_csv(num_cols=2, num_rows=100) csv_path = os.path.join(self.tmpdir, self.csv_filename) self.write_file(csv_path, csv) try: table = read_csv(csv_path) except pa.ArrowNotImplementedError as e: pytest.skip(str(e)) return table._validate() assert table.schema == expected.schema assert table.equals(expected) assert table.to_pydict() == expected.to_pydict()
def read_csv(self, *args, **kwargs): read_options = kwargs.setdefault('read_options', ReadOptions()) read_options.use_threads = True table = read_csv(*args, **kwargs) table._validate() return table