'--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) # Load data project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir) data_dir = os.path.join(project_dir, 'data', 'interim', 'data.parq') pf = ParquetFile(data_dir) data = pf.to_pandas() kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_loader = torch.utils.data.DataLoader(data.values, batch_size=args.batch_size, shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader(data.values, batch_size=args.batch_size, shuffle=True, **kwargs) # Params noise_variance_var = 0.1
def test_datetime_partition_no_duplicates(tempdir, partitions): df = pd.DataFrame({'partitions': partitions, 'x': [1, 2]}) write(tempdir, df, file_scheme='hive', partition_on=['partitions']) with pytest.raises(ValueError, match=r'Partition names map to the same value.*'): ParquetFile(tempdir)
def test_unicode_cols(tempdir): fn = os.path.join(tempdir, 'test.parq') df = pd.DataFrame({u"région": [1, 2, 3]}) write(fn, df) pf = ParquetFile(fn) pf.to_pandas()
def time_column(): with tmpdir() as tempdir: result = {} fn = join_path(tempdir, 'temp.parq') n = 10000000 r = np.random.randint(-1e10, 1e10, n, dtype='int64') d = pd.DataFrame({ 'w': pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)), 'x': r.view('timedelta64[ns]'), 'y': r / np.random.randint(1, 1000, size=n), 'z': np.random.randint(0, 127, size=n, dtype=np.uint8) }) for col in d.columns: df = d[[col]] write(fn, df) with measure('%s: write, no nulls' % d.dtypes[col], result): write(fn, df, has_nulls=False) pf = ParquetFile(fn) pf.to_pandas(categories={'w': 3}) # warm-up with measure('%s: read, no nulls' % d.dtypes[col], result): pf.to_pandas(categories={'w': 3}) with measure('%s: write, no nulls, has_null=True' % d.dtypes[col], result): write(fn, df, has_nulls=True) pf = ParquetFile(fn) pf.to_pandas(categories={'w': 3}) # warm-up with measure('%s: read, no nulls, has_null=True' % d.dtypes[col], result): pf.to_pandas(categories={'w': 3}) if d.dtypes[col].kind == 'm': d.loc[n // 2, col] = pd.to_datetime('NaT') elif d.dtypes[col].kind == 'f': d.loc[n // 2, col] = np.nan elif d.dtypes[col].kind in ['i', 'u']: continue else: d.loc[n // 2, col] = None with measure('%s: write, with null, has_null=True' % d.dtypes[col], result): write(fn, df, has_nulls=True) pf = ParquetFile(fn) pf.to_pandas(categories={'w': 3}) # warm-up with measure('%s: read, with null, has_null=True' % d.dtypes[col], result): pf.to_pandas(categories={'w': 3}) with measure( '%s: write, with null, has_null=False' % d.dtypes[col], result): write(fn, df, has_nulls=False) pf = ParquetFile(fn) pf.to_pandas(categories={'w': 3}) # warm-up with measure('%s: read, with null, has_null=False' % d.dtypes[col], result): pf.to_pandas(categories={'w': 3}) return result
def test_auto_null(tempdir): tmp = str(tempdir) df = pd.DataFrame({ 'a': [1, 2, 3, 0], 'aa': [1, 2, 3, None], 'b': [1., 2., 3., np.nan], 'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'), 'd': ['a', 'b', 'c', None], 'f': [True, False, True, True], 'ff': [True, False, None, True] }) df['e'] = df['d'].astype('category') df['bb'] = df['b'].astype('object') df['aaa'] = df['a'].astype('object') object_cols = ['d', 'ff', 'bb', 'aaa'] test_cols = list(set(df) - set(object_cols)) + ['d'] fn = os.path.join(tmp, "test.parq") with pytest.raises((TypeError, AttributeError)): ## TODO: this should be a nicer error? write(fn, df, has_nulls=False) write(fn, df, has_nulls=True) pf = ParquetFile(fn) for col in pf._schema[1:]: assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL df2 = pf.to_pandas(categories=['e']) tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False) tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']]) tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']]) tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']]) # not giving any value is the same as has_nulls=True write(fn, df) pf = ParquetFile(fn) for col in pf._schema[1:]: assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL df2 = pf.to_pandas(categories=['e']) tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False) tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']]) tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']]) tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']]) # 'infer' is the new recommended auto-null write(fn, df, has_nulls='infer') pf = ParquetFile(fn) for col in pf._schema[1:]: if col.name in object_cols: assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL else: assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED df2 = pf.to_pandas() tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False) tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']]) tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']]) tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']]) # but legacy None still works write(fn, df, has_nulls=None) pf = ParquetFile(fn) for col in pf._schema[1:]: if col.name in object_cols: assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL else: assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED df2 = pf.to_pandas() tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False) tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']]) tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']]) tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
# encoding: utf-8 """ @Author: wanghuagang @Contact: [email protected] @Project: StudyPython @File: d1 @Date: 2019/6/28 2:16 PM @Description: """ import pandas as pd from fastparquet import ParquetFile parquet_file = 'shunqiwang.parquet' pf = ParquetFile(parquet_file) df = pf.to_pandas() # type: pd.DataFrame print(df.head(3))
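# A minimal follow-up sketch (not part of the original script): ParquetFile can
# also read a subset of columns or stream row groups, which keeps memory bounded
# for large files. The column selection below is illustrative.
from fastparquet import ParquetFile

pf = ParquetFile('shunqiwang.parquet')
wanted = pf.columns[:2]                      # pick only the columns you need
subset = pf.to_pandas(columns=wanted)        # column-pruned read
for chunk in pf.iter_row_groups(columns=wanted):
    print(len(chunk))                        # process one row group at a time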
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs): """ Determine how to access metadata and break read into ``parts`` This logic is mostly to handle `gather_statistics=False` cases, because this also means we should avoid scanning every file in the dataset. If _metadata is available, set `gather_statistics=True` (if `gather_statistics=None`). """ parts = [] if len(paths) > 1: if gather_statistics is not False: # This scans all the files, allowing index/divisions # and filtering pf = ParquetFile(paths, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) else: base, fns = _analyze_paths(paths, fs) relpaths = [path.replace(base, "").lstrip("/") for path in paths] if "_metadata" in relpaths: # We have a _metadata file, lets use it pf = ParquetFile(base + fs.sep + "_metadata", open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) else: # Rely on metadata for 0th file. # Will need to pass a list of paths to read_partition scheme = get_file_scheme(fns) pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) pf.file_scheme = scheme pf.cats = _paths_to_cats(fns, scheme) parts = paths.copy() else: if fs.isdir(paths[0]): # This is a directory, check for _metadata, then _common_metadata paths = fs.glob(paths[0] + fs.sep + "*") base, fns = _analyze_paths(paths, fs) relpaths = [path.replace(base, "").lstrip("/") for path in paths] if "_metadata" in relpaths: # Using _metadata file (best-case scenario) pf = ParquetFile(base + fs.sep + "_metadata", open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) if gather_statistics is None: gather_statistics = True elif gather_statistics is not False: # Scan every file pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {})) else: # Use _common_metadata file if it is available. # Otherwise, just use 0th file if "_common_metadata" in relpaths: pf = ParquetFile(base + fs.sep + "_common_metadata", open_with=fs.open, **kwargs.get("file", {})) else: pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) scheme = get_file_scheme(fns) pf.file_scheme = scheme pf.cats = _paths_to_cats(fns, scheme) parts = paths.copy() else: # There is only one file to read pf = ParquetFile(paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) return parts, pf, gather_statistics
def read_partition(cls, fs, piece, columns, index, categories=(), **kwargs): null_index_name = False if isinstance(index, list): if index == [None]: # Handling a None-labeled index... # The pandas metadata told us to read in an index # labeled `None`. If this corresponds to a `RangeIndex`, # fastparquet will need use the pandas metadata to # construct the index. Otherwise, the index will correspond # to a column named "__index_level_0__". We will need to # check the `ParquetFile` object for this column below. index = [] null_index_name = True columns += index # Use global `parquet_file` object. Need to reattach # the desired row_group parquet_file = kwargs.pop("parquet_file", None) if isinstance(piece, tuple): if isinstance(piece[0], str): # We have a path to read from assert parquet_file is None parquet_file = ParquetFile(piece[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) rg_indices = piece[1] or list( range(len(parquet_file.row_groups))) # `piece[1]` will contain row-group indices row_groups = [parquet_file.row_groups[rg] for rg in rg_indices] elif parquet_file: # `piece[1]` will contain actual row-group objects, # but they may be pickled row_groups = piece[0] if isinstance(row_groups, bytes): row_groups = pickle.loads(row_groups) parquet_file.fmd.row_groups = row_groups # NOTE: May lose cats after `_set_attrs` call save_cats = parquet_file.cats parquet_file._set_attrs() parquet_file.cats = save_cats else: raise ValueError("Neither path nor ParquetFile detected!") if null_index_name: if "__index_level_0__" in parquet_file.columns: # See "Handling a None-labeled index" comment above index = ["__index_level_0__"] columns += index parquet_file._dtypes = (lambda *args: parquet_file.dtypes ) # ugly patch, could be fixed # Read necessary row-groups and concatenate dfs = [] for row_group in row_groups: dfs.append( parquet_file.read_row_group_file( row_group, columns, categories, index=index, **kwargs.get("read", {}), )) return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0] else: # `piece` is NOT a tuple raise ValueError(f"Expected tuple, got {type(piece)}")
def test_write_index_false(tempdir): fn = os.path.join(tempdir, 'test.parquet') df = pd.DataFrame(0, columns=['a'], index=range(1, 3)) write(fn, df, write_index=False) rec_df = ParquetFile(fn).to_pandas() assert rec_df.index[0] == 0
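# A small companion sketch (an assumption, not part of the test suite): with the
# default write_index behaviour a non-default index round-trips, while
# write_index=False drops it, so reading back yields a fresh RangeIndex.
import os
import pandas as pd
from fastparquet import write, ParquetFile

def check_index_roundtrip(tempdir):
    fn = os.path.join(tempdir, 'demo.parquet')
    df = pd.DataFrame({'a': [0, 0]}, index=[1, 2])
    write(fn, df)                                        # index preserved
    assert ParquetFile(fn).to_pandas().index[0] == 1
    write(fn, df, write_index=False)                     # index discarded
    assert ParquetFile(fn).to_pandas().index[0] == 0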
def load_instances(parq_file, motifs=None, dedup=True, verbose=True): """Load pattern instances from the parquet file Args: parq_file: parquet file of motif instances motifs: dictionary of motifs of interest. key=custom motif name, value=short pattern name (e.g. {'Nanog': 'm0_p3'}) """ if motifs is not None: incl_motifs = {longer_pattern(m) for m in motifs.values()} else: incl_motifs = None if isinstance(parq_file, pd.DataFrame): dfi = parq_file else: if motifs is not None: from fastparquet import ParquetFile # Selectively load only the relevant patterns pf = ParquetFile(str(parq_file)) patterns = [shorten_pattern(pn) for pn in incl_motifs] dfi = pf.to_pandas(filters=[("pattern_short", "in", patterns)]) else: dfi = pd.read_parquet(str(parq_file), engine='fastparquet') if 'pattern' not in dfi: # assumes a hive-stored file dfi['pattern'] = dfi['dir0'].str.replace( "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str) # filter if motifs is not None: dfi = dfi[dfi.pattern.isin( incl_motifs)] # NOTE this should already be removed if 'pattern_short' not in dfi: dfi['pattern_short'] = dfi['pattern'].map( {k: shorten_pattern(k) for k in incl_motifs}) dfi['pattern_name'] = dfi['pattern_short'].map( {v: k for k, v in motifs.items()}) else: dfi['pattern_short'] = dfi['pattern'].map( {k: shorten_pattern(k) for k in dfi.pattern.unique()}) # add some columns if they don't yet exist if 'pattern_start_abs' not in dfi: dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start'] if 'pattern_end_abs' not in dfi: dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end'] if dedup: # deduplicate dfi_dedup = dfi.drop_duplicates([ 'pattern', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand' ]) # number of removed duplicates d = len(dfi) - len(dfi_dedup) if verbose: print("number of de-duplicated instances:", d, f"({d / len(dfi) * 100}%)") # use de-duplicated instances from now on dfi = dfi_dedup return dfi
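# Hypothetical usage sketch, following the docstring example: restrict loading
# to a couple of named motifs and de-duplicate the instances. The file name and
# the 'Oct4' mapping are made up for illustration.
motifs = {'Nanog': 'm0_p3', 'Oct4': 'm1_p0'}
dfi = load_instances('motif-instances.parq', motifs=motifs, dedup=True)
print(dfi[['pattern_name', 'example_chrom',
           'pattern_start_abs', 'pattern_end_abs']].head())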
def getNOAAData(month, yr): """Function to get NOAA data for the month""" # Read station data from file that was stored in s3 try: try: s3 = s3fs.S3FileSystem() myopen = s3.open s3_resource = boto3.resource('s3') s3_resource.Object('midscapstone-whos-polluting-my-air', 'UtilFiles/uniq_station_data.parquet').load() pf = ParquetFile( 'midscapstone-whos-polluting-my-air/UtilFiles/uniq_station_data.parquet', open_with=myopen) unique_station_df = pf.to_pandas() except: raise CustomError("FILE ERROR: Unique Station Dataframe not found") # List of NOAA stations in the 35 < lat < 40 and -125 < lon < -120 bounding box station_list = [ 'KAPC', 'KBLU', 'KCCR', 'KHWD', 'KLVK', 'KMAE', 'KMCE', 'KMOD', 'KMRY', 'KMYV', 'KNUQ', 'KOAK', 'KOVE', 'KPRB', 'KSAC', 'KSBP', 'KSCK', 'KSFO', 'KSJC', 'KSMF', 'KSNS', 'KSTS', 'KUKI', 'KVCB', 'KWVI' ] # Get NOAA data for desired stations in a list lines = [] # an array of each read line bucket = "midscapstone-whos-polluting-my-air" s3 = boto3.client('s3') for station in station_list: try: file_name = "AsosRaw/64010{0}20{2}{1}".format( station, month, yr) obj = s3.get_object(Bucket=bucket, Key=file_name) df = pd.read_csv(obj['Body'], header=None) df.columns = ['dataval'] for indx, line in df.iterrows(): lines.append(line['dataval']) except Exception as e: print("*** EXCEPTION IN GET NOAA DATA ITERROWS {}: {}".format( line, e)) # Create NOAA dataframe for the month noaa_df = createNOAAdf(lines, '20' + yr + month) # Drop rows where wind speed is not numeric noaa_df = noaa_df[noaa_df.wind_speed != 'T'] merged_noaa_df = pd.merge(noaa_df, unique_station_df, on='wban_number') # Convert data type of numeric columns merged_noaa_df[['wind_speed', 'gust_speed', 'lat', 'lon']] = merged_noaa_df[[ 'wind_speed', 'gust_speed', 'lat', 'lon' ]].apply(pd.to_numeric) # Get data for bounding box bay_noaa_df = merged_noaa_df[(merged_noaa_df.lat > 35) & (merged_noaa_df.lat < 40) & (merged_noaa_df.lon > -125) & (merged_noaa_df.lon < -120)] bay_noaa_df.reset_index(inplace=True, drop=True) bay_noaa_df['datetime'] = bay_noaa_df[[ 'year', 'month', 'day', 'hour', 'minute' ]].apply(lambda x: int(''.join(x)), axis=1) return bay_noaa_df except Exception as e: print("*** EXCEPTION IN GET NOAA DATA *** {}".format(e)) return None
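# Hypothetical call (the S3 layout is assumed to match the reader above):
# month and year are two-digit strings, e.g. August 2018.
bay_noaa_df = getNOAAData('08', '18')
if bay_noaa_df is not None:
    print(bay_noaa_df[['wban_number', 'wind_speed', 'lat', 'lon', 'datetime']].head())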
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs): """Determine how to access metadata and break read into ``parts`` This logic is mostly to handle `gather_statistics=False` cases, because this also means we should avoid scanning every file in the dataset. If _metadata is available, set `gather_statistics=True` (if `gather_statistics=None`). The `fast_metadata` output specifies that ParquetFile metadata parsing is fast enough for each worker to perform during `read_partition`. The value will be set to True if: (1) The path is a directory containing _metadata, (2) the path is a list of files containing _metadata, (3) there is only one file to read, or (4) `gather_statistics` is False. In other cases, the ParquetFile object will need to be stored in the task graph, because metadata parsing is too expensive. """ parts = [] fast_metadata = True if len(paths) > 1: base, fns = _analyze_paths(paths, fs) if gather_statistics is not False: # This scans all the files, allowing index/divisions # and filtering if "_metadata" not in fns: paths_use = paths fast_metadata = False else: paths_use = base + fs.sep + "_metadata" pf = ParquetFile(paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) else: if "_metadata" in fns: # We have a _metadata file, lets use it pf = ParquetFile(base + fs.sep + "_metadata", open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) else: # Rely on metadata for 0th file. # Will need to pass a list of paths to read_partition scheme = get_file_scheme(fns) pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) parts = paths.copy() elif fs.isdir(paths[0]): # This is a directory, check for _metadata, then _common_metadata paths = fs.glob(paths[0] + fs.sep + "*") base, fns = _analyze_paths(paths, fs) if "_metadata" in fns: # Using _metadata file (best-case scenario) pf = ParquetFile(base + fs.sep + "_metadata", open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) if gather_statistics is None: gather_statistics = True elif gather_statistics is not False: # Scan every file pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {})) fast_metadata = False else: # Use _common_metadata file if it is available. # Otherwise, just use 0th file if "_common_metadata" in fns: pf = ParquetFile(base + fs.sep + "_common_metadata", open_with=fs.open, **kwargs.get("file", {})) else: pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) scheme = get_file_scheme(fns) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) parts = paths.copy() else: # There is only one file to read base = None pf = ParquetFile(paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) return parts, pf, gather_statistics, fast_metadata, base
def read_partition(cls, fs, piece, columns, index, categories=(), pf=None, **kwargs): null_index_name = False if isinstance(index, list): if index == [None]: # Handling a None-labeled index... # The pandas metadata told us to read in an index # labeled `None`. If this corresponds to a `RangeIndex`, # fastparquet will need use the pandas metadata to # construct the index. Otherwise, the index will correspond # to a column named "__index_level_0__". We will need to # check the `ParquetFile` object for this column below. index = [] null_index_name = True columns += index if pf is None: base, fns = _analyze_paths([piece], fs) scheme = get_file_scheme(fns) pf = ParquetFile(piece, open_with=fs.open) relpath = piece.replace(base, "").lstrip("/") for rg in pf.row_groups: for ch in rg.columns: ch.file_path = relpath pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) pf.fn = base if null_index_name and "__index_level_0__" in pf.columns: # See "Handling a None-labeled index" comment above index = ["__index_level_0__"] columns += index return pf.to_pandas(columns, categories, index=index) else: if isinstance(pf, tuple): if isinstance(pf[0], list): pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1] else: pf = ParquetFile(pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})) pf._dtypes = lambda *args: pf.dtypes # ugly patch, could be fixed pf.fmd.row_groups = None rg_piece = pf.row_groups[piece] if null_index_name: if "__index_level_0__" in pf.columns: # See "Handling a None-labeled index" comment above index = ["__index_level_0__"] columns += index pf.fmd.key_value_metadata = None else: pf.fmd.key_value_metadata = None return pf.read_row_group_file(rg_piece, columns, categories, index=index, **kwargs.get("read", {}))
def fetch(filesystem: S3FileSystem, bucket: str, s3_uri: str) -> DataFrame: """Collect a file from S3 URI.""" paths = list_files(filesystem, bucket, s3_uri) parquet = ParquetFile(paths, open_with=filesystem.open) return parquet.to_pandas()
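# Illustrative call (bucket name and prefix are made up, and list_files is
# assumed to resolve the URI to concrete parquet paths):
import s3fs

fs = s3fs.S3FileSystem(anon=False)
df = fetch(fs, "example-bucket", "s3://example-bucket/tables/events/")
print(df.shape)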
def read_partition(cls, fs, pieces, columns, index, categories=(), **kwargs): null_index_name = False if isinstance(index, list): if index == [None]: # Handling a None-labeled index... # The pandas metadata told us to read in an index # labeled `None`. If this corresponds to a `RangeIndex`, # fastparquet will need use the pandas metadata to # construct the index. Otherwise, the index will correspond # to a column named "__index_level_0__". We will need to # check the `ParquetFile` object for this column below. index = [] null_index_name = True columns += index # Use global `parquet_file` object. Need to reattach # the desired row_group parquet_file = kwargs.pop("parquet_file", None) # Always convert pieces to list if not isinstance(pieces, list): pieces = [pieces] sample = pieces[0] if isinstance(sample, tuple): if isinstance(sample[0], str): # We have paths to read from assert parquet_file is None row_groups = [] rg_offset = 0 parquet_file = ParquetFile( [p[0] for p in pieces], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}), ) for piece in pieces: _pf = ( parquet_file if len(pieces) == 1 else ParquetFile( piece[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}), ) ) n_local_row_groups = len(_pf.row_groups) local_rg_indices = piece[1] or list(range(n_local_row_groups)) row_groups += [ parquet_file.row_groups[rg + rg_offset] for rg in local_rg_indices ] rg_offset += n_local_row_groups update_parquet_file = len(row_groups) < len(parquet_file.row_groups) elif parquet_file: row_groups = [] for piece in pieces: # `piece[1]` will contain actual row-group objects, # but they may be pickled rgs = piece[0] if isinstance(rgs, bytes): rgs = pickle.loads(rgs) row_groups += rgs update_parquet_file = True else: raise ValueError("Neither path nor ParquetFile detected!") if update_parquet_file: with _FP_FILE_LOCK: parquet_file.fmd.row_groups = row_groups # NOTE: May lose cats after `_set_attrs` call save_cats = parquet_file.cats parquet_file._set_attrs() parquet_file.cats = save_cats if null_index_name: if "__index_level_0__" in parquet_file.columns: # See "Handling a None-labeled index" comment above index = ["__index_level_0__"] columns += index parquet_file._dtypes = ( lambda *args: parquet_file.dtypes ) # ugly patch, could be fixed if set(columns).issubset( parquet_file.columns + list(parquet_file.cats.keys()) ): # Convert ParquetFile to pandas return parquet_file.to_pandas( columns=columns, categories=categories, index=index, ) else: # Read necessary row-groups and concatenate dfs = [] for row_group in row_groups: dfs.append( parquet_file.read_row_group_file( row_group, columns, categories, index=index, **kwargs.get("read", {}), ) ) return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0] else: # `piece` is NOT a tuple raise ValueError(f"Expected tuple, got {type(piece)}")
def test_mixed_partition_types(tempdir, partitions): df = pd.DataFrame({'partitions': partitions, 'x': [1, 2]}) write(tempdir, df, file_scheme='hive', partition_on=['partitions']) out = ParquetFile(tempdir).to_pandas() assert (out.sort_values("x").set_index("x").partitions == df.sort_values( "x").set_index("x").partitions).all()
def time_column(): with tmpdir() as tempdir: result = {} fn = join_path(tempdir, 'temp.parq') n = 10000000 r = np.random.randint(-1e10, 1e10, n, dtype='int64') d = pd.DataFrame({ 'w': pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)), 'x': r.view('timedelta64[ns]'), 'y': r / np.random.randint(1, 1000, size=n), 'z': np.random.randint(0, 127, size=n, dtype=np.uint8) }) d['b'] = r > 0 for col in d.columns: df = d[[col]] write(fn, df) with measure('%s: write, no nulls' % d.dtypes[col], result): write(fn, df, has_nulls=False) #, compression="SNAPPY") pf = ParquetFile(fn) with measure("file open", result): ParquetFile(fn) if col == 'x': assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype( 'timedelta64[us]')).all() else: assert (pf.to_pandas() == df).values.all() # warm-up with measure('%s: read, no nulls' % d.dtypes[col], result): pf.to_pandas() with measure('%s: write, no nulls, has_null=True' % d.dtypes[col], result): write(fn, df, has_nulls=True) #, compression="SNAPPY") pf = ParquetFile(fn) if col == 'x': assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype( 'timedelta64[us]')).all() else: assert (pf.to_pandas() == df).values.all() # warm-up with measure('%s: read, no nulls, has_null=True' % d.dtypes[col], result): pf.to_pandas() if d.dtypes[col].kind == 'm': d.loc[n // 2, col] = pd.to_datetime('NaT') elif d.dtypes[col].kind == 'f': d.loc[n // 2, col] = np.nan elif d.dtypes[col].kind in ['i', 'u']: continue else: d.loc[n // 2, col] = None with measure('%s: write, with null, has_null=True' % d.dtypes[col], result): write(fn, df, has_nulls=True) #, compression="SNAPPY") pf = ParquetFile(fn) if col == 'x': assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype( 'timedelta64[us]')).all() else: assert (pf.to_pandas() == df).values.all() # warm-up with measure('%s: read, with null, has_null=True' % d.dtypes[col], result): pf.to_pandas() with measure( '%s: write, with null, has_null=False' % d.dtypes[col], result): write(fn, df, has_nulls=False) #, compression="SNAPPY") pf = ParquetFile(fn) if col == 'x': assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype( 'timedelta64[us]')).all() else: assert (pf.to_pandas() == df).values.all() # warm-up with measure('%s: read, with null, has_null=False' % d.dtypes[col], result): pf.to_pandas() return result
def test_path_containing_metadata_df(): p = ParquetFile(os.path.join(TEST_DATA, "dir_metadata", "empty.parquet")) df = p.to_pandas() assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__'] assert len(df) == 0
if __name__ == "__main__": ROOT_PATH = "C:\\tmp\\" test_file = ROOT_PATH + "postcodes.parquet" write_file = "c:\\tmp\\write.test.parquet" REPEAT = 4 read_times = [] write_gzip_times = [] write_snappy_times = [] write_uncompressed_times = [] print("reading and writing {} {} times...".format(test_file, REPEAT)) for i in range(0, REPEAT): start_time = time.time() pf = ParquetFile(test_file) df = pf.to_pandas() read_times.append(time.time() - start_time) print("file read in {}".format(get_elapsed_time(start_time))) start_time = time.time() write(write_file, df, compression="UNCOMPRESSED") write_uncompressed_times.append(time.time() - start_time) print("written uncompressed") start_time = time.time() write(write_file, df, compression="GZIP") write_gzip_times.append(time.time() - start_time) print("written gzip") start_time = time.time()
def _collect_dataset_info( cls, paths, fs, categories, index, gather_statistics, filters, split_row_groups, chunksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, kwargs, ): # Define the parquet-file (pf) object to use for metadata, # Also, initialize `parts`. If `parts` is populated here, # then each part will correspond to a file. Otherwise, each part will # correspond to a row group (populated later). # Extract "supported" key-word arguments from `kwargs`. # Split items into `dataset_kwargs` and `read_kwargs` dataset_kwargs, read_kwargs, user_kwargs = _split_user_options( **kwargs) parts = [] _metadata_exists = False if len(paths) == 1 and fs.isdir(paths[0]): # This is a directory. # Check if _metadata and/or _common_metadata files exists base = paths[0] _metadata_exists = True if not ignore_metadata_file: _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"])) # Find all files if we are not using a _metadata file if ignore_metadata_file or not _metadata_exists: # For now, we need to discover every file under paths[0] paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs) _update_paths = False for fn in ["_metadata", "_common_metadata"]: try: fns.remove(fn) _update_paths = True except ValueError: pass if _update_paths: paths = [fs.sep.join([base, fn]) for fn in fns] _metadata_exists = False if _metadata_exists: # Using _metadata file (best-case scenario) pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, **dataset_kwargs, ) else: # Use 0th file # Note that "_common_metadata" can cause issues for # partitioned datasets. if parquet_file_extension: # Raise error if all files have been filtered by extension len0 = len(paths) paths = [ path for path in paths if path.endswith(parquet_file_extension) ] if len0 and paths == []: raise ValueError( "No files satisfy the `parquet_file_extension` criteria " f"(files must end with {parquet_file_extension}).") pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **dataset_kwargs) scheme = get_file_scheme(fns) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) if not gather_statistics: parts = [fs.sep.join([base, fn]) for fn in fns] else: # This is a list of files paths, base, fns = _sort_and_analyze_paths(paths, fs) # Check if _metadata is in paths, and # remove it if ignore_metadata_file=True _metadata_exists = "_metadata" in fns if _metadata_exists and ignore_metadata_file: fns.remove("_metadata") _metadata_exists = False paths = [fs.sep.join([base, fn]) for fn in fns] if _metadata_exists: # We have a _metadata file, lets use it pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, **dataset_kwargs, ) else: # Rely on metadata for 0th file. 
# Will need to pass a list of paths to read_partition scheme = get_file_scheme(fns) pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **dataset_kwargs) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) if not gather_statistics: parts = paths.copy() # Check the `aggregate_files` setting aggregation_depth = _get_aggregation_depth( aggregate_files, list(pf.cats), ) # Ensure that there is no overlap between partition columns # and explicit columns in `pf` if pf.cats: _partitions = [p for p in pf.cats if p not in pf.columns] if not _partitions: pf.cats = {} elif len(_partitions) != len(pf.cats): raise ValueError( "No partition-columns should be written in the \n" "file unless they are ALL written in the file.\n" "columns: {} | partitions: {}".format( pf.columns, pf.cats.keys())) return { "pf": pf, "paths": paths, "has_metadata_file": _metadata_exists, "parts": parts, "base": base, "fs": fs, "gather_statistics": gather_statistics, "categories": categories, "index": index, "filters": filters, "split_row_groups": split_row_groups, "chunksize": chunksize, "aggregate_files": aggregate_files, "aggregation_depth": aggregation_depth, "metadata_task_size": metadata_task_size, "kwargs": { "dataset": dataset_kwargs, "read": read_kwargs, **user_kwargs, }, }
import pickle import pandas as pd from lenskit.algorithms.implicit import BPR from lenskit.batch import MultiEval from lenskit.crossfold import partition_users, SampleN from lenskit import batch, topn, util from tf_idf import tf_idf file = open("pairs_user_new.pickle", "rb") pairs_user = pickle.load(file) truth = pd.concat((p.test for p in pairs_user)) from fastparquet import ParquetFile result = pd.DataFrame() pf = ParquetFile('results/steam/pruned_5_new/recommendations.parquet') for df in pf.iter_row_groups(): truncated = df.loc[df['rank'] < 1001] result = result.append(truncated, sort=False) #result.to_parquet('results/steam/pruned_5_new/recs.parquet') #result.to_csv("results/steam/pruned_5_new/recs.csv") def RR(rec, truth): #recs = pd.read_parquet(file_name) rla = topn.RecListAnalysis() rla.add_metric(topn.recip_rank) RR_result = rla.compute(rec, truth) return RR_result
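# Hypothetical usage (assumes `result` and `truth` built above): score the
# truncated recommendation lists with mean reciprocal rank per user.
rr_per_user = RR(result, truth)
print(rr_per_user['recip_rank'].mean())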
def _collect_dataset_info( cls, paths, fs, categories, index, gather_statistics, filters, split_row_groups, chunksize, aggregate_files, ignore_metadata_file, metadata_task_size, require_extension=(".parq", ".parquet"), **kwargs, ): # Define the parquet-file (pf) object to use for metadata, # Also, initialize `parts`. If `parts` is populated here, # then each part will correspond to a file. Otherwise, each part will # correspond to a row group (populated later). # # This logic is mostly to handle `gather_statistics=False` cases, # because this also means we should avoid scanning every file in the # dataset. If _metadata is available, set `gather_statistics=True` # (if `gather_statistics=None`). parts = [] _metadata_exists = False if len(paths) == 1 and fs.isdir(paths[0]): # This is a directory. # Check if _metadata and/or _common_metadata files exists base = paths[0] _metadata_exists = True if not ignore_metadata_file: _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"])) # Find all files if we are not using a _metadata file if ignore_metadata_file or not _metadata_exists: # For now, we need to discover every file under paths[0] paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs) _update_paths = False for fn in ["_metadata", "_common_metadata"]: try: fns.remove(fn) _update_paths = True except ValueError: pass if _update_paths: paths = [fs.sep.join([base, fn]) for fn in fns] _metadata_exists = False if require_extension: paths = [ path for path in paths if path.endswith(require_extension) ] if _metadata_exists: # Using _metadata file (best-case scenario) pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, **kwargs, ) if gather_statistics is None: gather_statistics = True else: # Use 0th file # Note that "_common_metadata" can cause issues for # partitioned datasets. pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs) scheme = get_file_scheme(fns) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) if not gather_statistics: parts = [fs.sep.join([base, fn]) for fn in fns] else: # This is a list of files paths, base, fns = _sort_and_analyze_paths(paths, fs) # Check if _metadata is in paths, and # remove it if ignore_metadata_file=True _metadata_exists = "_metadata" in fns if _metadata_exists and ignore_metadata_file: fns.remove("_metadata") _metadata_exists = False if require_extension: fns = [fn for fn in fns if fn.endswith(require_extension)] paths = [fs.sep.join([base, fn]) for fn in fns] if _metadata_exists: # We have a _metadata file, lets use it pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, **kwargs, ) else: # Rely on metadata for 0th file. 
# Will need to pass a list of paths to read_partition scheme = get_file_scheme(fns) pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) if not gather_statistics: parts = paths.copy() # Check the `aggregate_files` setting aggregation_depth = _get_aggregation_depth( aggregate_files, list(pf.cats), ) # Ensure that there is no overlap between partition columns # and explicit columns in `pf` if pf.cats: _partitions = [p for p in pf.cats if p not in pf.columns] if not _partitions: pf.cats = {} elif len(_partitions) != len(pf.cats): raise ValueError( "No partition-columns should be written in the \n" "file unless they are ALL written in the file.\n" "columns: {} | partitions: {}".format( pf.columns, pf.cats.keys())) return { "pf": pf, "paths": paths, "has_metadata_file": _metadata_exists, "parts": parts, "base": base, "fs": fs, "gather_statistics": gather_statistics, "categories": categories, "index": index, "filters": filters, "split_row_groups": split_row_groups, "chunksize": chunksize, "aggregate_files": aggregate_files, "aggregation_depth": aggregation_depth, "metadata_task_size": metadata_task_size, "kwargs": kwargs, }
def test_empty_row_group(tempdir): fname = os.path.join(tempdir, 'temp.parq') data = pd.DataFrame({'o': np.random.choice(['hello', 'world'], size=1000)}) writer.write(fname, data, row_group_offsets=[0, 900, 1800]) pf = ParquetFile(fname) assert len(pf.row_groups) == 2
def _collect_file_parts( cls, pf_or_files, dataset_info_kwargs, ): # Collect necessary information from dataset_info fs = dataset_info_kwargs["fs"] split_row_groups = dataset_info_kwargs["split_row_groups"] gather_statistics = dataset_info_kwargs["gather_statistics"] stat_col_indices = dataset_info_kwargs["stat_col_indices"] filters = dataset_info_kwargs["filters"] dtypes = dataset_info_kwargs["dtypes"] chunksize = dataset_info_kwargs["chunksize"] aggregation_depth = dataset_info_kwargs["aggregation_depth"] base_path = dataset_info_kwargs.get("base_path", None) root_cats = dataset_info_kwargs.get("root_cats", None) root_file_scheme = dataset_info_kwargs.get("root_file_scheme", None) has_metadata_file = dataset_info_kwargs["has_metadata_file"] # Get ParquetFile if not isinstance(pf_or_files, fastparquet.api.ParquetFile): # Construct local `ParquetFile` object pf = ParquetFile( pf_or_files, open_with=fs.open, root=base_path, ) # Update hive-partitioning to match global cats/scheme pf.cats = root_cats or {} if root_cats: pf.file_scheme = root_file_scheme else: # We already have a ParquetFile object to work with pf = pf_or_files # Organize row-groups by file ( file_row_groups, file_row_group_stats, file_row_group_column_stats, gather_statistics, base_path, ) = cls._organize_row_groups( pf, split_row_groups, gather_statistics, stat_col_indices, filters, dtypes, base_path, has_metadata_file, chunksize, aggregation_depth, ) # Convert organized row-groups to parts parts, stats = _row_groups_to_parts( gather_statistics, split_row_groups, aggregation_depth, file_row_groups, file_row_group_stats, file_row_group_column_stats, stat_col_indices, cls._make_part, make_part_kwargs={ "fs": fs, "pf": pf, "base_path": base_path, "partitions": pf.info.get("partitions", None), }, ) return parts, stats
def get_parquet(name): path = join(STORAGE_PATH, 'parq', '{}.parq'.format(name)) pf = ParquetFile(path) return pf.to_pandas()
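# Hypothetical usage: the name maps to <STORAGE_PATH>/parq/<name>.parq;
# 'trips' is a made-up dataset name.
trips = get_parquet('trips')
print(trips.head())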
import pandas as pd import numpy as np from fastparquet import ParquetFile pf = ParquetFile('adult.parq') df = pf.to_pandas() df.to_csv('adult.csv')
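# The reverse conversion as a minimal sketch with fastparquet's write();
# the output file name and GZIP compression are assumptions.
import pandas as pd
from fastparquet import write

df = pd.read_csv('adult.csv')
write('adult_roundtrip.parq', df, compression='GZIP')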
def test_empty_df(): p = ParquetFile(os.path.join(TEST_DATA, "empty.parquet")) df = p.to_pandas() assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__'] assert len(df) == 0
def _determine_pf_parts(fs, paths, gather_statistics, ignore_metadata_file, **kwargs): """Determine how to access metadata and break read into ``parts`` This logic is mostly to handle `gather_statistics=False` cases, because this also means we should avoid scanning every file in the dataset. If _metadata is available, set `gather_statistics=True` (if `gather_statistics=None`). """ parts = [] if len(paths) > 1: paths, base, fns = _sort_and_analyze_paths(paths, fs) # Check if _metadata is in paths, and # remove it if ignore_metadata_file=True _metadata_exists = "_metadata" in fns if _metadata_exists and ignore_metadata_file: fns.remove("_metadata") paths = [fs.sep.join([base, fn]) for fn in fns] _metadata_exists = False if gather_statistics is not False: # This scans all the files, allowing index/divisions # and filtering if _metadata_exists: paths_use = fs.sep.join([base, "_metadata"]) else: paths_use = paths pf = ParquetFile( paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}) ) else: if _metadata_exists: # We have a _metadata file, lets use it pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}), ) else: # Rely on metadata for 0th file. # Will need to pass a list of paths to read_partition scheme = get_file_scheme(fns) pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) parts = paths.copy() elif fs.isdir(paths[0]): # This is a directory. # Check if _metadata and/or _common_metadata files exists base = paths[0] _metadata_exists = _common_metadata_exists = True if not ignore_metadata_file: _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"])) _common_metadata_exists = fs.isfile(fs.sep.join([base, "_common_metadata"])) # Find all files if we are not using a _metadata file if ignore_metadata_file or not _metadata_exists: # For now, we need to discover every file under paths[0] paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs) _common_metadata_exists = "_common_metadata" in fns if "_metadata" in fns: fns.remove("_metadata") paths = [fs.sep.join([base, fn]) for fn in fns] _metadata_exists = False if _metadata_exists: # Using _metadata file (best-case scenario) pf = ParquetFile( fs.sep.join([base, "_metadata"]), open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}), ) if gather_statistics is None: gather_statistics = True elif gather_statistics is not False: # Scan every file pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {})) else: # Use _common_metadata file if it is available. 
# Otherwise, just use 0th file if _common_metadata_exists: pf = ParquetFile( fs.sep.join([base, "_common_metadata"]), open_with=fs.open, **kwargs.get("file", {}), ) fns.remove("_common_metadata") else: pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {})) scheme = get_file_scheme(fns) pf.file_scheme = scheme pf.cats = paths_to_cats(fns, scheme) parts = [fs.sep.join([base, fn]) for fn in fns] else: # There is only one file to read base = None pf = ParquetFile( paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {}) ) # Ensure that there is no overlap between partition columns # and explicit columns in `pf` if pf.cats: _partitions = [p for p in pf.cats if p not in pf.columns] if not _partitions: pf.cats = {} elif len(_partitions) != len(pf.cats): raise ValueError( "No partition-columns should be written in the \n" "file unless they are ALL written in the file.\n" "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys()) ) return parts, pf, gather_statistics, base
############# ## PARQUET ## ############# import pandas as pd from fastparquet import write, ParquetFile from pyspark.sql import SQLContext ## Dummy variables fPath = '/path/to/file1.parq' df1 = pd.DataFrame([[1,'B','C'],[2,'D','E']], columns=list('abc')) # Fastparquet write(fPath, df1, append=False) # Write; Read df2 = ParquetFile(fPath).to_pandas(['a','b'], filters=[('a','in',[1])]) df3 = df2[df2.a==1] # Subset # SqlContext sqlCtx.createDataFrame(df1).write.parquet(fPath) # Write directory df4 = spark.read.parquet(fPath) # Read file | directory ########## ## AVRO ## ########## import pandas as pd from fastavro import writer, reader ## Schema declaration schema = { 'doc' : 'Counting Office Inventory.' ,
metavar='id', type=str, help='Event id of interest') args = parser.parse_args() event_id = args.event_id if os.path.exists('events_with_dma.json'): combined_data = pd.read_json('events_with_dma.json', orient='records').set_index('event_id') find_recommended_events(combined_data, event_id) else: datafiles = [f for f in glob('*.snappy.parquet')] print(datafiles) combined_data = pd.concat([ ParquetFile(filename).to_pandas() for filename in datafiles ]).set_index('event_id') dma = pd.read_csv('DMA-zip.csv').set_index('ZIPCODE') event_dmas = [] for z in combined_data['venue_zip']: if len(z) == 5: if int(z) in dma.index: event_dmas.append(str(dma.loc[int(z), 'DMA CODE'])) else: event_dmas.append(np.nan) else: if int(z[:5]) in dma.index: event_dmas.append(str(dma.loc[int(z[:5]), 'DMA CODE'])) else: event_dmas.append(np.nan)