'--log-interval',
    type=int,
    default=10,
    metavar='N',
    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load data
project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
data_dir = os.path.join(project_dir, 'data', 'interim', 'data.parq')
pf = ParquetFile(data_dir)
data = pf.to_pandas()

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(data.values,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)
test_loader = torch.utils.data.DataLoader(data.values,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          **kwargs)

# Params
noise_variance_var = 0.1
Example 2
def test_datetime_partition_no_duplicates(tempdir, partitions):
    df = pd.DataFrame({'partitions': partitions, 'x': [1, 2]})
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    with pytest.raises(ValueError,
                       match=r'Partition names map to the same value.*'):
        ParquetFile(tempdir)
Example 3
def test_unicode_cols(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({u"région": [1, 2, 3]})
    write(fn, df)
    pf = ParquetFile(fn)
    pf.to_pandas()
Example 4
def time_column():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 10000000
        r = np.random.randint(-1e10, 1e10, n, dtype='int64')
        d = pd.DataFrame({
            'w':
            pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)),
            'x':
            r.view('timedelta64[ns]'),
            'y':
            r / np.random.randint(1, 1000, size=n),
            'z':
            np.random.randint(0, 127, size=n, dtype=np.uint8)
        })

        for col in d.columns:
            df = d[[col]]
            write(fn, df)
            with measure('%s: write, no nulls' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls' % d.dtypes[col], result):
                pf.to_pandas(categories={'w': 3})

            with measure('%s: write, no nulls, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, no nulls, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

            if d.dtypes[col].kind == 'm':
                d.loc[n // 2, col] = pd.to_datetime('NaT')
            elif d.dtypes[col].kind == 'f':
                d.loc[n // 2, col] = np.nan
            elif d.dtypes[col].kind in ['i', 'u']:
                continue
            else:
                d.loc[n // 2, col] = None
            with measure('%s: write, with null, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

            with measure(
                    '%s: write, with null, has_null=False' % d.dtypes[col],
                    result):
                write(fn, df, has_nulls=False)

            pf = ParquetFile(fn)
            pf.to_pandas(categories={'w': 3})  # warm-up

            with measure('%s: read, with null, has_null=False' % d.dtypes[col],
                         result):
                pf.to_pandas(categories={'w': 3})

        return result
Example 5
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({
        'a': [1, 2, 3, 0],
        'aa': [1, 2, 3, None],
        'b': [1., 2., 3., np.nan],
        'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
        'd': ['a', 'b', 'c', None],
        'f': [True, False, True, True],
        'ff': [True, False, None, True]
    })
    df['e'] = df['d'].astype('category')
    df['bb'] = df['b'].astype('object')
    df['aaa'] = df['a'].astype('object')
    object_cols = ['d', 'ff', 'bb', 'aaa']
    test_cols = list(set(df) - set(object_cols)) + ['d']
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises((TypeError, AttributeError)):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    tm.assert_frame_equal(df[test_cols],
                          df2[test_cols],
                          check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # not giving any value is the same as has_nulls=True
    write(fn, df)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    tm.assert_frame_equal(df[test_cols],
                          df2[test_cols],
                          check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # 'infer' is the new recommended auto-null
    write(fn, df, has_nulls='infer')
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols],
                          df2[test_cols],
                          check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # but legacy None still works
    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols],
                          df2[test_cols],
                          check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
Example 6
# encoding: utf-8

"""

@Author: wanghuagang
@Contact: [email protected]
@Project: StudyPython
@File:  d1
@Date: 2019/6/28 2:16 PM
@Description:

"""
import pandas as pd
from fastparquet import ParquetFile

parquet_file = 'shunqiwang.parquet'
pf = ParquetFile(parquet_file)
df = pf.to_pandas()  # type: pd.DataFrame

print(df.head(3))
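For completeness, a file like the one read above could have been produced with fastparquet's own write function; a minimal sketch with a hypothetical DataFrame and output name:

from fastparquet import write

# hypothetical data; 'example.parquet' stands in for a real dataset path
df_out = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})
write('example.parquet', df_out)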
Example 7
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, let's use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            paths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True

            elif gather_statistics is not False:
                # Scan every file
                pf = ParquetFile(paths,
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    pf = ParquetFile(base + fs.sep + "_common_metadata",
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                else:
                    pf = ParquetFile(paths[0],
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
        else:
            # There is only one file to read
            pf = ParquetFile(paths[0],
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))

    return parts, pf, gather_statistics
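The `_metadata` shortcut preferred by this helper is what fastparquet produces when writing a hive-partitioned dataset; a small local sketch (the directory name is hypothetical):

import pandas as pd
from fastparquet import write, ParquetFile

df = pd.DataFrame({'part': ['a', 'a', 'b'], 'x': [1, 2, 3]})
write('dataset_dir', df, file_scheme='hive', partition_on=['part'])

# opening the directory picks up the summary _metadata file
pf = ParquetFile('dataset_dir')
print(pf.cats)  # partition column discovered from the paths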
Example 8
    def read_partition(cls, fs, piece, columns, index, categories=(),
                       **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        if isinstance(piece, tuple):
            if isinstance(piece[0], str):
                # We have a path to read from
                assert parquet_file is None
                parquet_file = ParquetFile(piece[0],
                                           open_with=fs.open,
                                           sep=fs.sep,
                                           **kwargs.get("file", {}))
                rg_indices = piece[1] or list(
                    range(len(parquet_file.row_groups)))

                # `piece[1]` will contain row-group indices
                row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
            elif parquet_file:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                row_groups = piece[0]
                if isinstance(row_groups, bytes):
                    row_groups = pickle.loads(row_groups)
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats
            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            parquet_file._dtypes = (lambda *args: parquet_file.dtypes
                                    )  # ugly patch, could be fixed

            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    ))
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `piece` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(piece)}")
Example 9
def test_write_index_false(tempdir):
    fn = os.path.join(tempdir, 'test.parquet')
    df = pd.DataFrame(0, columns=['a'], index=range(1, 3))
    write(fn, df, write_index=False)
    rec_df = ParquetFile(fn).to_pandas()
    assert rec_df.index[0] == 0
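For contrast, a sketch (not from the original suite) of the default behaviour, where a non-trivial index is written and restored on read:

def test_write_index_default_sketch(tempdir):
    fn = os.path.join(tempdir, 'test_index.parquet')
    df = pd.DataFrame(0, columns=['a'], index=range(1, 3))
    write(fn, df)  # write_index left at its default
    rec_df = ParquetFile(fn).to_pandas()
    # the original index [1, 2] should round-trip
    assert rec_df.index[0] == 1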
Example 10
def load_instances(parq_file, motifs=None, dedup=True, verbose=True):
    """Load pattern instances from the parquet file

    Args:
      parq_file: parquet file of motif instances
      motifs: dictionary of motifs of interest.
        key=custom motif name, value=short pattern name (e.g. {'Nanog': 'm0_p3'})

    """
    if motifs is not None:
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile

            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            patterns = [shorten_pattern(pn) for pn in incl_motifs]
            dfi = pf.to_pandas(filters=[("pattern_short", "in", patterns)])
        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
            if 'pattern' not in dfi:
                # assumes a hive-stored file
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        dfi = dfi[dfi.pattern.isin(
            incl_motifs)]  # NOTE this should already be removed
        if 'pattern_short' not in dfi:
            dfi['pattern_short'] = dfi['pattern'].map(
                {k: shorten_pattern(k)
                 for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k
             for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in dfi.pattern.unique()})

    # add some columns if they don't yet exist
    if 'pattern_start_abs' not in dfi:
        dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    if 'pattern_end_abs' not in dfi:
        dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates([
            'pattern', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs',
            'strand'
        ])

        # number of removed duplicates
        d = len(dfi) - len(dfi_dedup)
        if verbose:
            print("number of de-duplicated instances:", d,
                  f"({d / len(dfi) * 100}%)")

        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
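A usage sketch following the docstring's own example; the parquet path is hypothetical:

dfi = load_instances('motif_instances.parq',
                     motifs={'Nanog': 'm0_p3'},
                     dedup=True)
print(dfi[['pattern_name', 'example_chrom', 'pattern_start_abs']].head())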
Example 11
def getNOAAData(month, yr):
    """Function to get noaa data for the month"""

    # Read station data from file that was stored in s3
    try:
        try:
            s3 = s3fs.S3FileSystem()
            myopen = s3.open
            s3_resource = boto3.resource('s3')
            s3_resource.Object('midscapstone-whos-polluting-my-air',
                               'UtilFiles/uniq_station_data.parquet').load()
            pf = ParquetFile(
                'midscapstone-whos-polluting-my-air/UtilFiles/uniq_station_data.parquet',
                open_with=myopen)
            unique_station_df = pf.to_pandas()
        except:
            raise CustomError("FILE ERROR: Unique Station Dataframe not found")

        # List of NOAA stations in the 35 < lat < 40 and  -125 < lon < -120 bounding box
        station_list = [
            'KAPC', 'KBLU', 'KCCR', 'KHWD', 'KLVK', 'KMAE', 'KMCE', 'KMOD',
            'KMRY', 'KMYV', 'KNUQ', 'KOAK', 'KOVE', 'KPRB', 'KSAC', 'KSBP',
            'KSCK', 'KSFO', 'KSJC', 'KSMF', 'KSNS', 'KSTS', 'KUKI', 'KVCB',
            'KWVI'
        ]

        # Get NOAA data for desired stations in a list
        lines = []  # an array of each read line
        bucket = "midscapstone-whos-polluting-my-air"
        s3 = boto3.client('s3')
        for station in station_list:
            try:
                file_name = "AsosRaw/64010{0}20{2}{1}".format(
                    station, month, yr)
                obj = s3.get_object(Bucket=bucket, Key=file_name)
                df = pd.read_csv(obj['Body'], header=None)
                df.columns = ['dataval']
                for indx, line in df.iterrows():
                    lines.append(line['dataval'])
            except Exception as e:
                print("*** EXCEPTION IN GET NOAA DATA ITERROWS {}: {}".format(
                    line, e))

        # Create noaa dataframe for the month
        noaa_df = createNOAAdf(lines, '20' + yr + month)

        # Drop rows where wind speed is not numeric
        noaa_df = noaa_df[noaa_df.wind_speed != 'T']
        merged_noaa_df = pd.merge(noaa_df, unique_station_df, on='wban_number')
        # Convert data type of numeric columns
        merged_noaa_df[['wind_speed', 'gust_speed', 'lat',
                        'lon']] = merged_noaa_df[[
                            'wind_speed', 'gust_speed', 'lat', 'lon'
                        ]].apply(pd.to_numeric)

        # Get data for bounding box
        bay_noaa_df = merged_noaa_df[(merged_noaa_df.lat > 35)
                                     & (merged_noaa_df.lat < 40)
                                     & (merged_noaa_df.lon > -125) &
                                     (merged_noaa_df.lon < -120)]
        bay_noaa_df.reset_index(inplace=True, drop=True)
        bay_noaa_df['datetime'] = bay_noaa_df[[
            'year', 'month', 'day', 'hour', 'minute'
        ]].apply(lambda x: int(''.join(x)), axis=1)

        return bay_noaa_df
    except Exception as e:
        print("*** EXCEPTION IN GET NOAA DATA *** {}".format(e))
        return None
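A possible invocation; the zero-padded month and two-digit year strings are assumptions inferred from the S3 key template used above:

june_2019_df = getNOAAData('06', '19')
if june_2019_df is not None:
    print(june_2019_df[['wban_number', 'wind_speed', 'lat', 'lon']].head())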
Example 12
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).

    The `fast_metadata` output specifies that ParquetFile metadata parsing
    is fast enough for each worker to perform during `read_partition`. The
    value will be set to True if: (1) The path is a directory containing
    _metadata, (2) the path is a list of files containing _metadata, (3)
    there is only one file to read, or (4) `gather_statistics` is False.
    In other cases, the ParquetFile object will need to be stored in the
    task graph, because metadata parsing is too expensive.
    """
    parts = []
    fast_metadata = True
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
                fast_metadata = False
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(paths_use,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            if "_metadata" in fns:
                # We have a _metadata file, let's use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(base + fs.sep + "_metadata",
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             **kwargs.get("file", {}))
            fast_metadata = False
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(base + fs.sep + "_common_metadata",
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = paths.copy()
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(paths[0],
                         open_with=fs.open,
                         sep=fs.sep,
                         **kwargs.get("file", {}))

    return parts, pf, gather_statistics, fast_metadata, base
Example 13
    def read_partition(cls,
                       fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(pf[0],
                                     open_with=fs.open,
                                     sep=fs.sep,
                                     **kwargs.get("file", {}))
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(rg_piece,
                                          columns,
                                          categories,
                                          index=index,
                                          **kwargs.get("read", {}))
Example 14
def fetch(filesystem: S3FileSystem, bucket: str, s3_uri: str) -> DataFrame:
    """Collect a file from S3 URI."""
    paths = list_files(filesystem, bucket, s3_uri)
    parquet = ParquetFile(paths, open_with=filesystem.open)

    return parquet.to_pandas()
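A usage sketch; the bucket and prefix are hypothetical, and list_files is assumed to be the helper from the same module:

import s3fs

fs = s3fs.S3FileSystem(anon=False)
events_df = fetch(fs, 'my-bucket', 's3://my-bucket/data/events/')
print(events_df.shape)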
Example 15
    def read_partition(cls, fs, pieces, columns, index, categories=(), **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        # Always convert pieces to list
        if not isinstance(pieces, list):
            pieces = [pieces]

        sample = pieces[0]
        if isinstance(sample, tuple):
            if isinstance(sample[0], str):
                # We have paths to read from
                assert parquet_file is None

                row_groups = []
                rg_offset = 0
                parquet_file = ParquetFile(
                    [p[0] for p in pieces],
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {}),
                )
                for piece in pieces:
                    _pf = (
                        parquet_file
                        if len(pieces) == 1
                        else ParquetFile(
                            piece[0],
                            open_with=fs.open,
                            sep=fs.sep,
                            **kwargs.get("file", {}),
                        )
                    )
                    n_local_row_groups = len(_pf.row_groups)
                    local_rg_indices = piece[1] or list(range(n_local_row_groups))
                    row_groups += [
                        parquet_file.row_groups[rg + rg_offset]
                        for rg in local_rg_indices
                    ]
                    rg_offset += n_local_row_groups
                update_parquet_file = len(row_groups) < len(parquet_file.row_groups)

            elif parquet_file:

                row_groups = []
                for piece in pieces:
                    # `piece[1]` will contain actual row-group objects,
                    # but they may be pickled
                    rgs = piece[0]
                    if isinstance(rgs, bytes):
                        rgs = pickle.loads(rgs)
                    row_groups += rgs
                update_parquet_file = True

            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if update_parquet_file:
                with _FP_FILE_LOCK:
                    parquet_file.fmd.row_groups = row_groups
                    # NOTE: May lose cats after `_set_attrs` call
                    save_cats = parquet_file.cats
                    parquet_file._set_attrs()
                    parquet_file.cats = save_cats

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            parquet_file._dtypes = (
                lambda *args: parquet_file.dtypes
            )  # ugly patch, could be fixed

            if set(columns).issubset(
                parquet_file.columns + list(parquet_file.cats.keys())
            ):
                # Convert ParquetFile to pandas
                return parquet_file.to_pandas(
                    columns=columns,
                    categories=categories,
                    index=index,
                )
            else:
                # Read necessary row-groups and concatenate
                dfs = []
                for row_group in row_groups:
                    dfs.append(
                        parquet_file.read_row_group_file(
                            row_group,
                            columns,
                            categories,
                            index=index,
                            **kwargs.get("read", {}),
                        )
                    )
                return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `sample` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(sample)}")
Example 16
def test_mixed_partition_types(tempdir, partitions):
    df = pd.DataFrame({'partitions': partitions, 'x': [1, 2]})
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    out = ParquetFile(tempdir).to_pandas()
    assert (out.sort_values("x").set_index("x").partitions == df.sort_values(
        "x").set_index("x").partitions).all()
Example 17
def time_column():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 10000000
        r = np.random.randint(-1e10, 1e10, n, dtype='int64')
        d = pd.DataFrame({
            'w':
            pd.Categorical(np.random.choice(['hi', 'you', 'people'], size=n)),
            'x':
            r.view('timedelta64[ns]'),
            'y':
            r / np.random.randint(1, 1000, size=n),
            'z':
            np.random.randint(0, 127, size=n, dtype=np.uint8)
        })
        d['b'] = r > 0

        for col in d.columns:
            df = d[[col]]
            write(fn, df)
            with measure('%s: write, no nulls' % d.dtypes[col], result):
                write(fn, df, has_nulls=False)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            with measure("file open", result):
                ParquetFile(fn)

            if col == 'x':
                assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()  # warm-up
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, no nulls' % d.dtypes[col], result):
                pf.to_pandas()

            with measure('%s: write, no nulls, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()  # warm-up
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, no nulls, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas()

            if d.dtypes[col].kind == 'm':
                d.loc[n // 2, col] = pd.to_datetime('NaT')
            elif d.dtypes[col].kind == 'f':
                d.loc[n // 2, col] = np.nan
            elif d.dtypes[col].kind in ['i', 'u']:
                continue
            else:
                d.loc[n // 2, col] = None
            with measure('%s: write, with null, has_null=True' % d.dtypes[col],
                         result):
                write(fn, df, has_nulls=True)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()  # warm-up
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, with null, has_null=True' % d.dtypes[col],
                         result):
                pf.to_pandas()

            with measure(
                    '%s: write, with null, has_null=False' % d.dtypes[col],
                    result):
                write(fn, df, has_nulls=False)  #, compression="SNAPPY")

            pf = ParquetFile(fn)
            if col == 'x':
                assert (pf.to_pandas().x.astype('timedelta64[us]') == df.x.astype(
                    'timedelta64[us]')).all()  # warm-up
            else:
                assert (pf.to_pandas() == df).values.all()  # warm-up

            with measure('%s: read, with null, has_null=False' % d.dtypes[col],
                         result):
                pf.to_pandas()

        return result
Example 18
def test_path_containing_metadata_df():
    p = ParquetFile(os.path.join(TEST_DATA, "dir_metadata", "empty.parquet"))
    df = p.to_pandas()
    assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__']
    assert len(df) == 0
Example 19
if __name__ == "__main__":
    ROOT_PATH = "C:\\tmp\\"
    test_file = ROOT_PATH + "postcodes.parquet"
    write_file = "c:\\tmp\\write.test.parquet"
    REPEAT = 4

    read_times = []
    write_gzip_times = []
    write_snappy_times = []
    write_uncompressed_times = []

    print("reading and writing {} {} times...".format(test_file, REPEAT))
    for i in range(0, REPEAT):
        start_time = time.time()
        pf = ParquetFile(test_file)
        df = pf.to_pandas()
        read_times.append(time.time() - start_time)
        print("file read in {}".format(get_elapsed_time(start_time)))

        start_time = time.time()
        write(write_file, df, compression="UNCOMPRESSED")
        write_uncompressed_times.append(time.time() - start_time)
        print("written uncompressed")

        start_time = time.time()
        write(write_file, df, compression="GZIP")
        write_gzip_times.append(time.time() - start_time)
        print("written gzip")

        start_time = time.time()
Example 20
    def _collect_dataset_info(
        cls,
        paths,
        fs,
        categories,
        index,
        gather_statistics,
        filters,
        split_row_groups,
        chunksize,
        aggregate_files,
        ignore_metadata_file,
        metadata_task_size,
        parquet_file_extension,
        kwargs,
    ):

        # Define the parquet-file (pf) object to use for metadata,
        # Also, initialize `parts`.  If `parts` is populated here,
        # then each part will correspond to a file.  Otherwise, each part will
        # correspond to a row group (populated later).

        # Extract "supported" key-word arguments from `kwargs`.
        # Split items into `dataset_kwargs` and `read_kwargs`
        dataset_kwargs, read_kwargs, user_kwargs = _split_user_options(
            **kwargs)

        parts = []
        _metadata_exists = False
        if len(paths) == 1 and fs.isdir(paths[0]):

            # This is a directory.
            # Check if _metadata and/or _common_metadata files exists
            base = paths[0]
            _metadata_exists = True
            if not ignore_metadata_file:
                _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

            # Find all files if we are not using a _metadata file
            if ignore_metadata_file or not _metadata_exists:
                # For now, we need to discover every file under paths[0]
                paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
                _update_paths = False
                for fn in ["_metadata", "_common_metadata"]:
                    try:
                        fns.remove(fn)
                        _update_paths = True
                    except ValueError:
                        pass
                if _update_paths:
                    paths = [fs.sep.join([base, fn]) for fn in fns]
                _metadata_exists = False
            if _metadata_exists:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **dataset_kwargs,
                )
            else:
                # Use 0th file
                # Note that "_common_metadata" can cause issues for
                # partitioned datasets.
                if parquet_file_extension:
                    # Raise error if all files have been filtered by extension
                    len0 = len(paths)
                    paths = [
                        path for path in paths
                        if path.endswith(parquet_file_extension)
                    ]
                    if len0 and paths == []:
                        raise ValueError(
                            "No files satisfy the `parquet_file_extension` criteria "
                            f"(files must end with {parquet_file_extension}).")
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **dataset_kwargs)
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = [fs.sep.join([base, fn]) for fn in fns]
        else:
            # This is a list of files
            paths, base, fns = _sort_and_analyze_paths(paths, fs)

            # Check if _metadata is in paths, and
            # remove it if ignore_metadata_file=True
            _metadata_exists = "_metadata" in fns
            if _metadata_exists and ignore_metadata_file:
                fns.remove("_metadata")
                _metadata_exists = False
            paths = [fs.sep.join([base, fn]) for fn in fns]

            if _metadata_exists:
                # We have a _metadata file, let's use it
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **dataset_kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **dataset_kwargs)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = paths.copy()

        # Check the `aggregate_files` setting
        aggregation_depth = _get_aggregation_depth(
            aggregate_files,
            list(pf.cats),
        )

        # Ensure that there is no overlap between partition columns
        # and explicit columns in `pf`
        if pf.cats:
            _partitions = [p for p in pf.cats if p not in pf.columns]
            if not _partitions:
                pf.cats = {}
            elif len(_partitions) != len(pf.cats):
                raise ValueError(
                    "No partition-columns should be written in the \n"
                    "file unless they are ALL written in the file.\n"
                    "columns: {} | partitions: {}".format(
                        pf.columns, pf.cats.keys()))

        return {
            "pf": pf,
            "paths": paths,
            "has_metadata_file": _metadata_exists,
            "parts": parts,
            "base": base,
            "fs": fs,
            "gather_statistics": gather_statistics,
            "categories": categories,
            "index": index,
            "filters": filters,
            "split_row_groups": split_row_groups,
            "chunksize": chunksize,
            "aggregate_files": aggregate_files,
            "aggregation_depth": aggregation_depth,
            "metadata_task_size": metadata_task_size,
            "kwargs": {
                "dataset": dataset_kwargs,
                "read": read_kwargs,
                **user_kwargs,
            },
        }
Example 21
import pickle

import pandas as pd

from lenskit.algorithms.implicit import BPR

from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit import batch, topn, util
from tf_idf import tf_idf

file = open("pairs_user_new.pickle", "rb")
pairs_user = pickle.load(file)

truth = pd.concat((p.test for p in pairs_user))

from fastparquet import ParquetFile

result = pd.DataFrame()
pf = ParquetFile('results/steam/pruned_5_new/recommendations.parquet')
for df in pf.iter_row_groups():
    truncated = df.loc[df['rank'] < 1001]
    result = result.append(truncated, sort=False)

#result.to_parquet('results/steam/pruned_5_new/recs.parquet')
#result.to_csv("results/steam/pruned_5_new/recs.csv")


def RR(rec, truth):
    #recs = pd.read_parquet(file_name)
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.recip_rank)
    RR_result = rla.compute(rec, truth)
    return RR_result
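A usage sketch applying the helper to the truncated recommendations assembled above (the 'recip_rank' column name is assumed to follow lenskit's metric naming):

rr_per_user = RR(result, truth)
print(rr_per_user['recip_rank'].mean())  # mean reciprocal rank over users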
Example 22
    def _collect_dataset_info(
            cls,
            paths,
            fs,
            categories,
            index,
            gather_statistics,
            filters,
            split_row_groups,
            chunksize,
            aggregate_files,
            ignore_metadata_file,
            metadata_task_size,
            require_extension=(".parq", ".parquet"),
            **kwargs,
    ):

        # Define the parquet-file (pf) object to use for metadata,
        # Also, initialize `parts`.  If `parts` is populated here,
        # then each part will correspond to a file.  Otherwise, each part will
        # correspond to a row group (populated later).
        #
        # This logic is mostly to handle `gather_statistics=False` cases,
        # because this also means we should avoid scanning every file in the
        # dataset.  If _metadata is available, set `gather_statistics=True`
        # (if `gather_statistics=None`).

        parts = []
        _metadata_exists = False
        if len(paths) == 1 and fs.isdir(paths[0]):

            # This is a directory.
            # Check if _metadata and/or _common_metadata files exists
            base = paths[0]
            _metadata_exists = True
            if not ignore_metadata_file:
                _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

            # Find all files if we are not using a _metadata file
            if ignore_metadata_file or not _metadata_exists:
                # For now, we need to discover every file under paths[0]
                paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
                _update_paths = False
                for fn in ["_metadata", "_common_metadata"]:
                    try:
                        fns.remove(fn)
                        _update_paths = True
                    except ValueError:
                        pass
                if _update_paths:
                    paths = [fs.sep.join([base, fn]) for fn in fns]
                _metadata_exists = False

            if require_extension:
                paths = [
                    path for path in paths if path.endswith(require_extension)
                ]
            if _metadata_exists:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
                if gather_statistics is None:
                    gather_statistics = True
            else:
                # Use 0th file
                # Note that "_common_metadata" can cause issues for
                # partitioned datasets.
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = [fs.sep.join([base, fn]) for fn in fns]
        else:
            # This is a list of files
            paths, base, fns = _sort_and_analyze_paths(paths, fs)

            # Check if _metadata is in paths, and
            # remove it if ignore_metadata_file=True
            _metadata_exists = "_metadata" in fns
            if _metadata_exists and ignore_metadata_file:
                fns.remove("_metadata")
                _metadata_exists = False
            if require_extension:
                fns = [fn for fn in fns if fn.endswith(require_extension)]
            paths = [fs.sep.join([base, fn]) for fn in fns]

            if _metadata_exists:
                # We have a _metadata file, let's use it
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = paths.copy()

        # Check the `aggregate_files` setting
        aggregation_depth = _get_aggregation_depth(
            aggregate_files,
            list(pf.cats),
        )

        # Ensure that there is no overlap between partition columns
        # and explicit columns in `pf`
        if pf.cats:
            _partitions = [p for p in pf.cats if p not in pf.columns]
            if not _partitions:
                pf.cats = {}
            elif len(_partitions) != len(pf.cats):
                raise ValueError(
                    "No partition-columns should be written in the \n"
                    "file unless they are ALL written in the file.\n"
                    "columns: {} | partitions: {}".format(
                        pf.columns, pf.cats.keys()))

        return {
            "pf": pf,
            "paths": paths,
            "has_metadata_file": _metadata_exists,
            "parts": parts,
            "base": base,
            "fs": fs,
            "gather_statistics": gather_statistics,
            "categories": categories,
            "index": index,
            "filters": filters,
            "split_row_groups": split_row_groups,
            "chunksize": chunksize,
            "aggregate_files": aggregate_files,
            "aggregation_depth": aggregation_depth,
            "metadata_task_size": metadata_task_size,
            "kwargs": kwargs,
        }
Example 23
def test_empty_row_group(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world'], size=1000)})
    writer.write(fname, data, row_group_offsets=[0, 900, 1800])
    pf = ParquetFile(fname)
    assert len(pf.row_groups) == 2
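A companion sketch (not from the original suite) asserting the per-row-group sizes as well; with offsets [0, 900, 1800] on 1000 rows, the stored groups hold 900 and 100 rows:

def test_row_group_sizes_sketch(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world'], size=1000)})
    writer.write(fname, data, row_group_offsets=[0, 900, 1800])
    pf = ParquetFile(fname)
    # num_rows is read straight from the row-group metadata
    assert [rg.num_rows for rg in pf.row_groups] == [900, 100]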
Example 24
    def _collect_file_parts(
        cls,
        pf_or_files,
        dataset_info_kwargs,
    ):

        # Collect necessary information from dataset_info
        fs = dataset_info_kwargs["fs"]
        split_row_groups = dataset_info_kwargs["split_row_groups"]
        gather_statistics = dataset_info_kwargs["gather_statistics"]
        stat_col_indices = dataset_info_kwargs["stat_col_indices"]
        filters = dataset_info_kwargs["filters"]
        dtypes = dataset_info_kwargs["dtypes"]
        chunksize = dataset_info_kwargs["chunksize"]
        aggregation_depth = dataset_info_kwargs["aggregation_depth"]
        base_path = dataset_info_kwargs.get("base_path", None)
        root_cats = dataset_info_kwargs.get("root_cats", None)
        root_file_scheme = dataset_info_kwargs.get("root_file_scheme", None)
        has_metadata_file = dataset_info_kwargs["has_metadata_file"]

        # Get ParquetFile
        if not isinstance(pf_or_files, fastparquet.api.ParquetFile):
            # Construct local `ParquetFile` object
            pf = ParquetFile(
                pf_or_files,
                open_with=fs.open,
                root=base_path,
            )
            # Update hive-partitioning to match global cats/scheme
            pf.cats = root_cats or {}
            if root_cats:
                pf.file_scheme = root_file_scheme
        else:
            # We already have a ParquetFile object to work with
            pf = pf_or_files

        # Organize row-groups by file
        (
            file_row_groups,
            file_row_group_stats,
            file_row_group_column_stats,
            gather_statistics,
            base_path,
        ) = cls._organize_row_groups(
            pf,
            split_row_groups,
            gather_statistics,
            stat_col_indices,
            filters,
            dtypes,
            base_path,
            has_metadata_file,
            chunksize,
            aggregation_depth,
        )

        # Convert organized row-groups to parts
        parts, stats = _row_groups_to_parts(
            gather_statistics,
            split_row_groups,
            aggregation_depth,
            file_row_groups,
            file_row_group_stats,
            file_row_group_column_stats,
            stat_col_indices,
            cls._make_part,
            make_part_kwargs={
                "fs": fs,
                "pf": pf,
                "base_path": base_path,
                "partitions": pf.info.get("partitions", None),
            },
        )

        return parts, stats
Example 25
def get_parquet(name):
    path = join(STORAGE_PATH, 'parq', '{}.parq'.format(name))
    pf = ParquetFile(path)
    return pf.to_pandas()
Example 26
import pandas as pd
import numpy as np
from fastparquet import ParquetFile

pf = ParquetFile('adult.parq')
df = pf.to_pandas()

df.to_csv('adult.csv')
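The reverse conversion is just as short; a sketch that round-trips the CSV written above into a new, hypothetical parquet file:

from fastparquet import write

df_csv = pd.read_csv('adult.csv', index_col=0)
write('adult_roundtrip.parq', df_csv)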
Example 27
def test_empty_df():
    p = ParquetFile(os.path.join(TEST_DATA, "empty.parquet"))
    df = p.to_pandas()
    assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__']
    assert len(df) == 0
Example 28
def _determine_pf_parts(fs, paths, gather_statistics, ignore_metadata_file, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        paths, base, fns = _sort_and_analyze_paths(paths, fs)

        # Check if _metadata is in paths, and
        # remove it if ignore_metadata_file=True
        _metadata_exists = "_metadata" in fns
        if _metadata_exists and ignore_metadata_file:
            fns.remove("_metadata")
            paths = [fs.sep.join([base, fn]) for fn in fns]
            _metadata_exists = False

        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if _metadata_exists:
                paths_use = fs.sep.join([base, "_metadata"])
            else:
                paths_use = paths
            pf = ParquetFile(
                paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            if _metadata_exists:
                # We have a _metadata file, let's use it
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {}),
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory.
        # Check if _metadata and/or _common_metadata files exists
        base = paths[0]
        _metadata_exists = _common_metadata_exists = True
        if not ignore_metadata_file:
            _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))
            _common_metadata_exists = fs.isfile(fs.sep.join([base, "_common_metadata"]))

        # Find all files if we are not using a _metadata file
        if ignore_metadata_file or not _metadata_exists:
            # For now, we need to discover every file under paths[0]
            paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
            _common_metadata_exists = "_common_metadata" in fns
            if "_metadata" in fns:
                fns.remove("_metadata")
                paths = [fs.sep.join([base, fn]) for fn in fns]
            _metadata_exists = False

        if _metadata_exists:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                fs.sep.join([base, "_metadata"]),
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {}),
            )
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if _common_metadata_exists:
                pf = ParquetFile(
                    fs.sep.join([base, "_common_metadata"]),
                    open_with=fs.open,
                    **kwargs.get("file", {}),
                )
                fns.remove("_common_metadata")
            else:
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(
            paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
        )

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError(
                "No partition-columns should be written in the \n"
                "file unless they are ALL written in the file.\n"
                "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
            )

    return parts, pf, gather_statistics, base
Example 29
#############
## PARQUET ##
#############

import pandas as pd
from fastparquet import write, ParquetFile
from pyspark.sql import SQLContext

## Dummy variables
fPath = '/path/to/file1.parq'
df1 = pd.DataFrame([[1,'B','C'],[2,'D','E']], columns=list('abc'))

# Fastparquet
write(fPath, df1, append=False) # Write; Read
df2 = ParquetFile(fPath).to_pandas(['a','b'], filters=[('a','in',[1])])
df3 = df2[df2.a==1] # Subset
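fastparquet can also stream one row group at a time instead of loading the whole file; a minimal sketch reusing fPath (each chunk is a pandas DataFrame):

for chunk in ParquetFile(fPath).iter_row_groups():  # Chunked read
    print(len(chunk))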

# SqlContext
sqlCtx.createDataFrame(df1).write.parquet(fPath) # Write directory
df4 = spark.read.parquet(fPath) # Read file | directory

##########
## AVRO ##
##########

import pandas as pd
from fastavro import writer, reader

## Schema declaration
schema = {
    'doc'       : 'Counting Office Inventory.'  ,
Example 30
                    metavar='id',
                    type=str,
                    help='Event id of interest')
args = parser.parse_args()
event_id = args.event_id

if os.path.exists('events_with_dma.json'):
    combined_data = pd.read_json('events_with_dma.json',
                                 orient='records').set_index('event_id')
    find_recommended_events(combined_data, event_id)

else:
    datafiles = [f for f in glob('*.snappy.parquet')]
    print(datafiles)
    combined_data = pd.concat([
        ParquetFile(filename).to_pandas() for filename in datafiles
    ]).set_index('event_id')
    dma = pd.read_csv('DMA-zip.csv').set_index('ZIPCODE')
    event_dmas = []

    for z in combined_data['venue_zip']:
        if len(z) == 5:
            if int(z) in dma.index:
                event_dmas.append(str(dma.loc[int(z), 'DMA CODE']))
            else:
                event_dmas.append(np.nan)
        else:
            if int(z[:5]) in dma.index:
                event_dmas.append(str(dma.loc[int(z[:5]), 'DMA CODE']))
            else:
                event_dmas.append(np.nan)