Example #1
File: hdf.py  Project: zhuomingliang/pandas
class HDFStorePanel(BaseIO):

    goal_time = 0.2

    def setup(self):
        self.fname = '__test__.h5'
        with warnings.catch_warnings(record=True):
            self.p = Panel(np.random.randn(20, 1000, 25),
                           items=['Item%03d' % i for i in range(20)],
                           major_axis=date_range('1/1/2000', periods=1000),
                           minor_axis=['E%03d' % i for i in range(25)])
            self.store = HDFStore(self.fname)
            self.store.append('p1', self.p)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store_table_panel(self):
        with warnings.catch_warnings(record=True):
            self.store.select('p1')

    def time_write_store_table_panel(self):
        with warnings.catch_warnings(record=True):
            self.store.append('p2', self.p)
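The class above is an asv benchmark; run by hand it needs roughly the module-level imports sketched below, plus the suite's BaseIO base class for the remove() helper used in teardown. This is a rough sketch, and it only works on pandas releases that still ship Panel (before 1.0):

# Rough sketch of what the HDFStorePanel benchmark assumes (pre-1.0 pandas only,
# since Panel was removed in pandas 1.0):
import warnings
import numpy as np
from pandas import HDFStore, Panel, date_range

bench = HDFStorePanel()
bench.setup()                        # builds the Panel and appends it as 'p1'
bench.time_read_store_table_panel()  # timed read of the stored panel
bench.teardown()                     # closes and removes '__test__.h5'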
Example #2
class HdfStore(DataStore):
    complevel = 9
    complib = "blosc:zstd"

    def __init__(self,
                 path: str,
                 table: str,
                 compute: Optional[Callable] = None) -> None:
        self.table = table
        if compute:
            self.store = PandasHDFStore(path,
                                        complevel=self.complevel,
                                        complib=self.complib)
            dataframe = compute()
            dataframe.sort_values(by="where", axis=0, inplace=True)
            self._mangle_where(dataframe)
            self.store.put(
                self.table,
                dataframe,
                append=False,
                format="table",
                expectedrows=len(dataframe),
                data_columns=[
                    "where_", "where_type", "who", "who_type", "when",
                    "when_type"
                ],
            )
        else:
            self.store = PandasHDFStore(path,
                                        complevel=self.complevel,
                                        complib=self.complib,
                                        mode="r")

    def query(self, query: str) -> DataFrame:
        query = self._mangle_where_in_query(query)
        df = self.store.select(self.table, where=query)
        self._unmangle_where(df)
        return df

    def _mangle_where(self, df: DataFrame) -> None:
        # See: https://github.com/PyTables/PyTables/issues/638
        df.rename(columns={"where": "where_"}, inplace=True)

    def _unmangle_where(self, df: DataFrame) -> None:
        # See: https://github.com/PyTables/PyTables/issues/638
        df.rename(columns={"where_": "where"}, inplace=True)

    def _mangle_where_in_query(
            self, query: Union[str, List[str]]) -> Union[str, List[str]]:
        # See: https://github.com/PyTables/PyTables/issues/638
        if isinstance(query, str):
            return re.sub("where([^_])", "where_\\1", query)
        else:
            return [
                self._mangle_where_in_query(subquery) for subquery in query
            ]
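The underscore mangling works around querying a column literally named "where", which clashes with the where keyword in table queries (see the PyTables issue linked in the comments). A hypothetical usage sketch, assuming PandasHDFStore is pandas.HDFStore under an alias and DataStore is the project's own base class; the frame builder below is made up for illustration:

from pandas import DataFrame

def build_frame() -> DataFrame:
    # Stand-in for the real compute callable; column names match the
    # data_columns declared in __init__.
    return DataFrame({"where": ["Paris"], "where_type": ["city"],
                      "who": ["Alice"], "who_type": ["person"],
                      "when": ["2020"], "when_type": ["year"]})

store = HdfStore("facts.h5", table="facts", compute=build_frame)
df = store.query("where == 'Paris'")  # rewritten internally to "where_ == 'Paris'"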
Example #3
def predict_lm_per_store(data: pd.HDFStore,
                         select_idx: pd.Index,
                         features,
                         sales,
                         save_fit=False):
    store_train = data.select('train', select_idx,
                              columns=list(features)).set_index(select_idx)
    assert store_train.shape == (len(select_idx), len(features))
    logger.debug('Store train shape {}'.format(store_train.shape))
    logger.debug('Sales shape {}'.format(sales.shape))
    lm = linear_model.LinearRegression()
    fit = lm.fit(store_train, sales)

    pred = fit.predict(store_train)
    store_train['PredictedSales'] = pred
    return store_train
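The select_idx argument is a set of row coordinates (for example from HDFStore.select_as_coordinates), so the read only touches the matching rows. A minimal self-contained sketch of that pattern with made-up column names:

# Toy illustration of coordinate-based selection against a table store:
import numpy as np
import pandas as pd

with pd.HDFStore("demo.h5", mode="w") as store:
    frame = pd.DataFrame({"feat": np.arange(10.0), "Sales": np.arange(10.0) * 2})
    store.append("train", frame, data_columns=["Sales"])       # make Sales queryable
    idx = store.select_as_coordinates("train", "Sales > 10")   # row positions
    subset = store.select("train", where=idx)                  # only those rows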
Example #4
    def aggregate(hdf_store_loc,
                  file_pattern,
                  headerfile=None,
                  remove_part_files=False):
        df = None

        store = HDFStore(hdf_store_loc)
        store_keys = [w.replace('/', '') for w in store.keys()]

        print(
            f'Aggregating part files in {hdf_store_loc} for {file_pattern} into single file'
        )

        for key in store_keys:
            if re.match(file_pattern.replace('*', '.+'), key):
                print(
                    f'********************* Key : {key} matches pattern : {file_pattern.replace("*", ".+")}'
                )
                #thisdf = pd.read_hdf(store_loc, key)
                thisdf = store.select(key)

                if df is None:
                    df = thisdf
                else:
                    # for gz files that do not have headers, assign headers.
                    try:
                        df = df.append(thisdf, ignore_index=True, sort=True)
                    except Exception as e:
                        print(f'Error while joining data: {e}')

                if remove_part_files:
                    store.remove(key)

        try:
            #df.to_hdf(store_loc, key=file_pattern.replace('*',''))
            store.put(key=file_pattern.replace('*', ''), value=df)
        except Exception as e:
            print(
                f'Exception while combining files for {file_pattern}: {e}'
            )

        store.close()
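DataFrame.append, used in the loop above, was deprecated in pandas 1.4 and removed in 2.0. On current pandas the same aggregation is usually written by collecting the parts and concatenating once; a sketch reusing the names from the example (and assuming pandas is imported as pd):

# Same aggregation with pd.concat instead of DataFrame.append:
parts = []
for key in store_keys:
    if re.match(file_pattern.replace('*', '.+'), key):
        parts.append(store.select(key))
        if remove_part_files:
            store.remove(key)
df = pd.concat(parts, ignore_index=True, sort=True) if parts else None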
Example #5

def from_one_to_three(table, entity):
    return [
        name
        for name, column in model.column_by_name.iteritems()
        if name in table.columns and column.entity == entity
        ]


# we could take the opportunity to build the index here? It would run a bit
# faster, but above all it would be more "essential"

for year in available_years:
    print "debut de l annee %s" %year
    table_in_one = store.select('survey_'+str(year))
    # delete some people on every table according to test_ident.py results
    print len(table_in_one)
    table_in_one =  table_in_one[ - table_in_one['idfam'].isin([700986003, 700202209, 700150006,
                                                                700165702, 701609502,
                                                                801132105, 802846205, 800571404,
                                                                901461205,
                                                                800199302, 802008401, 800422201, 802738601,
                                                                903972102, 901676301, 900817401])]
    table_in_one =  table_in_one[ - table_in_one['idmen'].isin([8009658,9046607,
                                                                8020084, 8001993, 8004222, 8027386,
                                                                9039721, 9047848, 9016763]) ]
    print len(table_in_one)
    for entity in ['ind','foy','men','fam']:
        key = 'survey_'+str(year) + '/'+str(entity)
        vars_entity = from_one_to_three(table_in_one,entity)
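The unary minus in front of the boolean masks above was accepted by the old pandas this script targets; current pandas rejects it, and the inversion is written with ~ instead. A short equivalent (excluded_idfam is a hypothetical name for the id list):

# Equivalent filter on current pandas, inverting the mask with ~:
excluded_idfam = [700986003, 700202209, 700150006]   # shortened for illustration
table_in_one = table_in_one[~table_in_one['idfam'].isin(excluded_idfam)]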
Example #6
File: hdf.py  Project: zhuomingliang/pandas
class HDFStoreDataFrame(BaseIO):

    goal_time = 0.2

    def setup(self):
        N = 25000
        index = tm.makeStringIndex(N)
        self.df = DataFrame(
            {
                'float1': np.random.randn(N),
                'float2': np.random.randn(N)
            },
            index=index)
        self.df_mixed = DataFrame(
            {
                'float1': np.random.randn(N),
                'float2': np.random.randn(N),
                'string1': ['foo'] * N,
                'bool1': [True] * N,
                'int1': np.random.randint(0, N, size=N)
            },
            index=index)
        self.df_wide = DataFrame(np.random.randn(N, 100))
        self.start_wide = self.df_wide.index[10000]
        self.stop_wide = self.df_wide.index[15000]
        self.df2 = DataFrame(
            {
                'float1': np.random.randn(N),
                'float2': np.random.randn(N)
            },
            index=date_range('1/1/2000', periods=N))
        self.start = self.df2.index[10000]
        self.stop = self.df2.index[15000]
        self.df_wide2 = DataFrame(np.random.randn(N, 100),
                                  index=date_range('1/1/2000', periods=N))
        self.df_dc = DataFrame(np.random.randn(N, 10),
                               columns=['C%03d' % i for i in range(10)])

        self.fname = '__test__.h5'

        self.store = HDFStore(self.fname)
        self.store.put('fixed', self.df)
        self.store.put('fixed_mixed', self.df_mixed)
        self.store.append('table', self.df2)
        self.store.append('table_mixed', self.df_mixed)
        self.store.append('table_wide', self.df_wide)
        self.store.append('table_wide2', self.df_wide2)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store(self):
        self.store.get('fixed')

    def time_read_store_mixed(self):
        self.store.get('fixed_mixed')

    def time_write_store(self):
        self.store.put('fixed_write', self.df)

    def time_write_store_mixed(self):
        self.store.put('fixed_mixed_write', self.df_mixed)

    def time_read_store_table_mixed(self):
        self.store.select('table_mixed')

    def time_write_store_table_mixed(self):
        self.store.append('table_mixed_write', self.df_mixed)

    def time_read_store_table(self):
        self.store.select('table')

    def time_write_store_table(self):
        self.store.append('table_write', self.df)

    def time_read_store_table_wide(self):
        self.store.select('table_wide')

    def time_write_store_table_wide(self):
        self.store.append('table_wide_write', self.df_wide)

    def time_write_store_table_dc(self):
        self.store.append('table_dc_write', self.df_dc, data_columns=True)

    def time_query_store_table_wide(self):
        self.store.select('table_wide',
                          where="index > self.start_wide and "
                          "index < self.stop_wide")

    def time_query_store_table(self):
        self.store.select('table',
                          where="index > self.start and "
                          "index < self.stop")

    def time_store_repr(self):
        repr(self.store)

    def time_store_str(self):
        str(self.store)

    def time_store_info(self):
        self.store.info()
Example #7
def test_multiple_open_close(setup_path):
    # gh-4409: open & close multiple times

    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        # single
        store = HDFStore(path)
        assert "CLOSED" not in store.info()
        assert store.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

    with ensure_clean_path(setup_path) as path:

        if pytables._table_file_open_policy_is_strict:
            # multiples
            store1 = HDFStore(path)
            msg = (
                r"The file [\S]* is already opened\.  Please close it before "
                r"reopening in write mode\."
            )
            with pytest.raises(ValueError, match=msg):
                HDFStore(path)

            store1.close()
        else:

            # multiples
            store1 = HDFStore(path)
            store2 = HDFStore(path)

            assert "CLOSED" not in store1.info()
            assert "CLOSED" not in store2.info()
            assert store1.is_open
            assert store2.is_open

            store1.close()
            assert "CLOSED" in store1.info()
            assert not store1.is_open
            assert "CLOSED" not in store2.info()
            assert store2.is_open

            store2.close()
            assert "CLOSED" in store1.info()
            assert "CLOSED" in store2.info()
            assert not store1.is_open
            assert not store2.is_open

            # nested close
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store2.append("df2", df)
            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            # double closing
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

    # ops on a closed store
    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        store = HDFStore(path)
        store.close()

        msg = r"[\S]* file is not open!"
        with pytest.raises(ClosedFileError, match=msg):
            store.keys()

        with pytest.raises(ClosedFileError, match=msg):
            "df" in store

        with pytest.raises(ClosedFileError, match=msg):
            len(store)

        with pytest.raises(ClosedFileError, match=msg):
            store["df"]

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.get("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.append("df2", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.put("df3", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.get_storer("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.remove("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        msg = "'HDFStore' object has no attribute 'df'"
        with pytest.raises(AttributeError, match=msg):
            store.df
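Much of the explicit close/ClosedFileError bookkeeping exercised by this test can be avoided in application code by opening the store as a context manager, which closes the file on exit:

# HDFStore supports the context-manager protocol; the file is closed on exit:
with HDFStore(path) as store:
    keys = store.keys()
# any further operation on `store` here raises ClosedFileError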
Example #8
def test_frame_select_complex2(setup_path):

    with ensure_clean_path(["params.hdf", "hist.hdf"]) as paths:

        pp, hh = paths

        # use non-trivial selection criteria
        params = DataFrame({"A": [1, 1, 2, 2, 3]})
        params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])

        selection = read_hdf(pp, "df", where="A=[2,3]")
        hist = DataFrame(
            np.random.randn(25, 1),
            columns=["data"],
            index=MultiIndex.from_tuples([(i, j) for i in range(5)
                                          for j in range(5)],
                                         names=["l1", "l2"]),
        )

        hist.to_hdf(hh, "df", mode="w", format="table")

        expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")

        # scope with list like
        l0 = selection.index.tolist()  # noqa:F841
        store = HDFStore(hh)
        result = store.select("df", where="l1=l0")
        tm.assert_frame_equal(result, expected)
        store.close()

        result = read_hdf(hh, "df", where="l1=l0")
        tm.assert_frame_equal(result, expected)

        # index
        index = selection.index  # noqa:F841
        result = read_hdf(hh, "df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        # scope with index
        store = HDFStore(hh)

        result = store.select("df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        store.close()
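The scoping shown above works because names in a where string are resolved from the caller's namespace, so an ordinary local variable can drive the selection. A minimal sketch against the same hist.hdf file:

wanted = [2, 3, 4]                              # plain local variable
result = read_hdf(hh, "df", where="l1=wanted")  # resolved from the caller's scope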
Example #9
File: hdf.py  Project: ygene2/pandas
class HDFStoreDataFrame(BaseIO):
    def setup(self):
        N = 25000
        index = tm.makeStringIndex(N)
        self.df = DataFrame(
            {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
        )
        self.df_mixed = DataFrame(
            {
                "float1": np.random.randn(N),
                "float2": np.random.randn(N),
                "string1": ["foo"] * N,
                "bool1": [True] * N,
                "int1": np.random.randint(0, N, size=N),
            },
            index=index,
        )
        self.df_wide = DataFrame(np.random.randn(N, 100))
        self.start_wide = self.df_wide.index[10000]
        self.stop_wide = self.df_wide.index[15000]
        self.df2 = DataFrame(
            {"float1": np.random.randn(N), "float2": np.random.randn(N)},
            index=date_range("1/1/2000", periods=N),
        )
        self.start = self.df2.index[10000]
        self.stop = self.df2.index[15000]
        self.df_wide2 = DataFrame(
            np.random.randn(N, 100), index=date_range("1/1/2000", periods=N)
        )
        self.df_dc = DataFrame(
            np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)]
        )

        self.fname = "__test__.h5"

        self.store = HDFStore(self.fname)
        self.store.put("fixed", self.df)
        self.store.put("fixed_mixed", self.df_mixed)
        self.store.append("table", self.df2)
        self.store.append("table_mixed", self.df_mixed)
        self.store.append("table_wide", self.df_wide)
        self.store.append("table_wide2", self.df_wide2)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store(self):
        self.store.get("fixed")

    def time_read_store_mixed(self):
        self.store.get("fixed_mixed")

    def time_write_store(self):
        self.store.put("fixed_write", self.df)

    def time_write_store_mixed(self):
        self.store.put("fixed_mixed_write", self.df_mixed)

    def time_read_store_table_mixed(self):
        self.store.select("table_mixed")

    def time_write_store_table_mixed(self):
        self.store.append("table_mixed_write", self.df_mixed)

    def time_read_store_table(self):
        self.store.select("table")

    def time_write_store_table(self):
        self.store.append("table_write", self.df)

    def time_read_store_table_wide(self):
        self.store.select("table_wide")

    def time_write_store_table_wide(self):
        self.store.append("table_wide_write", self.df_wide)

    def time_write_store_table_dc(self):
        self.store.append("table_dc_write", self.df_dc, data_columns=True)

    def time_query_store_table_wide(self):
        self.store.select(
            "table_wide", where="index > self.start_wide and index < self.stop_wide"
        )

    def time_query_store_table(self):
        self.store.select("table", where="index > self.start and index < self.stop")

    def time_store_repr(self):
        repr(self.store)

    def time_store_str(self):
        str(self.store)

    def time_store_info(self):
        self.store.info()
Example #10
#!/usr/bin/env python

import pandas as pd
from pandas import HDFStore, bdate_range
from pandas.tseries.offsets import BDay

crsp = HDFStore('/home/chad/WrdsData/hdf/crsp/crsp.h5')
famafrench = HDFStore('/home/chad/WrdsData/hdf/famafrench/famafrench.h5')
DAILY_FACTORS = famafrench.select('/famafrench/factors_daily')


class Event(object):
    def __init__(self,
                 id,
                 evt_date,
                 gap=5,
                 est_period=252,
                 frequency='B',
                 evt_start=-2,
                 evt_end=2):
        self._id = id
        self.evt_date = pd.to_datetime(evt_date)
        self.frequency = frequency
        self._has_data = self._has_models = False
        self.evt_window = bdate_range(start=evt_date - BDay(abs(evt_start)),
                                      end=evt_date + BDay(evt_end))
        self.est_period = bdate_range(end=evt_date -
                                      BDay(abs(evt_start - gap)),
                                      periods=est_period)

    def run_study(self):
Example #11
def glm_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    # +-
    test_set_stores = data.select_column('test', 'Store').unique()
    ##
    if from_saved_model:
        if from_saved_model is True:
            glm = get_saved_glm_model(model_save_dir)
        else:
            glm = get_saved_glm_model(from_saved_model)

    else:

        ##
        logger.info("Dropping store data before changepoint.")
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.debug("Log transform on sales data")
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        select_idx = remove_outliers_lm(data, select_idx, log_lm_features,
                                        test_set_stores)
        logger.info("Removed outliers, reduced shape {0}".format(
            len(select_idx)))

        ##
        logger.info("Running glm training")
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=linear_features)
        y = DataFromHDF(data_store=data,
                        key='train_logsales',
                        select_idx=select_idx,
                        column='Sales')
        glm = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        glm.fit(X, y)

        ##
        if model_save_dir:
            glm.save_model(model_save_dir)

    ##
    logger.info("glm predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=linear_features)
    glm_output = DataFromHDF(data_store=output,
                             key='test/glm',
                             data_columns=True)
    preds = glm.predict(X)
    glm_output.put(preds)

    ##
    if predict_train:
        logger.info("glm predictions on training set")
        X = DataFromHDF(data_store=data, key='train', columns=linear_features)
        glm_output = DataFromHDF(data_store=output,
                                 key='train/glm',
                                 data_columns=True)
        preds = glm.predict(X)
        glm_output.put(preds)
Example #12
def xgb_predictions(data: pd.HDFStore,
                    output: pd.HDFStore,
                    model_save_dir=None,
                    predict_train=True,
                    from_saved_model=False):
    # +-
    ##
    # noinspection PyUnusedLocal
    test_set_stores = data.select_column('test', 'Store').unique()

    if from_saved_model:
        if from_saved_model is True:
            xgb = get_saved_xgb_model(model_save_dir)
        else:
            xgb = get_saved_xgb_model(from_saved_model)

    else:

        logger.info("Dropping store data before changepoint.")
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.debug("Log transform on sales data")
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))

        ##
        logger.info("Running xgboost training")
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=xgb_features)
        y = DataFromHDF(data_store=data,
                        key='train_logsales',
                        select_idx=select_idx,
                        column='Sales')
        xgb = XGBPredictions(eval_function=xgb_expm1_rmspe,
                             params=xparams,
                             nrounds=3000)
        xgb.fit(X, y)

        ##
        if model_save_dir:
            xgb.save_model(model_save_dir)

    ##
    logger.info("xgboost predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=xgb_features)
    xgb_output = DataFromHDF(data_store=output,
                             key='test/xgb',
                             data_columns=True)
    preds = xgb.predict(X)
    xgb_output.put(preds)

    ##
    if predict_train:
        logger.info("xgboost predictions on training set")
        xgb_output = DataFromHDF(data_store=output,
                                 key='train/xgb',
                                 data_columns=True)
        select_idx = data.select_as_coordinates('train',
                                                'Store in test_set_stores')
        X = DataFromHDF(data_store=data,
                        key='train',
                        select_idx=select_idx,
                        columns=xgb_features)
        predict_in_chunks(xgb, X, xgb_output)
Example #13
    store.put('df', df, data_columns=True, format='table')
    print df

    store.close()
    # store['df']  # load it
    #
    # Read hdf5 by chunks
    # https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c
    # https://stackoverflow.com/questions/40348945/reading-data-by-chunking-with-hdf5-and-pandas
    rd_store = HDFStore(fn)
    #
    # df = pd.DataFrame(columns=columns)
    # chunksize = 4096
    # # %%timeit
    # # for chunk in pd.read_hdf(fn, 'df', chunksize=chunksize, where='h_m < 5.3'):
    # #     df = pd.concat([df, chunk], ignore_index=True)
    #
    # # sel by time
    # # https://stackoverflow.com/questions/25681308/pandas-read-hdf-query-by-date-and-time-range
    # # Maybe it would be better not to store it as a table when selecting by time
    c = rd_store.select_column('df', 'timeticket')

    where = pd.DatetimeIndex(c).indexer_between_time('12:00', '16:56')
    #
    resp = rd_store.select('df', where=where)
    # print resp.info()
    print resp

    # Another selection
    # https://stackoverflow.com/questions/20502996/use-or-in-hdfstore-select-pandas