Example #1
 def to_hdf5(self, fname, complevel=9, complib='bzip2'):
     if os.path.exists(fname):
         logger.warning('Overwrite %s with current history', fname)
     history_store = HDFStore(fname, mode='w', complevel=complevel, complib=complib)
     for attribute in self._store_attributes:
         history_store[attribute] = getattr(self, attribute)
     history_store.close()
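For context, a minimal loader sketch for the file written above; it assumes the object exposes the same _store_attributes list the writer used (from_hdf5 is a hypothetical name, not from the original project):

from pandas import HDFStore

def from_hdf5(obj, fname):
    # Read back every key that to_hdf5() wrote, assuming
    # obj._store_attributes lists the same attribute names.
    history_store = HDFStore(fname, mode='r')
    try:
        for attribute in obj._store_attributes:
            setattr(obj, attribute, history_store[attribute])
    finally:
        history_store.close()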
Example #2
    def test_append(self):
        pth = '__test_append__.h5'

        try:
            store = HDFStore(pth)

            df = tm.makeTimeDataFrame()
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            store.put('df2', df[:10], table=True)
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            wp = tm.makePanel()
            store.append('wp1', wp.ix[:,:10,:])
            store.append('wp1', wp.ix[:,10:,:])
            tm.assert_panel_equal(store['wp1'], wp)

        except:
            raise
        finally:
            store.close()
            os.remove(pth)
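Note: table=True is the old pandas 0.x spelling; current pandas expresses the same put/append round trip with format='table'. A minimal sketch of the modern equivalent:

import pandas as pd

df = pd.DataFrame({'a': range(20)})
with pd.HDFStore('__append_demo__.h5', mode='w') as store:
    store.put('df', df[:10], format='table')  # table format supports appends
    store.append('df', df[10:])
    result = store['df']  # all 20 rows come back as one frame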
Example #3
def quantitative_analysis(df_name, df_seq_col, df_quant_col, func=lambda x: x):
    print "Quantitative analysis of ", df_name

    store = HDFStore('_data_/ProteinDataStore.h5')
    summary = store['DataBases_Summary']
    df = store[df_name]
    df = df[[df_seq_col, df_quant_col]]
    renamed_col = '_'.join(df_quant_col.split(' '))
    print "Filling column ", renamed_col
    summary[renamed_col] = ['.'] * len(summary)
    print "Current summary shape: ", summary.shape

    seq_list = map(lambda x: re.sub(r'[^A-Z]', '', x), df[df_seq_col].values)
    for i in zip(seq_list, df[df_quant_col].values):
        query = np.where(summary['GlyGly Probabilities'] == i[0])[0]
        if len(query) != 0:
            index = query[0]
        else:
            print "Omitted data: ", i
            continue

        if not np.isnan(i[1]):
            try:
                tmp = func(i[1])
                summary.loc[index, renamed_col] = tmp
            except Exception as e:
                print i
                print e.message
        else:
            summary.loc[index, renamed_col] = '.'

    store['DataBases_Summary'] = summary
    store.close()
Example #4
def update_exchanges():
    """
    Updates data for exchanges such as NYSE

    """

    ####### LOAD DATE RANGES AND SYMBOLS
    start_date = Config(CFG).get('Exchange Data Start Date', 'default_start_date')
    end_date = datetime.datetime.now().strftime('%Y-%m-%d')
    symbols = [Config(CFG).get('Symbol List', 'list')]
    ####### BACKUP and UPDATE DB
    filename = Config(CFG).get("DB Locations", 'exchange_data')
    backup = Config(CFG).get("DB Locations", 'exchange_data_backup')
    file_update_backup(filename, backup)
    ####### START HDF5 INSTANCE
    operator = HDFStore(filename)

    for symbol in symbols:

        ####### PULL YAHOO FINANCE DATA
        data = get_daily_history(symbol, start_date, end_date)
        ####### PULL ADVANCES/DECLINES DATA
        data = data.merge(update_unicorn(symbol), left_index=True, right_index=True, how='outer')
        ####### SAVE DATA TO HDF5
        operator[symbol] = data

    operator.close()
Example #5
def hdf():
    df = ts.get_hist_data('000875')
    #     df.to_hdf('c:/day/store.h5','table')

    store = HDFStore('c:/day/store.h5')
    store['000875'] = df
    store.close()
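Reading the frame back is symmetric; a short sketch against the same path and key:

from pandas import HDFStore

store = HDFStore('c:/day/store.h5')
df = store['000875']  # fetch the frame stored under this key
store.close()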
Example #6
def drop_with_low_probability(storename, df_name, loc_probability_colname, threshold=0.95):
    print 'Filtering by low probability in', df_name
    store = HDFStore(storename)
    df = store[df_name]
    if loc_probability_colname is not None:
        df = df[df[loc_probability_colname] >= threshold]
    store[df_name] = df
    store.close()
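A hypothetical call; the store path matches the other examples in this collection, but the table key and column name below are illustrative only:

# Drop every row whose localization probability is below 0.95.
drop_with_low_probability('_data_/ProteinDataStore.h5',
                          'Some_Proteomics_Table',      # hypothetical key
                          'Localization Probability',   # hypothetical column
                          threshold=0.95)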
Example #7
 def test_legacy_table_read(self):
     # legacy table types
     pth = curpath()
     store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
     store.select('df1')
     store.select('df2')
     store.select('wp1')
     store.close()
Example #8
def reindex_summary():
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    range_index = [x for x in np.arange(len(data_summary))]
    print "Reindexing..."
    data_summary = data_summary.set_index([range_index])
    store['DataBases_Summary'] = data_summary
    store.close()
Example #9
 def test_legacy_read(self):
     pth = curpath()
     store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
     store['a']
     store['b']
     store['c']
     store['d']
     store.close()
Example #10
 def _check_roundtrip(self, obj, comparator):
     store = HDFStore(self.scratchpath, 'w')
     try:
         store['obj'] = obj
         retrieved = store['obj']
         comparator(retrieved, obj)
     finally:
         store.close()
         os.remove(self.scratchpath)
Example #11
def colorful_dump_summary_to_excel(output_filename, range_label='L1:U36229'):
    # < -2 dark green
    # -2 to -1 light green
    # -1 to  1 yellow
    # 1 to 2 Orange
    # > 2 red
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    writer = ExcelWriter(output_filename + '.xlsx', engine='xlsxwriter')
    data_summary.to_excel(writer, 'DataBases_Summary', index=True)

    workbook = writer.book
    worksheet = writer.sheets['DataBases_Summary']

    # using palette http://www.colourlovers.com/palette/3687876/
    blue = workbook.add_format({'bg_color': '#69D2E7', 'font_color': '#000000'})
    coral = workbook.add_format({'bg_color': '#A7DBD8', 'font_color': '#000000'})
    yellow = workbook.add_format({'bg_color': '#EAE319', 'font_color': '#000000'})
    orange = workbook.add_format({'bg_color': '#FA6900', 'font_color': '#000000'})
    red = workbook.add_format({'bg_color': '#E2434B', 'font_color': '#000000'})
    # empty = workbook.add_format({'bg_color': '#FFFFFF', 'font_color': '#000000'})
    #
    # worksheet.conditional_format(range_label, {'type': 'text',
    #                                            'criteria': 'begins with',
    #                                            'value': '.',
    #                                            'format': empty})

    worksheet.conditional_format(range_label, {'type': 'cell',
                                               'criteria': '<',
                                               'value': -2,
                                               'format': blue})

    worksheet.conditional_format(range_label, {'type': 'cell',
                                               'criteria': 'between',
                                               'minimum': -2,
                                               'maximum': -1,
                                               'format': coral})

    worksheet.conditional_format(range_label, {'type': 'cell',
                                               'criteria': 'between',
                                               'minimum': -1,
                                               'maximum': 1,
                                               'format': yellow})

    worksheet.conditional_format(range_label, {'type': 'cell',
                                               'criteria': 'between',
                                               'minimum': 1,
                                               'maximum': 2,
                                               'format': orange})

    worksheet.conditional_format(range_label, {'type': 'cell',
                                               'criteria': '>',
                                               'value': 2,
                                               'format': red})
    writer.save()
    store.close()
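Note: ExcelWriter.save() has been removed in recent pandas (2.0+); the modern idiom is a context manager, which closes the workbook automatically. A minimal sketch with a stand-in frame:

import pandas as pd

data_summary = pd.DataFrame({'score': [-3.0, 0.5, 2.5]})  # stand-in frame
with pd.ExcelWriter('summary.xlsx', engine='xlsxwriter') as writer:
    data_summary.to_excel(writer, sheet_name='DataBases_Summary', index=True)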
Example #12
 def _check_roundtrip_table(self, obj, comparator):
     store = HDFStore(self.scratchpath, 'w')
     try:
         store.put('obj', obj, table=True)
         retrieved = store['obj']
         sorted_obj = _test_sort(obj)
         comparator(retrieved, sorted_obj)
     finally:
         store.close()
         os.remove(self.scratchpath)
Example #13
def parse_one_and_save(input_file, output_store_name):
    sheet_name = 'All sites'
    skip_rows = [0]
    store = HDFStore(output_store_name)
    df = pd.ExcelFile(input_file).parse(sheetname=sheet_name,
                                        skiprows=skip_rows)
    name = (input_file.split('/')[1]).split('.')[0]
    print "Parsing ", name
    store[name] = df
    store.close()
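Note: the sheetname keyword is the pre-0.21 pandas spelling; current pandas uses sheet_name, and pd.read_excel does the whole read in one call. A sketch (the path is hypothetical):

import pandas as pd

df = pd.read_excel('data/input.xlsx', sheet_name='All sites', skiprows=[0])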
Example #14
 def load(self, format='csv'):
     savefile = self.__savefile()
     if format == "csv":
         self.frame.from_csv(savefile + ".csv")
     elif format == "hdf":
         store = HDFStore(savefile + ".hdf")
         try:
             self.frame = store['data']
         finally:
             store.close()
Example #15
def load_exchange_data(symbol):
    """
    Returns data for a specific exchange

    """
    filename = Config(CFG).get("DB Locations", 'exchange_data')
    operator = HDFStore(filename)
    data = operator[symbol]
    operator.close()
 
    return data
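For a one-off read like this, pandas.read_hdf opens the file, fetches the key, and closes the handle itself. A one-line sketch with hypothetical names:

import pandas as pd

data = pd.read_hdf('exchange_data.h5', 'NYSE')  # hypothetical file and key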
Example #16
def parse_list_and_save(list_of_files, output_store_name):
    sheet_name = 'All sites'
    skip_rows = [0]
    store = HDFStore(output_store_name)
    for _file_ in list_of_files:
        df = pd.ExcelFile(_file_).parse(sheetname=sheet_name,
                                        skiprows=skip_rows)
        name = (_file_.split('/')[2]).split('.')[0]
        print "Parsing ", name
        store[name] = df
    store.close()
Example #17
 def test_store_index_name(self):
     df = tm.makeDataFrame()
     df.index.name = 'foo'
     try:
         store = HDFStore(self.scratchpath)
         store['frame'] = df
         recons = store['frame']
         assert(recons.index.name == 'foo')
     finally:
         store.close()
         os.remove(self.scratchpath)
Example #18
    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']

        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert(recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #19
 def test_wide_table_dups(self):
     wp = tm.makePanel()
     try:
         store = HDFStore(self.scratchpath)
         store._quiet = True
         store.put('panel', wp, table=True)
         store.put('panel', wp, table=True, append=True)
         recons = store['panel']
         tm.assert_panel_equal(recons, wp)
     finally:
         store.close()
         os.remove(self.scratchpath)
Example #20
    def test_legacy_table_write(self):
        # legacy table types
        pth = curpath()
        df = tm.makeDataFrame()
        wp = tm.makePanel()

        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')

        self.assertRaises(Exception, store.append, 'df1', df)
        self.assertRaises(Exception, store.append, 'wp1', wp)

        store.close()
Example #21
 def test_timezones(self):
     rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
     frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
     try:
         store = HDFStore(self.scratchpath)
         store['frame'] = frame
         recons = store['frame']
         self.assert_(recons.index.equals(rng))
         self.assertEquals(rng.tz, recons.index.tz)
     finally:
         store.close()
         os.remove(self.scratchpath)
Example #22
    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #23
File: __init__.py  Project: exedre/e4t.new
 def hload(self,fname):
     from pandas.io.pytables import HDFStore
     store = HDFStore(fname,mode='r')
     self.clear()
     read = []
     for k in store.keys():
         if re.match('^_MISSING',k):
             v = store.get(k).to_dict().values()
             self._missing = v
             continue
         name = re.sub('^/','',k)
         self[name]=store[k]
         read.append(name)
     store.close()
Example #24
    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #25
File: __init__.py  Project: exedre/e4t.new
 def hsave(self,fname):
     from pandas.io.pytables import HDFStore
     LOGGER.debug('Saving HDF in %s\n%s',fname,self.report())
     store = HDFStore(fname,mode='w')
     for k,v in self.items():
         if re.match('^__',k):
             continue
         if isinstance(v,np.ndarray):
             v = Series(v)
         LOGGER.debug('Saving HDF for %s',k)
         store.put(k,v)
     if self._missing:
         store['_MISSING']=Series(self._missing)
     store.close()
Example #26
    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #27
def load_historical_data(start=datetime(2010, 1, 1), end=datetime.today(), filename='stock_data.h5'):
    store = HDFStore(filename)

    with open('companylist.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in spamreader:
            print row[0]
            try:
                stock_info = web.DataReader(row[0], "yahoo", start, end)
                store[row[0]] = stock_info
            except:
                print "Error on", row[0]

    store.close()
Example #28
    def hdfWrite(self, path, excode, symbol, indata, kind1, kind2, kind3):
        # kind1 is 'Rawdata', 'Stitch', or 'Indicator'
        # kind2 is '00' or '01'
        # kind3 is '1d', '60m', '30m', '15m', '5m', or '1m'
        # Write Rawdata at a given frequency: kind1='Rawdata', kind2=None, kind3='1d'
        # Write StitchRule:                   kind1='Stitch', kind2='00', kind3=None
        # Write StitchData:                   kind1='Stitch', kind2='00', kind3='1d'
        # Write an Indicator:                 kind1='Indicator', kind2='Indicator_name', kind3='params'
        store = HDFStore(path, mode='a')
        if kind1 == EXT_Rawdata:
            key = kind1 + '/' + excode + '/' + symbol + '/' + kind3
        elif kind1 == EXT_Stitch:
            key = (kind1 + '/' + excode + '/' + symbol + '/' + EXT_Rule + '/' + kind2
                   if kind3 is None else
                   kind1 + '/' + excode + '/' + symbol + '/' + EXT_Period + '/' + kind3 + '/' + kind2)
        elif kind1 == EXT_Indicator:
            key = kind1 + '/' + excode + '/' + symbol + '/' + kind2
        else:
            print("kind not supported")
            return

        if kind1 == EXT_Indicator:
            f = h5py.File(path, 'a')
            try:
                store[key]
            except KeyError:  # create when the key does not exist
                store[key] = indata
                f[key].attrs['Params'] = kind3
            else:
                if f[key].attrs['Params'] == kind3:  # merge when Params match
                    adddata = indata[~indata.index.isin(store[key].index)]
                    store.append(key, adddata)
                else:  # overwrite when Params do not match
                    store[key] = indata
                    f[key].attrs['Params'] = kind3
            f.close()
            store.close()
        else:
            try:
                store[key]
            except KeyError:  # key does not exist yet: create it
                store[key] = indata
            else:  # key exists: append only the rows that are not stored yet
                adddata = indata[~indata.index.isin(store[key].index)]
                if kind2 in [EXT_Series_00, EXT_Series_01]:
                    # chain the adjustment factor so the stitched series stays continuous
                    adddata[EXT_Out_AdjFactor] = adddata[
                        EXT_Out_AdjFactor] * store[key][EXT_Out_AdjFactor].iloc[
                            -1] / adddata[EXT_Out_AdjFactor].iloc[0]
                store.append(key, adddata)
            store.close()
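To make the key scheme above concrete, a small sketch of the paths it builds, assuming the EXT_* constants are the plain strings named in the comments and using placeholder exchange, symbol, and indicator names (both are assumptions; the real values live elsewhere in this project):

# Hypothetical constant values mirroring the comments above.
EXT_Rawdata, EXT_Stitch, EXT_Indicator = 'Rawdata', 'Stitch', 'Indicator'
EXT_Rule, EXT_Period = 'Rule', 'Period'

key_raw = '/'.join([EXT_Rawdata, 'SHFE', 'cu', '1d'])    # 'Rawdata/SHFE/cu/1d'
key_stitch = '/'.join([EXT_Stitch, 'SHFE', 'cu',
                       EXT_Period, '1d', '00'])          # 'Stitch/SHFE/cu/Period/1d/00'
key_ind = '/'.join([EXT_Indicator, 'SHFE', 'cu', 'MA'])  # 'Indicator/SHFE/cu/MA'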
Example #29
def read_archive(hdf_path, items=['train_x', 'valid_x', 'test_x', 'train_y', 'valid_y', 'test_y']):
	'''
	Convenience function for retrieving data from an HDF archive.

	Args:
		hdf_path (str):
			full path of the file the data is stored in

		items (list, optional):
			items to be retrieved
			default: ['train_x', 'valid_x', 'test_x', 'train_y', 'valid_y', 'test_y']
	'''
	hdf = HDFStore(hdf_path)
	output = map(lambda x: hdf[x], items)
	hdf.close()
	return output
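One portability note: under Python 3, map is lazy, so the frames would only be read after hdf.close() and the lookups would fail. A sketch (read_archive_py3 is an illustrative name) that materializes the data before closing:

from pandas import HDFStore

def read_archive_py3(hdf_path, items=('train_x', 'train_y')):
    # A list comprehension is eager, so closing the store afterwards is safe.
    hdf = HDFStore(hdf_path)
    try:
        return [hdf[item] for item in items]
    finally:
        hdf.close()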
Example #30
def make_summary(newcols):
    """

    :param newcols: column names in the main summary table
    :return: none
    """
    print "Making summary..."

    # open store and read base dataframe
    store = HDFStore('_data_/ProteinDataStore.h5')
    df1 = store['Mol_Cell_Proteomics_2011_Epub_2011_September1Supp2']

    # clean sequences
    LEN = len(df1)
    positions = [0] * LEN
    real_glygly = [0] * LEN
    clean_glygly = [0] * LEN
    for i in np.arange(LEN):
        positions[i] = df1['Position'].values[i]
        real_glygly[i] = clear_sequence(df1['GlyGly (K) Probabilities'].values[i])
        clean_glygly[i] = re.sub(r'[^A-Z]', '', real_glygly[i])

    # align with SwissProt Human and Rodents using blastp
    blastpID_HUMAN, blastpID_RODENTS = fetch_indentity_from_local_batch(clean_glygly)

    del df1
    print "Length test", len(positions) == len(real_glygly) == len(clean_glygly) == len(blastpID_HUMAN) == len(
        blastpID_RODENTS)

    # convert to pandas series
    clean_glygly = pd.Series(clean_glygly)
    blastpID_HUMAN = pd.Series(blastpID_HUMAN)
    blastpID_RODENTS = pd.Series(blastpID_RODENTS)

    # Create empty dataframe
    data_summary = pd.DataFrame(columns=newcols)

    # Combine everything required in dataframe
    data_summary['Position'] = positions
    data_summary['GlyGly (K) Probabilities'] = real_glygly
    data_summary['GlyGly Probabilities'] = clean_glygly
    data_summary['SP_ID_BLASTP_HUMAN'] = blastpID_HUMAN
    data_summary['SP_ID_BLASTP_RODENTS'] = blastpID_RODENTS

    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
Example #31
    def _check_double_roundtrip(self, obj, comparator, compression=False,
                                **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #32
 def hdfRead(self,
             path,
             excode,
             symbol,
             kind1,
             kind2,
             kind3,
             startdate=EXT_Start,
             enddate=EXT_End,
             is_stitch=True):
     # kind1 is 'Rawdata', 'Stitch', or 'Indicator'
     # kind2 is '00' or '01'
     # kind3 is '1d', '60m', '30m', '15m', '5m', or '1m'
     # Read Rawdata at a given frequency: kind1='Rawdata', kind2=None, kind3='1d'
     # Read StitchRule:                   kind1='Stitch', kind2='00', kind3=None
     # Read StitchData:                   kind1='Stitch', kind2='00', kind3='1d'
     # Read an Indicator:                 kind1='Indicator', kind2='Indicator_name', kind3=None
     store = HDFStore(path, mode='r')
     if kind1 == EXT_Rawdata:
         key = '/'.join([kind1, excode, symbol, kind3])
     elif kind1 == EXT_Stitch:
         key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2
                         ]) if kind3 is None else '/'.join([
                             kind1, excode, symbol, EXT_Period, kind3, kind2
                         ])
     elif kind1 == EXT_Indicator:
         key = '/'.join([kind1, excode, symbol, kind2])
     else:
         print("kind not supported")
         return
     data = store[key].ix[(
         (store[key].index.get_level_values(0) >= pd.to_datetime(startdate))
         & (store[key].index.get_level_values(0) <= pd.to_datetime(enddate))
     ), :]
     if kind1 == EXT_Stitch and is_stitch and kind3 is not None:
         data[EXT_Bar_Open] = data[EXT_AdjFactor] * data[EXT_Bar_Open]
         data[EXT_Bar_High] = data[EXT_AdjFactor] * data[EXT_Bar_High]
         data[EXT_Bar_Low] = data[EXT_AdjFactor] * data[EXT_Bar_Low]
         data[EXT_Bar_Close] = data[EXT_AdjFactor] * data[EXT_Bar_Close]
     store.close()
     if kind1 == EXT_Indicator:
         f = h5py.File(path, 'r')
         params = f[key].attrs['Params']
         f.close()
         return data, params
     return data
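.ix was removed in pandas 1.0; today the date-range filter above is written with a boolean mask and .loc. A sketch (filter_by_date is an illustrative name), assuming datetimes in index level 0:

import pandas as pd

def filter_by_date(frame, startdate, enddate):
    # Boolean mask over index level 0, the modern replacement for .ix slicing.
    level0 = frame.index.get_level_values(0)
    mask = ((level0 >= pd.to_datetime(startdate)) &
            (level0 <= pd.to_datetime(enddate)))
    return frame.loc[mask]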
Example #33
    def test_big_table(self):
        raise nose.SkipTest('no big table')

        # create and write a big table
        wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ],
                   major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ])

        wp.ix[:,100:200,300:400] = np.nan

        try:
            store = HDFStore(self.scratchpath)
            store._debug_memory = True
            store.append('wp',wp)
            recons = store.select('wp')
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #34
    def test_append(self):
        pth = '__test_append__.h5'

        try:
            store = HDFStore(pth)

            df = tm.makeTimeDataFrame()
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            store.put('df2', df[:10], table=True)
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            # this is allowed, but you almost always don't want to do it
            import warnings
            import tables
            warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
            store.append('/df3 foo', df[:10])
            store.append('/df3 foo', df[10:])
            tm.assert_frame_equal(store['df3 foo'], df)
            warnings.filterwarnings('always', category=tables.NaturalNameWarning)

            # panel
            wp = tm.makePanel()
            store.append('wp1', wp.ix[:,:10,:])
            store.append('wp1', wp.ix[:,10:,:])
            tm.assert_panel_equal(store['wp1'], wp)

            # ndim
            p4d = tm.makePanel4D()
            store.append('p4d', p4d.ix[:,:,:10,:])
            store.append('p4d', p4d.ix[:,:,10:,:])
            tm.assert_panel4d_equal(store['p4d'], p4d)

        except:
            raise
        finally:
            store.close()
            os.remove(pth)
Example #35
    def test_legacy_table_read(self):
        # legacy table types
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
        store.select('df1')
        store.select('df2')
        store.select('wp1')

        # force the frame
        store.select('df2', typ='legacy_frame')

        # old version (this still throws an exception though)
        import warnings
        warnings.filterwarnings('ignore', category=IncompatibilityWarning)
        self.assertRaises(Exception, store.select, 'wp1',
                          Term('minor_axis', '=', 'B'))
        warnings.filterwarnings('always', category=IncompatibilityWarning)

        store.close()
Example #36
    def test_big_table(self):
        raise nose.SkipTest('no big table')

        # create and write a big table
        wp = Panel(np.random.randn(20, 1000, 1000),
                   items=['Item%s' % i for i in xrange(20)],
                   major_axis=date_range('1/1/2000', periods=1000),
                   minor_axis=['E%s' % i for i in xrange(1000)])

        wp.ix[:, 100:200, 300:400] = np.nan

        try:
            store = HDFStore(self.scratchpath)
            store._debug_memory = True
            store.append('wp', wp)
            recons = store.select('wp')
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #37
def analyze_existence(storename_to_append, gly_gly_seq_colname):
    print "Analyzing occurence in ", storename_to_append

    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    tmp_store_sequences = store[storename_to_append][gly_gly_seq_colname].values
    tmp_store_sequences = map(clear_sequence, tmp_store_sequences)

    # Make a binary vector that marks whether each sequence
    # exists in the storename_to_append dataset
    existense_index = data_summary['GlyGly (K) Probabilities'].isin(tmp_store_sequences).values
    existense_index = np.asarray(existense_index, dtype=int)

    # Create new column in summary table
    data_summary[storename_to_append] = existense_index
    print np.sum(data_summary[storename_to_append])

    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
Example #38
    def test_append(self):
        pth = '__test_append__.h5'

        try:
            store = HDFStore(pth)

            df = tm.makeTimeDataFrame()
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            store.put('df2', df[:10], table=True)
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            # this is allowed, but you almost always don't want to do it
            import warnings
            import tables
            warnings.filterwarnings('ignore',
                                    category=tables.NaturalNameWarning)
            store.append('/df3 foo', df[:10])
            store.append('/df3 foo', df[10:])
            tm.assert_frame_equal(store['df3 foo'], df)
            warnings.filterwarnings('always',
                                    category=tables.NaturalNameWarning)

            wp = tm.makePanel()
            store.append('wp1', wp.ix[:, :10, :])
            store.append('wp1', wp.ix[:, 10:, :])
            tm.assert_panel_equal(store['wp1'], wp)

        except:
            raise
        finally:
            store.close()
            os.remove(pth)
Example #39
    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert(recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #40
 def hdfRead(self,
             path,
             excode,
             symbol,
             kind1,
             kind2,
             kind3,
             startdate=EXT_Start,
             enddate=EXT_End):
     # kind1 is 'Rawdata', 'Stitch', or 'Indicator'
     # kind2 is '00' or '01'
     # kind3 is '1d', '60m', '30m', '15m', '5m', or '1m'
     # Read Rawdata at a given frequency: kind1='Rawdata', kind2=None, kind3='1d'
     # Read StitchRule:                   kind1='Stitch', kind2='00', kind3=None
     # Read StitchData:                   kind1='Stitch', kind2='00', kind3='1d'
     # Read an Indicator:                 kind1='Indicator', kind2='Indicator_name', kind3=None
     store = HDFStore(path, mode='r')
     if kind1 == EXT_Rawdata:
         key = kind1 + '/' + excode + '/' + symbol + '/' + kind3
     elif kind1 == EXT_Stitch:
         key = (kind1 + '/' + excode + '/' + symbol + '/' + EXT_Rule + '/' + kind2
                if kind3 is None else
                kind1 + '/' + excode + '/' + symbol + '/' + EXT_Period + '/' + kind3 + '/' + kind2)
     elif kind1 == EXT_Indicator:
         key = kind1 + '/' + excode + '/' + symbol + '/' + kind2
     else:
         print("kind not supported")
         return
     data = store[key].ix[(
         (store[key].index.get_level_values(0) >= pd.to_datetime(startdate))
         & (store[key].index.get_level_values(0) <= pd.to_datetime(enddate))
     ), :]
     store.close()
     if kind1 == EXT_Indicator:
         f = h5py.File(path, 'r')
         params = f[key].attrs['Params']
         f.close()
         return data, params
     return data
Example #41
class TestHDFStore(unittest.TestCase):
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_factory_fun(self):
        try:
            with get_store(self.scratchpath) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass

        with get_store(self.scratchpath) as tbl:
            tbl['a'] = tm.makeDataFrame()

        with get_store(self.scratchpath) as tbl:
            self.assertEquals(len(tbl), 1)
            self.assertEquals(type(tbl['a']), DataFrame)

        os.remove(self.scratchpath)

    def test_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.assertEquals(len(self.store), 5)
        self.assert_(
            set(self.store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar'
                                           ]))

    def test_repr(self):
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.store.append('e', tm.makePanel())
        repr(self.store)
        str(self.store)

    def test_contains(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        self.store['foo/bar'] = tm.makeDataFrame()
        self.assert_('a' in self.store)
        self.assert_('b' in self.store)
        self.assert_('c' not in self.store)
        self.assert_('foo/bar' in self.store)
        self.assert_('/foo/bar' in self.store)
        self.assert_('/foo/b' not in self.store)
        self.assert_('bar' not in self.store)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)

        left = self.store.get('/a')
        right = self.store['/a']
        tm.assert_series_equal(left, right)

        self.assertRaises(KeyError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store['foo/bar/bah'] = df[:10]
        self.store['foo'] = df[:10]
        self.store['/foo'] = df[:10]
        self.store.put('c', df[:10], table=True)

        # not OK, not a table
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df[10:],
                          append=True)

        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError,
                          self.store.put,
                          'f',
                          df[10:],
                          append=True)

        # OK
        self.store.put('c', df[10:], append=True)

        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()

        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)

        # can't compress if table=False
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df,
                          table=False,
                          compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()

        # can't compress if table=False
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df,
                          table=False,
                          compression='blosc')

        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        pth = '__test_append__.h5'

        try:
            store = HDFStore(pth)

            df = tm.makeTimeDataFrame()
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)

            store.put('df2', df[:10], table=True)
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)

            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)

            # this is allowed, but you almost always don't want to do it
            import warnings
            import tables
            warnings.filterwarnings('ignore',
                                    category=tables.NaturalNameWarning)
            store.append('/df3 foo', df[:10])
            store.append('/df3 foo', df[10:])
            tm.assert_frame_equal(store['df3 foo'], df)
            warnings.filterwarnings('always',
                                    category=tables.NaturalNameWarning)

            wp = tm.makePanel()
            store.append('wp1', wp.ix[:, :10, :])
            store.append('wp1', wp.ix[:, 10:, :])
            tm.assert_panel_equal(store['wp1'], wp)

        except:
            raise
        finally:
            store.close()
            os.remove(pth)

    def test_append_with_strings(self):
        wp = tm.makePanel()
        wp2 = wp.rename_axis(dict([(x, "%s_extra" % x)
                                   for x in wp.minor_axis]),
                             axis=2)

        self.store.append('s1', wp, min_itemsize=20)
        self.store.append('s1', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s1'], expected)

        # test dict format
        self.store.append('s2', wp, min_itemsize={'column': 20})
        self.store.append('s2', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s2'], expected)

        # apply the wrong field (similar to #1)
        self.store.append('s3', wp, min_itemsize={'index': 20})
        self.assertRaises(Exception, self.store.append, 's3')

        # test truncation of bigger strings
        self.store.append('s4', wp)
        self.assertRaises(Exception, self.store.append, 's4', wp2)

    def test_create_table_index(self):
        wp = tm.makePanel()
        self.store.append('p5', wp)
        self.store.create_table_index('p5')

        assert (self.store.handle.root.p5.table.cols.index.is_indexed == True)
        assert (
            self.store.handle.root.p5.table.cols.column.is_indexed == False)

        df = tm.makeTimeDataFrame()
        self.store.append('f', df[:10])
        self.store.append('f', df[10:])
        self.store.create_table_index('f')

        # create twice
        self.store.create_table_index('f')

        # try to index a non-table
        self.store.put('f2', df)
        self.assertRaises(Exception, self.store.create_table_index, 'f2')

        # try to change the version supports flag
        from pandas.io import pytables
        pytables._table_supports_index = False
        self.assertRaises(Exception, self.store.create_table_index, 'f')

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2, append=True)

    def test_table_index_incompatible_dtypes(self):
        df1 = DataFrame({'a': [1, 2, 3]})
        df2 = DataFrame({'a': [4, 5, 6]},
                        index=date_range('1/1/2000', periods=3))

        self.store.put('frame', df1, table=True)
        self.assertRaises(Exception,
                          self.store.put,
                          'frame',
                          df2,
                          table=True,
                          append=True)

    def test_table_values_dtypes_roundtrip(self):
        df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
        self.store.append('df1', df1)
        assert df1.dtypes == self.store['df1'].dtypes

        df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
        self.store.append('df2', df2)
        assert df2.dtypes == self.store['df2'].dtypes

        # incompatible dtype
        self.assertRaises(Exception, self.store.append, 'df2', df1)

    def test_table_mixed_dtypes(self):

        # frame
        def _make_one_df():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['bool3'] = True
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one_df()

        self.store.append('df1_mixed', df1)
        tm.assert_frame_equal(self.store.select('df1_mixed'), df1)

        # panel
        def _make_one_panel():
            wp = tm.makePanel()
            wp['obj1'] = 'foo'
            wp['obj2'] = 'bar'
            wp['bool1'] = wp['ItemA'] > 0
            wp['bool2'] = wp['ItemB'] > 0
            wp['int1'] = 1
            wp['int2'] = 2
            return wp.consolidate()

        p1 = _make_one_panel()

        self.store.append('p1_mixed', p1)
        tm.assert_panel_equal(self.store.select('p1_mixed'), p1)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])

        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

        # pathing
        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('foo')
        self.store.remove('b/foo')
        self.assertEquals(len(self.store), 1)

        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('b')
        self.assertEquals(len(self.store), 1)

        # __delitem__
        self.store['a'] = ts
        self.store['b'] = df
        del self.store['a']
        del self.store['b']
        self.assertEquals(len(self.store), 0)

    def test_remove_where(self):

        # non-existence
        crit1 = Term('index', '>', 'foo')
        self.store.remove('a', where=[crit1])

        # try to remove non-table (with crit)
        # non-table ok (where = None)
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        self.store.remove('wp', [('column', ['A', 'D'])])
        rs = self.store.select('wp')
        expected = wp.reindex(minor_axis=['B', 'C'])
        tm.assert_panel_equal(rs, expected)

        # empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.store.remove('wp', [])

        # non - empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.assertRaises(Exception, self.store.remove, 'wp', ['foo'])

        # selecting a non-table with a where
        self.store.put('wp2', wp, table=False)
        self.assertRaises(Exception, self.store.remove, 'wp2',
                          [('column', ['A', 'D'])])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = Term('index', '>', date)
        crit2 = Term('column', ['A', 'D'])
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

        # test non-consecutive row removal
        wp = tm.makePanel()
        self.store.put('wp2', wp, table=True)

        date1 = wp.major_axis[1:3]
        date2 = wp.major_axis[5]
        date3 = [wp.major_axis[7], wp.major_axis[9]]

        crit1 = Term('index', date1)
        crit2 = Term('index', date2)
        crit3 = Term('index', date3)

        self.store.remove('wp2', where=[crit1])
        self.store.remove('wp2', where=[crit2])
        self.store.remove('wp2', where=[crit3])
        result = self.store['wp2']

        ma = list(wp.major_axis)
        for d in date1:
            ma.remove(d)
        ma.remove(date2)
        for d in date3:
            ma.remove(d)
        expected = wp.reindex(major=ma)
        tm.assert_panel_equal(result, expected)

    def test_terms(self):

        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)

        # some invalid terms
        terms = [
            ['minor', ['A', 'B']],
            ['index', ['20121114']],
            ['index', ['20121114', '20121114']],
        ]
        for t in terms:
            self.assertRaises(Exception, self.store.select, 'wp', t)

        self.assertRaises(Exception, Term.__init__)
        self.assertRaises(Exception, Term.__init__, 'blah')
        self.assertRaises(Exception, Term.__init__, 'index')
        self.assertRaises(Exception, Term.__init__, 'index', '==')
        self.assertRaises(Exception, Term.__init__, 'index', '>', 5)

        result = self.store.select(
            'wp',
            [Term('major_axis<20000108'),
             Term('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)

        # valid terms
        terms = [
            dict(field='index', op='>', value='20121114'),
            ('index', '20121114'),
            ('index', '>', '20121114'),
            (('index', ['20121114', '20121114']), ),
            ('index', datetime(2012, 11, 14)),
            'index>20121114',
            'major>20121114',
            'major_axis>20121114',
            (('minor', ['A', 'B']), ),
            (('minor_axis', ['A', 'B']), ),
            ((('minor_axis', ['A', 'B']), ), ),
            (('column', ['A', 'B']), ),
        ]

        for t in terms:
            self.store.select('wp', t)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

        ts2 = Series(ts.index, Index(ts.index, dtype=object))
        self._check_roundtrip(ts2, tm.assert_series_equal)

        ts3 = Series(ts.values,
                     Index(np.asarray(ts.index, dtype=object), dtype=object))
        self._check_roundtrip(ts3, tm.assert_series_equal)

    def test_sparse_series(self):
        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss,
                              tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2,
                              tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3,
                              tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        s = tm.makeDataFrame()
        s.ix[3:5, 1:3] = np.nan
        s.ix[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_double_roundtrip(ss,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_double_roundtrip(ss2,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_double_roundtrip(ss3,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

    def test_sparse_panel(self):
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
        sp = p.to_sparse()

        self._check_double_roundtrip(sp,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

        sp2 = p.to_sparse(kind='integer')
        self._check_double_roundtrip(sp2,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

        sp3 = p.to_sparse(fill_value=0)
        self._check_double_roundtrip(sp3,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0., 1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_index_types(self):
        values = np.random.randn(2)

        func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)

        ser = Series(values, [0, 'y'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime.today(), 0])
        self._check_roundtrip(ser, func)

        ser = Series(values, ['y', 0])
        self._check_roundtrip(ser, func)

        from datetime import date
        ser = Series(values, [date.today(), 'a'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1.23, 'b'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 1.53])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 5])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)])
        self._check_roundtrip(ser, func)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = bdate_range('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failure on some Windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df,
                                    tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal, compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self._check_roundtrip(df[:0], tm.assert_frame_equal)

    def test_empty_series_frame(self):
        s0 = Series()
        s1 = Series(name='myseries')
        df0 = DataFrame()
        df1 = DataFrame(index=['a', 'b', 'c'])
        df2 = DataFrame(columns=['d', 'e', 'f'])

        self._check_roundtrip(s0, tm.assert_series_equal)
        self._check_roundtrip(s1, tm.assert_series_equal)
        self._check_roundtrip(df0, tm.assert_frame_equal)
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_timezones(self):
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_fixed_offset_tz(self):
        rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3),
                          index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert (recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert (recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']

        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert (recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal, compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        # self._check_roundtrip(wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts

        tm.assert_series_equal(self.store['a'], ts)

    def test_select(self):
        wp = tm.makePanel()

        # put/select ok
        self.store.put('wp', wp, table=True)
        self.store.select('wp')

        # non-table ok (where = None)
        self.store.put('wp2', wp, table=False)
        self.store.select('wp2')

        # selecting a non-table with a where
        self.assertRaises(Exception, self.store.select, 'wp2',
                          ('column', ['A', 'D']))

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = ('index', '>=', date)
        crit2 = ('column', '=', ['A', 'D'])

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

        result = self.store.select(
            'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(before='20000124').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = ('index', '>=', date)
        crit2 = ('column', ['A', 'D'])
        crit3 = ('column', 'A')

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select, 'frame',
                          [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = Term('column', df.columns[:75])
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_double_roundtrip(self,
                                obj,
                                comparator,
                                compression=False,
                                **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            # sorted_obj = _test_sort(obj)
            comparator(retrieved, obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_legacy_table_read(self):
        # legacy table types
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
        store.select('df1')
        store.select('df2')
        store.select('wp1')
        store.close()

    def test_legacy_table_write(self):
        # legacy table types
        pth = curpath()
        df = tm.makeDataFrame()
        wp = tm.makePanel()

        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')

        self.assertRaises(Exception, store.append, 'df1', df)
        self.assertRaises(Exception, store.append, 'wp1', wp)

        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)

    def test_tseries_indices_series(self):
        idx = tm.makeDateIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

        idx = tm.makePeriodIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

    def test_tseries_indices_frame(self):
        idx = tm.makeDateIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

        idx = tm.makePeriodIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

    def test_unicode_index(self):
        unicode_values = [u'\u03c3', u'\u03c3\u03c3']

        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_store_datetime_mixed(self):
        df = DataFrame({
            'a': [1, 2, 3],
            'b': [1., 2., 3.],
            'c': ['a', 'b', 'c']
        })
        ts = tm.makeTimeSeries()
        df['d'] = ts.index[:3]
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_cant_write_multiindex_table(self):
        # for now, #1848
        df = DataFrame(
            np.random.randn(10, 4),
            index=[np.arange(5).repeat(2),
                   np.tile(np.arange(2), 5)])

        self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
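
For reference, a standalone sketch of the put/select round trip these tests exercise, written against the current pandas API (format='table' replaces the old table=True flag); the file name is illustrative.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'),
                  index=pd.date_range('2000-01-01', periods=10))

with pd.HDFStore('select_demo.h5', mode='w') as store:
    store.put('df', df, format='table')   # table layout supports where-queries
    subset = store.select('df', where="index >= '2000-01-05'",
                          columns=['A', 'D'])
    print(subset)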
Example #42
def del_col_from_summary(colname):
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    data_summary = data_summary.drop(colname, axis=1)
    store['DataBases_Summary'] = data_summary
    store.close()
Example #43
def filter_columns(storename, dataframe, collist):
    store = HDFStore(storename)
    df = store[dataframe]
    df = df[collist]
    store[dataframe] = df
    store.close()
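
A minimal usage sketch for the two helpers above; the store path, dataframe key and column names are illustrative and must match your own HDF5 layout.

del_col_from_summary('Intensity_1')
filter_columns('_data_/ProteinDataStore.h5', 'DataBases_Summary',
               ['GlyGly Probabilities', 'Intensity_2'])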
Example #44
class Importer:
    """Main Class for importing mat files.

    Data will be stored in the samples and targets of the ds dictionary
    attribute and can be loaded or saved from and to a compressed hdf5 file.

    Attributes:
        dataroot: directory that contains the mat files
        ds: dotdict (attribute-accessible dictionary) containing:
            samples: a pandas dataframe with a MultiIndex composed of the session data
            targets: a pandas dataframe with the targets/labels
    """
    def __init__(self, dataroot):
        self.dataroot = dataroot
        self.ds = None
        self.store = None
        self.importpath = path.join(self.dataroot, 'imported')

    def __append(self, session):
        """append session to current object"""
        if (self.ds.samples.columns.get_level_values('channel') !=
                session.ds.samples.columns.get_level_values('channel')).any():
            print(self.ds.samples.columns.get_level_values('channel'))
            print(session.ds.samples.columns.get_level_values('channel'))
            raise InconsistentElectrodes(
                'electrode labels do not match when merging datasets')

        self.ds.samples = self.ds.samples.append(session.ds.samples,
                                                 verify_integrity=True)
        self.ds.targets = self.ds.targets.append(session.ds.targets,
                                                 verify_integrity=True)

    def __sort(self):
        """MultiIndex Slicing operations require we sort all indices"""
        self.ds.samples.sort_index(level='channel', axis=1, inplace=True)
        #self.ds.samples.sort_index(axis=1, inplace=True)

    def get_session(self, subject, sessionid):
        """Add a single trial as target/samples pair from a mat file.

        Args:
            param1: (string): subject ID
            param2: (string): session ID

        Returns:
            A Session object containing samples and target data.
        """

        trialpath = glob(
            path.join(self.dataroot, subject + '-' + sessionid + '-*.mat'))
        if not trialpath or not path.exists(trialpath[0]):
            raise FileNotFoundError(
                "no file for subject '{0}' and trial '{1}'".format(
                    subject, sessionid))

        session = io.loadmat(trialpath[0])
        """
        >> session
        session =
            data: [1x1 struct]
        """

        data = session['data'][0, 0]
        """
        >> session.data
        ans =
                    label: {64x1 cell}
                     time: {1x638 cell}
                    trial: {1x638 cell}
                     elec: [1x1 struct]
                      cfg: [1x1 struct]
                  TrlInfo: {638x16 cell}
            TrlInfoLabels: {16x1 cell}
        """
        channels = data[0][:, 0]  # label
        channels = np.array(channels.tolist()).flatten()  # unify dtype
        samples = data[2][0, :]  # trial
        samples = np.array([x[:].flatten() for x in samples], dtype='float32')
        cfg = data[4][0][0]
        trlinfo = data[5][:, :]
        trlinfolabels = data[6][:, 0]
        """
        >> session.data.cfg
        ans =
                       method: 'spline'
                   badchannel: {2x1 cell}
                       trials: 'all'
                       lambda: 1.0000e-05
                        order: 4
                         elec: [1x1 struct]
            outputfilepresent: 'overwrite'
                     callinfo: [1x1 struct]
                      version: [1x1 struct]
                  trackconfig: 'off'
                  checkconfig: 'loose'
                    checksize: 100000
                 showcallinfo: 'yes'
                        debug: 'no'
                trackcallinfo: 'yes'
                trackdatainfo: 'no'
               missingchannel: {0x1 cell}
                     previous: [1x1 struct]
        """
        try:
            badchannels = cfg[1][0, 0]
        except IndexError:
            badchannels = []
        """
        >> session.data.TrlInfoLabels
        ans =
            'time stamp original (EEG)'
            'time stamp new (EEG)'
            'task'
            'data part #'
            'trial #'
            'stimulus type'
            'EEG trigger'
            'encoding digit #'
            'time stamp original (E-Prime)'
            'set size'
            'probe type'
            'response'
            'ACC'
            'RT'
            'digit/probe presented'
            'probe position'
        """
        trials = trlinfo[:, 4].astype('uint8')
        digits = trlinfo[:, 14].astype('uint8')

        return Session(subject, sessionid, samples, digits, trials, channels)

    def add_session(self, subject, sessionid):
        """Concatenate a single Session to the current importer instance
        implicitly using __append().

        Args:
            param1: (string): subject ID
            param2: (string): session ID
        """
        session = self.get_session(subject, sessionid)
        if not self.ds:
            self.ds = dotdict({
                'samples': session.ds.samples,
                'targets': session.ds.targets
            })
            return

        if sessionid in self.ds.samples.index.get_level_values('session'):
            warnings.warn("Session already added, doing nothing.")
            return

        if subject not in self.ds.samples.index.get_level_values('subject'):
            raise UnmatchedSubjects(
                "Subjects don't match, will not add current session")
        # TODO: other checks ?

        self.__append(session)

    def import_all(self, subject):
        """Import all .mat files for a subject ID.

        Args:
            param1: (string): subject ID
        """
        trialpath = path.join(self.dataroot, '*' + subject + '*mat')
        trialfiles = sorted(glob(trialpath))
        if not trialfiles:
            raise FileNotFoundError(trialpath)

        sessionid_re = re.compile('.*' + subject + '-([0-9]+)-.*mat')
        sessionids = [
            sessionid_re.match(file).groups()[0] for file in trialfiles
        ]
        for sid in sessionids:
            self.add_session(subject, sid)

        self.__sort()

    def save(self, filename, force=False):
        """Save the trials and samples arrays from the current importer
        instance to a dataset inside a lzf compressed hdf5 file for later use.

        Args:
            param1: (string): filename, will be stored in self.importpath

        Optional Args:
            force: (boolean) Whether or not to overwrite an existing file
                             (default: False)
        """
        try:
            mkdir(self.importpath)
        except FileExistsError:
            pass

        filename = path.join(self.importpath, filename)
        if path.exists(filename):
            if force:
                unlink(filename)
            else:
                raise FileExistsError('Import file "' + filename +
                                      '" already exists.')

        self.__sort()
        self.store = HDFStore(filename, complib='lzo')
        self.store['samples'] = self.ds.samples
        self.store['targets'] = self.ds.targets
        self.store.close()

    def load(self, name):
        """Load a hdf5 file created with save() and attach the targets and
        samples array to the current importer instance.

        Args:
            param1: (string): a name for the dataset and the hdf5 file name
        """
        self.open(name)
        self.ds = dotdict({'samples': None, 'targets': None})
        self.ds.samples = self.store['samples']
        self.ds.targets = self.store['targets']
        self.store.close()

    def open(self, name):
        if not path.exists(self.importpath):
            raise FileNotFoundError(path.join(self.dataroot, 'imported'))
        filename = path.join(self.importpath, name)
        if not path.exists(filename):
            raise FileNotFoundError(filename)
        self.store = HDFStore(filename)

    def close(self):
        self.store.close()
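
A minimal usage sketch for the Importer class above; the data directory and subject ID are illustrative, and import_all() expects <subject>-<session>-*.mat files under dataroot, as parsed by get_session().

importer = Importer('/data/eeg')
importer.import_all('subj01')           # load and concatenate every session
importer.save('subj01.h5', force=True)  # writes /data/eeg/imported/subj01.h5

# later: reload the compressed store without re-parsing the .mat files
importer = Importer('/data/eeg')
importer.load('subj01.h5')
print(importer.ds.samples.shape, importer.ds.targets.shape)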
Example #45
class TestHDFStore(unittest.TestCase):
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_factory_fun(self):
        try:
            with get_store(self.scratchpath) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass

        with get_store(self.scratchpath) as tbl:
            tbl['a'] = tm.makeDataFrame()

        with get_store(self.scratchpath) as tbl:
            self.assertEquals(len(tbl), 1)
            self.assertEquals(type(tbl['a']), DataFrame)

        os.remove(self.scratchpath)

    def test_len_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.assertEquals(len(self.store), 4)
        self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))

    def test_repr(self):
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        repr(self.store)

    def test_contains(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        self.assert_('a' in self.store)
        self.assert_('b' in self.store)
        self.assert_('c' not in self.store)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)

        self.assertRaises(KeyError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store.put('c', df[:10], table=True)

        # not OK, not a table
        self.assertRaises(ValueError, self.store.put, 'b', df[10:], append=True)

        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError, self.store.put, 'f', df[10:], append=True)

        # OK
        self.store.put('c', df[10:], append=True)

        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()

        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='blosc')

        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df[:10], table=True)
        self.store.append('c', df[10:])
        tm.assert_frame_equal(self.store['c'], df)

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2,
                          append=True)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])

        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

    def test_remove_where_not_exist(self):
        crit1 = {
            'field' : 'index',
            'op' : '>',
            'value' : 'foo'
        }
        self.store.remove('a', where=[crit1])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

        ts2 = Series(ts.index, Index(ts.index, dtype=object))
        self._check_roundtrip(ts2, tm.assert_series_equal)

        ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object),
                                      dtype=object))
        self._check_roundtrip(ts3, tm.assert_series_equal)

    def test_sparse_series(self):
        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        s = tm.makeDataFrame()
        s.ix[3:5, 1:3] = np.nan
        s.ix[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_double_roundtrip(ss, tm.assert_frame_equal,
                                     check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_double_roundtrip(ss2, tm.assert_frame_equal,
                                     check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_double_roundtrip(ss3, tm.assert_frame_equal,
                                     check_frame_type=True)

    def test_sparse_panel(self):
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame()) for i in items))
        sp = p.to_sparse()

        self._check_double_roundtrip(sp, tm.assert_panel_equal,
                                     check_panel_type=True)

        sp2 = p.to_sparse(kind='integer')
        self._check_double_roundtrip(sp2, tm.assert_panel_equal,
                                     check_panel_type=True)

        sp3 = p.to_sparse(fill_value=0)
        self._check_double_roundtrip(sp3, tm.assert_panel_equal,
                                     check_panel_type=True)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0.,1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_index_types(self):
        values = np.random.randn(2)

        func = lambda l, r : tm.assert_series_equal(l, r, True, True, True)

        ser = Series(values, [0, 'y'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime.today(), 0])
        self._check_roundtrip(ser, func)

        ser = Series(values, ['y', 0])
        self._check_roundtrip(ser, func)

        from datetime import date
        ser = Series(values, [date.today(), 'a'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1.23, 'b'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 1.53])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 5])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)])
        self._check_roundtrip(ser, func)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = bdate_range('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failer on some windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df, tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal,
                                    compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal,
                              compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self._check_roundtrip(df[:0], tm.assert_frame_equal)

    def test_empty_series_frame(self):
        s0 = Series()
        s1 = Series(name='myseries')
        df0 = DataFrame()
        df1 = DataFrame(index=['a', 'b', 'c'])
        df2 = DataFrame(columns=['d', 'e', 'f'])

        self._check_roundtrip(s0, tm.assert_series_equal)
        self._check_roundtrip(s1, tm.assert_series_equal)
        self._check_roundtrip(df0, tm.assert_frame_equal)
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_timezones(self):
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert(recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert(recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']

        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert(recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # storing a mixed-type frame as a Table is not yet supported
        self.assertRaises(Exception, self.store.put, 'foo',
                          df1, table=True)

        # check that we can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal,
                              compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        # self._check_roundtrip(wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts

        tm.assert_series_equal(self.store['a'], ts)

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>=',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>=',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }
        crit3 = {
            'field' : 'column',
            'value' : 'A'
        }

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select,
                          'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = {
            'field' : 'column',
            'value' : df.columns[:75]
        }
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_double_roundtrip(self, obj, comparator, compression=False,
                                **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)

    def test_tseries_indices_series(self):
        idx = tm.makeDateIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

        idx = tm.makePeriodIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

    def test_tseries_indices_frame(self):
        idx = tm.makeDateIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

        idx = tm.makePeriodIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

    def test_unicode_index(self):
        unicode_values = [u'\u03c3', u'\u03c3\u03c3']

        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_store_datetime_mixed(self):
        df = DataFrame({'a': [1,2,3], 'b': [1.,2.,3.], 'c': ['a', 'b', 'c']})
        ts = tm.makeTimeSeries()
        df['d'] = ts.index[:3]
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_cant_write_multiindex_table(self):
        # for now, #1848
        df = DataFrame(np.random.randn(10, 4),
                       index=[np.arange(5).repeat(2),
                              np.tile(np.arange(2), 5)])

        self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
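
The compression tests above map onto the one-line to_hdf API in current pandas; a small sketch with an illustrative file name (blosc requires a PyTables build with blosc support).

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100, 4))
df.to_hdf('compressed_demo.h5', key='df', mode='w',
          format='table', complib='blosc', complevel=9)
print(pd.read_hdf('compressed_demo.h5', 'df').shape)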
Example #46
    def hdfWrite(self, path, excode, symbol, indata, kind1, kind2, kind3):
        # kind1 is one of 'Rawdata', 'Stitch', 'Indicator'
        # kind2 is '00' or '01'
        # kind3 is one of '1d', '60m', '30m', '15m', '5m', '1m'
        # write raw data per frequency: kind1='Rawdata', kind2=None, kind3='1d'
        # write a stitch rule:  kind1='Stitch', kind2='00', kind3=None
        # write stitched data:  kind1='Stitch', kind2='00', kind3='1d'
        # write an indicator:   kind1='Indicator', kind2='Indicator_name', kind3=params (a dict)
        store = HDFStore(path, mode='a')
        if kind1 == EXT_Rawdata:
            key = '/'.join([kind1, excode, symbol, kind3])
        elif kind1 == EXT_Stitch:
            if kind3 is None:
                key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2])
            else:
                key = '/'.join(
                    [kind1, excode, symbol, EXT_Period, kind3, kind2])
        elif kind1 == EXT_Indicator:
            key = '/'.join([kind1, excode, symbol, kind2])
        else:
            print("kind not supported")
            return

        if kind1 == EXT_Indicator:
            f = h5py.File(path, 'a')
            try:
                store[key]
            except KeyError:  # key does not exist yet: create it
                store[key] = indata
                for param_name, value in kind3.items():
                    f[key].attrs[param_name] = value
            else:
                mismatches = 0
                try:
                    # does the first parameter attribute exist on the node?
                    f[key].attrs[next(iter(kind3))]
                except KeyError:
                    store[key] = indata
                    for param_name, value in kind3.items():
                        f[key].attrs[param_name] = value
                else:
                    for param_name, value in kind3.items():
                        mismatches += (f[key].attrs[param_name] != value)
                    if mismatches == 0:  # params match: append only new rows
                        adddata = indata[~indata.index.isin(store[key].index)]
                        store.append(key, adddata)
                    else:  # params differ: overwrite node and attributes
                        store[key] = indata
                        for param_name, value in kind3.items():
                            f[key].attrs[param_name] = value
            f.close()
            store.close()
        else:
            try:
                store[key]
            except KeyError:
                store[key] = indata
            else:
                adddata = indata[~indata.index.isin(store[key].index)]
                if kind2 in [EXT_Series_00, EXT_Series_01]:
                    adddata[EXT_AdjFactor] = adddata[EXT_AdjFactor] * store[
                        key][EXT_AdjFactor].iloc[-1] / adddata[
                            EXT_AdjFactor].iloc[0]
                store.append(key, adddata)
            store.close()
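
hdfWrite mixes two libraries on one file: pandas writes the frame, then h5py annotates the same node with indicator parameters. A minimal sketch of that pattern, with an illustrative file name and key (pandas stores its own metadata attributes alongside the custom one).

import h5py
import pandas as pd

df = pd.DataFrame({'macd': [0.1, 0.2]},
                  index=pd.date_range('2020-01-01', periods=2))
key = 'Indicator/SHFE/rb/MACD'

store = pd.HDFStore('attrs_demo.h5', mode='w')
store[key] = df
store.close()

with h5py.File('attrs_demo.h5', 'a') as f:
    f[key].attrs['fast'] = 12     # parameter attribute on the group node
    print(dict(f[key].attrs))     # custom attr plus pandas metadata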
Example #47
class TestHDFStore(unittest.TestCase):
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_factory_fun(self):
        try:
            with get_store(self.scratchpath) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass

        with get_store(self.scratchpath) as tbl:
            tbl['a'] = tm.makeDataFrame()

        with get_store(self.scratchpath) as tbl:
            self.assertEquals(len(tbl), 1)
            self.assertEquals(type(tbl['a']), DataFrame)

        os.remove(self.scratchpath)

    def test_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.assertEquals(len(self.store), 5)
        self.assert_(
            set(self.store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar'
                                           ]))

    def test_repr(self):
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.store.append('e', tm.makePanel())
        repr(self.store)
        str(self.store)

    def test_contains(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        self.store['foo/bar'] = tm.makeDataFrame()
        self.assert_('a' in self.store)
        self.assert_('b' in self.store)
        self.assert_('c' not in self.store)
        self.assert_('foo/bar' in self.store)
        self.assert_('/foo/bar' in self.store)
        self.assert_('/foo/b' not in self.store)
        self.assert_('bar' not in self.store)

    def test_versioning(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        df = tm.makeTimeDataFrame()
        self.store.remove('df1')
        self.store.append('df1', df[:10])
        self.store.append('df1', df[10:])
        self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10')
        self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10')
        self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10')

        # write a file and wipe its versioning
        self.store.remove('df2')
        self.store.append('df2', df)
        self.store.get_node('df2')._v_attrs.pandas_version = None
        self.store.select('df2')
        self.store.select('df2', [Term('index', '>', df.index[2])])

    def test_meta(self):
        raise nose.SkipTest('no meta')

        meta = {'foo': ['I love pandas ']}
        s = tm.makeTimeSeries()
        s.meta = meta
        self.store['a'] = s
        self.assert_(self.store['a'].meta == meta)

        df = tm.makeDataFrame()
        df.meta = meta
        self.store['b'] = df
        self.assert_(self.store['b'].meta == meta)

        # this should work, but slicing doesn't propagate meta, so it doesn't
        self.store.remove('df1')
        self.store.append('df1', df[:10])
        self.store.append('df1', df[10:])
        results = self.store['df1']
        #self.assert_(getattr(results,'meta',None) == meta)

        # no meta
        df = tm.makeDataFrame()
        self.store['b'] = df
        self.assert_(hasattr(self.store['b'], 'meta') == False)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)

        left = self.store.get('/a')
        right = self.store['/a']
        tm.assert_series_equal(left, right)

        self.assertRaises(KeyError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store['foo/bar/bah'] = df[:10]
        self.store['foo'] = df[:10]
        self.store['/foo'] = df[:10]
        self.store.put('c', df[:10], table=True)

        # not OK, not a table
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df[10:],
                          append=True)

        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError,
                          self.store.put,
                          'f',
                          df[10:],
                          append=True)

        # OK
        self.store.put('c', df[10:], append=True)

        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_string_index(self):

        index = Index(
            ["I am a very long string index: %s" % i for i in range(20)])
        s = Series(np.arange(20), index=index)
        df = DataFrame({'A': s, 'B': s})

        self.store['a'] = s
        tm.assert_series_equal(self.store['a'], s)

        self.store['b'] = df
        tm.assert_frame_equal(self.store['b'], df)

        # mixed length
        index = Index(
            ['abcdefghijklmnopqrstuvwxyz1234567890'] +
            ["I am a very long string index: %s" % i for i in range(20)])
        s = Series(np.arange(21), index=index)
        df = DataFrame({'A': s, 'B': s})
        self.store['a'] = s
        tm.assert_series_equal(self.store['a'], s)

        self.store['b'] = df
        tm.assert_frame_equal(self.store['b'], df)

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()

        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)

        # can't compress if table=False
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df,
                          table=False,
                          compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()

        # can't compress if table=False
        self.assertRaises(ValueError,
                          self.store.put,
                          'b',
                          df,
                          table=False,
                          compression='blosc')

        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):

        df = tm.makeTimeDataFrame()
        self.store.remove('df1')
        self.store.append('df1', df[:10])
        self.store.append('df1', df[10:])
        tm.assert_frame_equal(self.store['df1'], df)

        self.store.remove('df2')
        self.store.put('df2', df[:10], table=True)
        self.store.append('df2', df[10:])
        tm.assert_frame_equal(self.store['df2'], df)

        self.store.remove('df3')
        self.store.append('/df3', df[:10])
        self.store.append('/df3', df[10:])
        tm.assert_frame_equal(self.store['df3'], df)

        # this is allowed, but you almost always don't want to do it
        warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
        self.store.remove('/df3 foo')
        self.store.append('/df3 foo', df[:10])
        self.store.append('/df3 foo', df[10:])
        tm.assert_frame_equal(self.store['df3 foo'], df)
        warnings.filterwarnings('always', category=tables.NaturalNameWarning)

        # panel
        wp = tm.makePanel()
        self.store.remove('wp1')
        self.store.append('wp1', wp.ix[:, :10, :])
        self.store.append('wp1', wp.ix[:, 10:, :])
        tm.assert_panel_equal(self.store['wp1'], wp)

        # ndim
        p4d = tm.makePanel4D()
        self.store.remove('p4d')
        self.store.append('p4d', p4d.ix[:, :, :10, :])
        self.store.append('p4d', p4d.ix[:, :, 10:, :])
        tm.assert_panel4d_equal(self.store['p4d'], p4d)

        # test using axis labels
        self.store.remove('p4d')
        self.store.append('p4d',
                          p4d.ix[:, :, :10, :],
                          axes=['items', 'major_axis', 'minor_axis'])
        self.store.append('p4d',
                          p4d.ix[:, :, 10:, :],
                          axes=['items', 'major_axis', 'minor_axis'])
        tm.assert_panel4d_equal(self.store['p4d'], p4d)

        # test using a different number of items on each axis
        p4d2 = p4d.copy()
        p4d2['l4'] = p4d['l1']
        p4d2['l5'] = p4d['l1']
        self.store.remove('p4d2')
        self.store.append('p4d2',
                          p4d2,
                          axes=['items', 'major_axis', 'minor_axis'])
        tm.assert_panel4d_equal(self.store['p4d2'], p4d2)

        # test using a different order of items on the non-index axes
        self.store.remove('wp1')
        wp_append1 = wp.ix[:, :10, :]
        self.store.append('wp1', wp_append1)
        wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1])
        self.store.append('wp1', wp_append2)
        tm.assert_panel_equal(self.store['wp1'], wp)

    def test_append_frame_column_oriented(self):

        # column oriented
        df = tm.makeTimeDataFrame()
        self.store.remove('df1')
        self.store.append('df1', df.ix[:, :2], axes=['columns'])
        self.store.append('df1', df.ix[:, 2:])
        tm.assert_frame_equal(self.store['df1'], df)

        result = self.store.select('df1', 'columns=A')
        expected = df.reindex(columns=['A'])
        tm.assert_frame_equal(expected, result)

        # this isn't supported
        self.assertRaises(Exception, self.store.select, 'df1',
                          ('columns=A', Term('index', '>', df.index[4])))

        # selection on the non-indexable
        result = self.store.select(
            'df1', ('columns=A', Term('index', '=', df.index[0:4])))
        expected = df.reindex(columns=['A'], index=df.index[0:4])
        tm.assert_frame_equal(expected, result)

    def test_ndim_indexables(self):
        """ test using ndim tables in new ways"""

        p4d = tm.makePanel4D()

        def check_indexers(key, indexers):
            for i, idx in enumerate(indexers):
                self.assert_(
                    getattr(
                        getattr(self.store.root, key).table.description,
                        idx)._v_pos == i)

        # append then change (will take existing schema)
        indexers = ['items', 'major_axis', 'minor_axis']

        self.store.remove('p4d')
        self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
        self.store.append('p4d', p4d.ix[:, :, 10:, :])
        tm.assert_panel4d_equal(self.store.select('p4d'), p4d)
        check_indexers('p4d', indexers)

        # same as above, but try to append with different axes
        self.store.remove('p4d')
        self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
        self.store.append('p4d',
                          p4d.ix[:, :, 10:, :],
                          axes=['labels', 'items', 'major_axis'])
        tm.assert_panel4d_equal(self.store.select('p4d'), p4d)
        check_indexers('p4d', indexers)

        # pass incorrect number of axes
        self.store.remove('p4d')
        self.assertRaises(Exception,
                          self.store.append,
                          'p4d',
                          p4d.ix[:, :, :10, :],
                          axes=['major_axis', 'minor_axis'])

        # different than default indexables #1
        indexers = ['labels', 'major_axis', 'minor_axis']
        self.store.remove('p4d')
        self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
        self.store.append('p4d', p4d.ix[:, :, 10:, :])
        tm.assert_panel4d_equal(self.store['p4d'], p4d)
        check_indexers('p4d', indexers)

        # different than default indexables #2
        indexers = ['major_axis', 'labels', 'minor_axis']
        self.store.remove('p4d')
        self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers)
        self.store.append('p4d', p4d.ix[:, :, 10:, :])
        tm.assert_panel4d_equal(self.store['p4d'], p4d)
        check_indexers('p4d', indexers)

        # partial selection
        result = self.store.select('p4d', ['labels=l1'])
        expected = p4d.reindex(labels=['l1'])
        tm.assert_panel4d_equal(result, expected)

        # partial selection2
        result = self.store.select(
            'p4d',
            [Term('labels=l1'),
             Term('items=ItemA'),
             Term('minor_axis=B')])
        expected = p4d.reindex(labels=['l1'],
                               items=['ItemA'],
                               minor_axis=['B'])
        tm.assert_panel4d_equal(result, expected)

        # non-existent partial selection
        result = self.store.select(
            'p4d',
            [Term('labels=l1'),
             Term('items=Item1'),
             Term('minor_axis=B')])
        expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B'])
        tm.assert_panel4d_equal(result, expected)

    def test_append_with_strings(self):
        wp = tm.makePanel()
        wp2 = wp.rename_axis(dict([(x, "%s_extra" % x)
                                   for x in wp.minor_axis]),
                             axis=2)

        self.store.append('s1', wp, min_itemsize=20)
        self.store.append('s1', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s1'], expected)

        # test dict format
        self.store.append('s2', wp, min_itemsize={'minor_axis': 20})
        self.store.append('s2', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s2'], expected)

        # apply the wrong field (similar to #1)
        self.store.append('s3', wp, min_itemsize={'major_axis': 20})
        self.assertRaises(Exception, self.store.append, 's3')

        # test truncation of bigger strings
        self.store.append('s4', wp)
        self.assertRaises(Exception, self.store.append, 's4', wp2)

        # avoid truncation on elements
        df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
        self.store.append('df_big', df, min_itemsize={'values': 1024})
        tm.assert_frame_equal(self.store.select('df_big'), df)

        # appending smaller string ok
        df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
        self.store.append('df_big', df2)
        expected = concat([df, df2])
        tm.assert_frame_equal(self.store.select('df_big'), expected)

        # avoid truncation on elements
        df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
        self.store.append('df_big2', df, min_itemsize={'values': 10})
        tm.assert_frame_equal(self.store.select('df_big2'), df)

        # bigger string on next append
        self.store.append('df_new', df, min_itemsize={'values': 16})
        df_new = DataFrame([[124, 'abcdefqhij'],
                            [346, 'abcdefghijklmnopqrtsuvwxyz']])
        self.assertRaises(Exception, self.store.append, 'df_new', df_new)

    def test_create_table_index(self):
        wp = tm.makePanel()
        self.store.append('p5', wp)
        self.store.create_table_index('p5')

        assert (
            self.store.handle.root.p5.table.cols.major_axis.is_indexed == True)
        assert (self.store.handle.root.p5.table.cols.minor_axis.is_indexed ==
                False)

        # default optlevels
        assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel
                == 6)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.kind ==
                'medium')

        # let's change the indexing scheme
        self.store.create_table_index('p5')
        assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel
                == 6)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.kind ==
                'medium')
        self.store.create_table_index('p5', optlevel=9)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel
                == 9)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.kind ==
                'medium')
        self.store.create_table_index('p5', kind='full')
        assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel
                == 9)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.kind ==
                'full')
        self.store.create_table_index('p5', optlevel=1, kind='light')
        assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel
                == 1)
        assert (self.store.handle.root.p5.table.cols.major_axis.index.kind ==
                'light')

        df = tm.makeTimeDataFrame()
        self.store.append('f', df[:10])
        self.store.append('f', df[10:])
        self.store.create_table_index('f')

        # try to index a non-table
        self.store.put('f2', df)
        self.assertRaises(Exception, self.store.create_table_index, 'f2')

        # try to change the version supports flag
        from pandas.io import pytables
        pytables._table_supports_index = False
        self.assertRaises(Exception, self.store.create_table_index, 'f')

        # test out some versions
        original = tables.__version__

        for v in ['2.2', '2.2b']:
            pytables._table_mod = None
            pytables._table_supports_index = False
            tables.__version__ = v
            self.assertRaises(Exception, self.store.create_table_index, 'f')

        for v in ['2.3.1', '2.3.1b', '2.4dev', '2.4', original]:
            pytables._table_mod = None
            pytables._table_supports_index = False
            tables.__version__ = v
            self.store.create_table_index('f')
        pytables._table_mod = None
        pytables._table_supports_index = False
        tables.__version__ = original

    def test_big_table(self):
        raise nose.SkipTest('no big table')

        # create and write a big table
        wp = Panel(np.random.randn(20, 1000, 1000),
                   items=['Item%s' % i for i in xrange(20)],
                   major_axis=date_range('1/1/2000', periods=1000),
                   minor_axis=['E%s' % i for i in xrange(1000)])

        wp.ix[:, 100:200, 300:400] = np.nan

        try:
            store = HDFStore(self.scratchpath)
            store._debug_memory = True
            store.append('wp', wp)
            recons = store.select('wp')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_append_diff_item_order(self):
        raise nose.SkipTest('append diff item order')

        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2, append=True)

    def test_table_index_incompatible_dtypes(self):
        df1 = DataFrame({'a': [1, 2, 3]})
        df2 = DataFrame({'a': [4, 5, 6]},
                        index=date_range('1/1/2000', periods=3))

        self.store.put('frame', df1, table=True)
        self.assertRaises(Exception,
                          self.store.put,
                          'frame',
                          df2,
                          table=True,
                          append=True)

    def test_table_values_dtypes_roundtrip(self):
        df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
        self.store.append('df1', df1)
        assert df1.dtypes == self.store['df1'].dtypes

        df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
        self.store.append('df2', df2)
        assert df2.dtypes == self.store['df2'].dtypes

        # incompatible dtype
        self.assertRaises(Exception, self.store.append, 'df2', df1)

    def test_table_mixed_dtypes(self):

        # frame
        def _make_one_df():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['bool3'] = True
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one_df()

        self.store.append('df1_mixed', df1)
        tm.assert_frame_equal(self.store.select('df1_mixed'), df1)

        # panel
        def _make_one_panel():
            wp = tm.makePanel()
            wp['obj1'] = 'foo'
            wp['obj2'] = 'bar'
            wp['bool1'] = wp['ItemA'] > 0
            wp['bool2'] = wp['ItemB'] > 0
            wp['int1'] = 1
            wp['int2'] = 2
            return wp.consolidate()

        p1 = _make_one_panel()

        self.store.append('p1_mixed', p1)
        tm.assert_panel_equal(self.store.select('p1_mixed'), p1)

        # ndim
        def _make_one_p4d():
            wp = tm.makePanel4D()
            wp['obj1'] = 'foo'
            wp['obj2'] = 'bar'
            wp['bool1'] = wp['l1'] > 0
            wp['bool2'] = wp['l2'] > 0
            wp['int1'] = 1
            wp['int2'] = 2
            return wp.consolidate()

        p4d = _make_one_p4d()
        self.store.append('p4d_mixed', p4d)
        tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])

        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

        # pathing
        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('foo')
        self.store.remove('b/foo')
        self.assertEquals(len(self.store), 1)

        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('b')
        self.assertEquals(len(self.store), 1)

        # __delitem__
        self.store['a'] = ts
        self.store['b'] = df
        del self.store['a']
        del self.store['b']
        self.assertEquals(len(self.store), 0)

    def test_remove_where(self):

        # non-existence
        crit1 = Term('index', '>', 'foo')
        self.store.remove('a', where=[crit1])

        # try to remove non-table (with crit)
        # non-table ok (where = None)
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        self.store.remove('wp', [('minor_axis', ['A', 'D'])])
        rs = self.store.select('wp')
        expected = wp.reindex(minor_axis=['B', 'C'])
        tm.assert_panel_equal(rs, expected)

        # empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)

        # number of rows deleted (entire table)
        n = self.store.remove('wp', [])
        assert (n == 120)

        # non-empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.assertRaises(Exception, self.store.remove, 'wp', ['foo'])

        # selecting a non-table with a where
        #self.store.put('wp2', wp, table=False)
        #self.assertRaises(Exception, self.store.remove,
        #                  'wp2', [('column', ['A', 'D'])])

    def test_remove_crit(self):
        wp = tm.makePanel()

        # group row removal
        date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10])
        crit4 = Term('major_axis', date4)
        self.store.put('wp3', wp, table=True)
        n = self.store.remove('wp3', where=[crit4])
        assert (n == 36)
        result = self.store.select('wp3')
        expected = wp.reindex(major_axis=wp.major_axis - date4)
        tm.assert_panel_equal(result, expected)

        # upper half
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = Term('major_axis', '>', date)
        crit2 = Term('minor_axis', ['A', 'D'])
        n = self.store.remove('wp', where=[crit1])

        assert (n == 56)

        n = self.store.remove('wp', where=[crit2])
        assert (n == 32)

        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

        # individual row elements
        self.store.put('wp2', wp, table=True)

        date1 = wp.major_axis[1:3]
        crit1 = Term('major_axis', date1)
        self.store.remove('wp2', where=[crit1])
        result = self.store.select('wp2')
        expected = wp.reindex(major_axis=wp.major_axis - date1)
        tm.assert_panel_equal(result, expected)

        date2 = wp.major_axis[5]
        crit2 = Term('major_axis', date2)
        self.store.remove('wp2', where=[crit2])
        result = self.store['wp2']
        expected = wp.reindex(major_axis=wp.major_axis - date1 -
                              Index([date2]))
        tm.assert_panel_equal(result, expected)

        date3 = [wp.major_axis[7], wp.major_axis[9]]
        crit3 = Term('major_axis', date3)
        self.store.remove('wp2', where=[crit3])
        result = self.store['wp2']
        expected = wp.reindex(major_axis=wp.major_axis - date1 -
                              Index([date2]) - Index(date3))
        tm.assert_panel_equal(result, expected)

        # corners
        self.store.put('wp4', wp, table=True)
        n = self.store.remove(
            'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])])
        result = self.store.select('wp4')
        tm.assert_panel_equal(result, wp)

    def test_terms(self):

        wp = tm.makePanel()
        p4d = tm.makePanel4D()
        self.store.put('wp', wp, table=True)
        self.store.put('p4d', p4d, table=True)

        # some invalid terms
        terms = [
            ['minor', ['A', 'B']],
            ['index', ['20121114']],
            ['index', ['20121114', '20121114']],
        ]
        for t in terms:
            self.assertRaises(Exception, self.store.select, 'wp', t)

        self.assertRaises(Exception, Term.__init__)
        self.assertRaises(Exception, Term.__init__, 'blah')
        self.assertRaises(Exception, Term.__init__, 'index')
        self.assertRaises(Exception, Term.__init__, 'index', '==')
        self.assertRaises(Exception, Term.__init__, 'index', '>', 5)

        # panel
        result = self.store.select(
            'wp',
            [Term('major_axis<20000108'),
             Term('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)

        # p4d
        result = self.store.select('p4d', [
            Term('major_axis<20000108'),
            Term('minor_axis', '=', ['A', 'B']),
            Term('items', '=', ['ItemA', 'ItemB'])
        ])
        expected = p4d.truncate(after='20000108').reindex(
            minor=['A', 'B'], items=['ItemA', 'ItemB'])
        tm.assert_panel4d_equal(result, expected)

        # valid terms
        terms = [
            dict(field='major_axis', op='>', value='20121114'),
            ('major_axis', '20121114'),
            ('major_axis', '>', '20121114'),
            (('major_axis', ['20121114', '20121114']), ),
            ('major_axis', datetime(2012, 11, 14)),
            'major_axis>20121114',
            'major_axis>20121114',
            'major_axis>20121114',
            (('minor_axis', ['A', 'B']), ),
            (('minor_axis', ['A', 'B']), ),
            ((('minor_axis', ['A', 'B']), ), ),
            (('items', ['ItemA', 'ItemB']), ),
            ('items=ItemA'),
        ]

        for t in terms:
            self.store.select('wp', t)
            self.store.select('p4d', t)

        # valid for p4d only
        terms = [
            (('labels', '=', ['l1', 'l2']), ),
            Term('labels', '=', ['l1', 'l2']),
        ]

        for t in terms:
            self.store.select('p4d', t)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

        ts2 = Series(ts.index, Index(ts.index, dtype=object))
        self._check_roundtrip(ts2, tm.assert_series_equal)

        ts3 = Series(ts.values,
                     Index(np.asarray(ts.index, dtype=object), dtype=object))
        self._check_roundtrip(ts3, tm.assert_series_equal)

    def test_sparse_series(self):
        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss,
                              tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2,
                              tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3,
                              tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        s = tm.makeDataFrame()
        s.ix[3:5, 1:3] = np.nan
        s.ix[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_double_roundtrip(ss,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_double_roundtrip(ss2,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_double_roundtrip(ss3,
                                     tm.assert_frame_equal,
                                     check_frame_type=True)

    def test_sparse_panel(self):
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
        sp = p.to_sparse()

        self._check_double_roundtrip(sp,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

        sp2 = p.to_sparse(kind='integer')
        self._check_double_roundtrip(sp2,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

        sp3 = p.to_sparse(fill_value=0)
        self._check_double_roundtrip(sp3,
                                     tm.assert_panel_equal,
                                     check_panel_type=True)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0., 1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_index_types(self):
        values = np.random.randn(2)

        func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)

        ser = Series(values, [0, 'y'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime.today(), 0])
        self._check_roundtrip(ser, func)

        ser = Series(values, ['y', 0])
        self._check_roundtrip(ser, func)

        from datetime import date
        ser = Series(values, [date.today(), 'a'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1.23, 'b'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 1.53])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 5])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)])
        self._check_roundtrip(ser, func)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = bdate_range('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failure on some Windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df,
                                    tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal, compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self._check_roundtrip(df[:0], tm.assert_frame_equal)

    def test_empty_series_frame(self):
        s0 = Series()
        s1 = Series(name='myseries')
        df0 = DataFrame()
        df1 = DataFrame(index=['a', 'b', 'c'])
        df2 = DataFrame(columns=['d', 'e', 'f'])

        self._check_roundtrip(s0, tm.assert_series_equal)
        self._check_roundtrip(s1, tm.assert_series_equal)
        self._check_roundtrip(df0, tm.assert_frame_equal)
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_timezones(self):
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_fixed_offset_tz(self):
        rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3),
                          index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert (recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert (recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']

        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert (recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'],
                              tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal, compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        # self._check_roundtrip(wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts

        tm.assert_series_equal(self.store['a'], ts)

    def test_select(self):
        wp = tm.makePanel()

        # put/select ok
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.store.select('wp')

        # non-table ok (where = None)
        self.store.remove('wp')
        self.store.put('wp2', wp, table=False)
        self.store.select('wp2')

        # selection on the non-indexable with a large number of columns
        wp = Panel(np.random.randn(100, 100, 100),
                   items=['Item%03d' % i for i in xrange(100)],
                   major_axis=date_range('1/1/2000', periods=100),
                   minor_axis=['E%03d' % i for i in xrange(100)])

        self.store.remove('wp')
        self.store.append('wp', wp)
        items = ['Item%03d' % i for i in xrange(80)]
        result = self.store.select('wp', Term('items', items))
        expected = wp.reindex(items=items)
        tm.assert_panel_equal(expected, result)

        # selecting a non-table with a where
        #self.assertRaises(Exception, self.store.select,
        #                  'wp2', ('column', ['A', 'D']))

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = ('major_axis', '>=', date)
        crit2 = ('minor_axis', '=', ['A', 'D'])

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

        result = self.store.select(
            'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(before='20000124').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = ('index', '>=', date)
        crit2 = ('columns', ['A', 'D'])
        crit3 = ('columns', 'A')

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # other indices for a frame

        # integer
        df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
        self.store.append('df_int', df)
        self.store.select(
            'df_int',
            [Term("index<10"), Term("columns", "=", ["A"])])

        df = DataFrame(
            dict(A=np.random.rand(20),
                 B=np.random.rand(20),
                 index=np.arange(20, dtype='f8')))
        self.store.append('df_float', df)
        self.store.select('df_float',
                          [Term("index<10.0"),
                           Term("columns", "=", ["A"])])

        # can't select if not written as table
        #self.store['frame'] = df
        #self.assertRaises(Exception, self.store.select,
        #                  'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = Term('columns', df.columns[:75])
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_double_roundtrip(self,
                                obj,
                                comparator,
                                compression=False,
                                **kwargs):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            # sorted_obj = _test_sort(obj)
            comparator(retrieved, obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_legacy_table_read(self):
        # legacy table types
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
        store.select('df1')
        store.select('df2')
        store.select('wp1')

        # force the frame
        store.select('df2', typ='legacy_frame')

        # old version (this still throws an exception though)
        import warnings
        warnings.filterwarnings('ignore', category=IncompatibilityWarning)
        self.assertRaises(Exception, store.select, 'wp1',
                          Term('minor_axis', '=', 'B'))
        warnings.filterwarnings('always', category=IncompatibilityWarning)

        store.close()

    def test_legacy_table_write(self):
        # legacy table types
        pth = curpath()
        df = tm.makeDataFrame()
        wp = tm.makePanel()

        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')

        self.assertRaises(Exception, store.append, 'df1', df)
        self.assertRaises(Exception, store.append, 'wp1', wp)

        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)

    def test_tseries_indices_series(self):
        idx = tm.makeDateIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

        idx = tm.makePeriodIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

    def test_tseries_indices_frame(self):
        idx = tm.makeDateIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

        idx = tm.makePeriodIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

    def test_unicode_index(self):
        unicode_values = [u'\u03c3', u'\u03c3\u03c3']

        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_store_datetime_mixed(self):
        df = DataFrame({
            'a': [1, 2, 3],
            'b': [1., 2., 3.],
            'c': ['a', 'b', 'c']
        })
        ts = tm.makeTimeSeries()
        df['d'] = ts.index[:3]
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_cant_write_multiindex_table(self):
        # for now, #1848
        df = DataFrame(
            np.random.randn(10, 4),
            index=[np.arange(5).repeat(2),
                   np.tile(np.arange(2), 5)])

        self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
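
The string tests above revolve around min_itemsize; a minimal standalone sketch of the same idea (the file name, frame contents, and the 20-byte width are illustrative assumptions):

import os
from pandas import DataFrame, HDFStore

store = HDFStore('__min_itemsize_demo__.h5', 'w')
try:
    # reserve 20 bytes per string value so longer strings appended later still fit
    store.append('demo', DataFrame([[1, 'short']]), min_itemsize={'values': 20})
    store.append('demo', DataFrame([[2, 'a much longer value']]))
    print store.select('demo')
finally:
    store.close()
    os.remove('__min_itemsize_demo__.h5')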
예제 #48
0
 def from_hdf5(cls, fname):
     history_store = HDFStore(fname)
     for attribute in cls._store_attributes:
         setattr(cls, attribute, history_store[attribute])
     history_store.close()
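
A hedged usage sketch for the loader above, assuming it is wired into a class as a classmethod; the class name, attribute keys, and file name are hypothetical:

class History(object):
    _store_attributes = ['prices', 'volumes']   # hypothetical HDF keys
    from_hdf5 = classmethod(from_hdf5)          # attach the loader defined above

History.from_hdf5('history.h5')  # sets History.prices and History.volumes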
예제 #49
0
class TestHDFStore(unittest.TestCase):
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_len_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.assertEquals(len(self.store), 4)
        self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))

    def test_repr(self):
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        repr(self.store)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)

        self.assertRaises(AttributeError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store.put('c', df[:10], table=True)

        # not OK, not a table
        self.assertRaises(ValueError, self.store.put, 'b', df[10:], append=True)

        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError, self.store.put, 'f', df[10:], append=True)

        # OK
        self.store.put('c', df[10:], append=True)

        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()

        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='blosc')

        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df[:10], table=True)
        self.store.append('c', df[10:])
        tm.assert_frame_equal(self.store['c'], df)

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2,
                          append=True)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])

        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

    def test_remove_where_not_exist(self):
        crit1 = {
            'field' : 'index',
            'op' : '>',
            'value' : 'foo'
        }
        self.store.remove('a', where=[crit1])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0.,1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = DateRange('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failure on some Windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df, tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal,
                                    compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal,
                              compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self.assertRaises(ValueError, self._check_roundtrip, df[:0],
                          tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in DateRange('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert(recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert(recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']

        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert(recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # storing in Table not yet supported
        self.assertRaises(Exception, self.store.put, 'foo',
                          df1, table=True)

        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal,
                              compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        self.assertRaises(ValueError, self._check_roundtrip, wp.to_frame()[:0],
                          _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts

        tm.assert_series_equal(self.store['a'], ts)

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>=',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = {
            'field' : 'index',
            'op' : '>=',
            'value' : date
        }
        crit2 = {
            'field' : 'column',
            'value' : ['A', 'D']
        }
        crit3 = {
            'field' : 'column',
            'value' : 'A'
        }

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select,
                          'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = {
            'field' : 'column',
            'value' : df.columns[:75]
        }
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)
예제 #50
0
def append_main_summary(newcols, storename_to_append, new_store_position_colname, new_store_glyglyseq_colname):
    """
    This function only appends data to the summary dataframe:
    columns compliant with the main dataframe, but holding
    sequences that are not yet present in the main dataframe.

    :param newcols: column names in the main summary
    :param storename_to_append: store name to open and analyze
    :param new_store_position_colname: Position values column name in the new store
    :param new_store_glyglyseq_colname: Sequences values column name in the new store
    :return: None; appends the new rows to the main summary table and saves the result back to the same dataframe in the HDF store
    """
    # Open summary from hdf
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    what_to_append = store[storename_to_append]

    # Get GlyGly values
    all_glygly_in_newstore = what_to_append[new_store_glyglyseq_colname].values
    all_glygly_in_summary = data_summary['GlyGly (K) Probabilities'].values

    # Find new sequences to add to the summary
    newcomer_seqs = []
    for x in all_glygly_in_newstore:
        clear_seq = clear_sequence(x)
        if clear_seq not in all_glygly_in_summary:
            newcomer_seqs.append(clear_seq)

    print len(newcomer_seqs), 'new sequences were found in', storename_to_append

    # Clean them as well
    clean_newcomer_seqs = map(lambda x: re.sub(r'[^A-Z]', '', x), newcomer_seqs)

    # Find compliant positions
    subset_index = what_to_append[new_store_glyglyseq_colname].isin(
        newcomer_seqs)  # fetch the subset where the newcomers are present
    positions = what_to_append[subset_index][new_store_position_colname].values

    # BlastP query results
    blastpID_HUMAN, blastpID_RODENTS = fetch_indentity_from_local_batch(clean_newcomer_seqs)

    # convert to pandas series
    positions = pd.Series(positions)
    newcomer_seqs = pd.Series(newcomer_seqs)
    clean_newcomer_seqs = pd.Series(clean_newcomer_seqs)
    blastpID_HUMAN = pd.Series(blastpID_HUMAN)
    blastpID_RODENTS = pd.Series(blastpID_RODENTS)

    # Create empty dataframe to be appended
    data_summary_appendix = pd.DataFrame(columns=newcols)

    # Combine everything required in dataframe
    data_summary_appendix['Position'] = positions
    data_summary_appendix['GlyGly (K) Probabilities'] = newcomer_seqs
    data_summary_appendix['GlyGly Probabilities'] = clean_newcomer_seqs
    data_summary_appendix['SP_ID_BLASTP_HUMAN'] = blastpID_HUMAN
    data_summary_appendix['SP_ID_BLASTP_RODENTS'] = blastpID_RODENTS

    # Append main DataBases_Summary
    data_summary = data_summary.append(data_summary_appendix)

    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
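
A hedged call sketch for the function above; the store key and column names are assumptions patterned on the columns referenced in the body:

newcols = ['Position', 'GlyGly (K) Probabilities', 'GlyGly Probabilities',
           'SP_ID_BLASTP_HUMAN', 'SP_ID_BLASTP_RODENTS']    # assumed summary columns
append_main_summary(newcols,
                    storename_to_append='New_Dataset',      # assumed HDF key
                    new_store_position_colname='Position',
                    new_store_glyglyseq_colname='GlyGly (K) Probabilities')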
예제 #51
0
    def downloadCSV(self, startdate, waiting_time, download_dir,
                    DaysTillStore):
        chrome_options = webdriver.ChromeOptions()
        preferences = {
            "download.default_directory": download_dir,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
            "extensions_to_open": ""
        }
        chrome_options.add_experimental_option("prefs", preferences)
        driver = webdriver.Chrome(chrome_options=chrome_options)
        patentfile = self.cwd + '\\PatentScraper\\patents.h5'
        Patent_info = pd.DataFrame(data=None,
                                   columns=[
                                       "id", "title", "assignee",
                                       "inventor/author", "priority date",
                                       "filing/creation date",
                                       "publication date", "grant date",
                                       "result link"
                                   ])

        for i in range(100000):
            time_delta_before = datetime.timedelta(days=i)
            time_delta_after = datetime.timedelta(days=i + 1)
            Date_before = startdate - time_delta_before
            Date_before = Date_before.strftime("%Y%m%d")
            Date_after = startdate - time_delta_after
            Date_after = Date_after.strftime("%Y%m%d")

            # Leaving this in, because OUR download speed is not the limiting factor
            # check internet connection; only continue once a connection is available
            connection = 0
            while connection < 1:
                try:
                    urllib.request.urlopen('http://www.python.org/')
                    #return True
                    connection = 1
                except:
                    connection = 0
                    time.sleep(3)
                    print('no connection')
                    continue
            driver_connection = 0
            # check if the webdriver is still running; only continue once it is functional
            while driver_connection < 1:
                try:
                    driver.get('file:///C:/')
                    driver_connection = 1
                except:
                    driver_connection = 0
                    time.sleep(3)
                    print('driver crashed')
                    driver = webdriver.Chrome()
                    continue

            url = 'https://patents.google.com/xhr/query?url=before%3Dfiling%3A' + str(
                Date_before) + '%26after%3Dfiling%3A' + str(
                    Date_after) + '&exp=&download=true'
            print(i, Date_before)
            driver.get(url)

            # rand(1, 1) returns a 1x1 matrix; take a plain float before sleeping
            randNR = 1 + float(numpy.matlib.rand(1, 1)) * waiting_time
            time.sleep(randNR)

            filename = max(
                [os.path.join(download_dir, f) for f in os.listdir(download_dir)],
                key=os.path.getctime)
            shutil.move(filename, str(Date_before) + "_filing.csv")

            Data = pd.read_csv(str(Date_before) + "_filing.csv",
                               skiprows=(1),
                               header=(0))
            # select the nine columns of interest directly instead of
            # concatenating nine single-column frames
            df = Data[[
                "id", "title", "assignee", "inventor/author",
                "priority date", "filing/creation date",
                "publication date", "grant date", "result link"
            ]]

            Patent_info = Patent_info.append(df)

            if i % DaysTillStore == 0:

                store = HDFStore(patentfile, complevel=4)

                Patent_info = Patent_info[[
                    "id", "title", "assignee", "inventor/author",
                    "priority date", "filing/creation date",
                    "publication date", "grant date", "result link"
                ]]

                print("cumm", Patent_info.shape)

                try:
                    Patent_info_store = store['Patent_info']
                    Patent_info_store = Patent_info_store.append(Patent_info)
                except KeyError:
                    Patent_info_store = Patent_info

                print(Patent_info_store.shape)

                store['Patent_info'] = Patent_info_store

                # compress the file... otherwise it grows to hundreds of GB.
                # Compression is already applied via complevel on the store,
                # so the ptrepack repacking below is left disabled.

                #                 store.close()
                #                 outfilename =  self.cwd +'\\PatentScraper\\out.h5'
                #                 command = ["ptrepack", "-o", "--chunkshape=auto", "--propindexes", patentfile, outfilename]
                #                 print('Size of %s is %.2fMB' % (patentfile, float(os.stat(patentfile).st_size)/1024**2))
                #                 if call(command) != 0:
                #                     print('Error')
                #                 else:
                #                     print('Size of %s is %.2fMB' % (outfilename, float(os.stat(outfilename).st_size)/1024**2))
                #                 os.remove(patentfile)
                #                 os.renames(outfilename, patentfile)
                store.close()
                print(
                    'Size of %s is %.2fMB' %
                    (patentfile, float(os.stat(patentfile).st_size) / 1024**2))
                # Reset DataFrame
                Patent_info = pd.DataFrame(
                    data=None,
                    columns=[
                        "id", "title", "assignee", "inventor/author",
                        "priority date", "filing/creation date",
                        "publication date", "grant date", "result link"
                    ])

        print("done")

        driver.close()
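
The storage logic above is an accumulate-then-flush pattern: rows pile up in memory and are merged into the HDF store every DaysTillStore iterations. A minimal standalone sketch (file name, key, and batch size are illustrative):

import pandas as pd
from pandas import HDFStore

batch = pd.DataFrame(columns=['id', 'title'])
for i in range(10):
    batch = batch.append(pd.DataFrame([[i, 'patent %d' % i]],
                                      columns=['id', 'title']))
    if i % 5 == 0:                       # flush every fifth iteration
        store = HDFStore('patents_demo.h5', complevel=4)
        try:
            merged = store['patents'].append(batch)
        except KeyError:                 # first flush: key does not exist yet
            merged = batch
        store['patents'] = merged
        store.close()
        batch = pd.DataFrame(columns=['id', 'title'])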
예제 #52
0
print "Loading ABC"
reader = StataReader(paths.abc)
abcd = reader.read(convert_dates=False, convert_categoricals=False)
abcd.id.fillna(9999, inplace=True)
abcd = abcd.set_index('id')
abcd.drop(abcd.loc[(abcd.RV == 1) & (abcd.R == 0)].index, inplace=True)

inc = abcd.filter(regex='^inc_labor[0-9][0-9]')
along = pd.wide_to_long(abcd[inc.columns].reset_index(), ['inc_labor'],
                        i='id',
                        j='age').sort_index()
along = along.interpolate(limit=1)
awide = along.unstack()
awide.columns = awide.columns.droplevel(0)
awide.columns = ['{}{}'.format('inc_labor', a) for a in awide.columns]
abcd[awide.columns] = awide

abcd = abcd.loc[:, unique_list(['R'] + cols.interpABC.keep)]
print abcd
print "Storing Datasets in HDF5 Format"

datasets = [('psid-labor', psid), ('nlsy-labor', nlsy),
            ('extrap-labor', extrap), ('abc-mini', abcd)]

store = HDFStore(os.path.join(paths.data, 'data.h5'))

for name, d in datasets:
    # write through the already-open store instead of reopening the file per dataset
    store[name] = d

store.close()
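
Reading the datasets back is symmetric; a short hedged sketch against the same path:

store = HDFStore(os.path.join(paths.data, 'data.h5'))
print store.keys()              # e.g. ['/psid-labor', '/nlsy-labor', ...]
psid_back = store['psid-labor']
store.close()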