def to_hdf5(self, fname, complevel=9, complib='bzip2'):
    """Persist every attribute listed in ``self._store_attributes`` to an HDF5 file.

    Each attribute becomes a separate store node keyed by its name.

    Parameters
    ----------
    fname : str
        Target file path; an existing file is overwritten (with a warning).
    complevel : int, default 9
        Compression level passed to HDFStore.
    complib : str, default 'bzip2'
        Compression library passed to HDFStore.
    """
    if os.path.exists(fname):
        logger.warning('Overwrite %s with current history', fname)
    history_store = HDFStore(fname, mode='w', complevel=complevel,
                             complib=complib)
    try:
        for attribute in self._store_attributes:
            history_store[attribute] = getattr(self, attribute)
    finally:
        # Close even if a write fails so the file handle is not leaked.
        history_store.close()
def test_append(self):
    """Appending to frame and panel nodes should reproduce the full object."""
    pth = '__test_append__.h5'
    # Create the store before the try block so `finally` never references
    # an unbound name if HDFStore(pth) itself raises; the former
    # `except: raise` was a no-op and is removed.
    store = HDFStore(pth)
    try:
        df = tm.makeTimeDataFrame()
        # implicit table creation on first append
        store.append('df1', df[:10])
        store.append('df1', df[10:])
        tm.assert_frame_equal(store['df1'], df)

        # explicit table via put, then append
        store.put('df2', df[:10], table=True)
        store.append('df2', df[10:])
        tm.assert_frame_equal(store['df2'], df)

        # panel append along the major axis
        wp = tm.makePanel()
        store.append('wp1', wp.ix[:, :10, :])
        store.append('wp1', wp.ix[:, 10:, :])
        tm.assert_panel_equal(store['wp1'], wp)
    finally:
        store.close()
        os.remove(pth)
def quantitative_analysis(df_name, df_seq_col, df_quant_col, func=lambda x: x):
    """Copy a quantitative column from dataset ``df_name`` into the summary table.

    For each (sequence, value) pair in the source dataset, the sequence is
    cleaned to uppercase letters, matched against the summary's
    'GlyGly Probabilities' column, and ``func(value)`` is written to a new
    summary column named after ``df_quant_col`` (spaces replaced by '_').
    Unmatched sequences are reported and skipped; NaN values are stored as '.'.

    :param df_name: store key of the source dataset
    :param df_seq_col: column holding the peptide sequences
    :param df_quant_col: column holding the quantitative values
    :param func: transformation applied to each value (identity by default)
    """
    print "Quantitative analysis of ", df_name
    store = HDFStore('_data_/ProteinDataStore.h5')
    summary = store['DataBases_Summary']
    df = store[df_name]
    # keep only the sequence and value columns
    df = df[[df_seq_col, df_quant_col]]
    renamed_col = '_'.join(df_quant_col.split(' '))
    print "Filling column ", renamed_col
    # initialize the new column with the '.' placeholder used for "no data"
    summary[renamed_col] = ['.'] * len(summary)
    print "Current summary shape: ", summary.shape
    # strip everything except uppercase letters so sequences match the
    # cleaned 'GlyGly Probabilities' values
    seq_list = map(lambda x: re.sub(r'[^A-Z]', '', x), df[df_seq_col].values)
    for i in zip(seq_list, df[df_quant_col].values):
        # first summary row whose cleaned sequence equals this one
        query = np.where(summary['GlyGly Probabilities'] == i[0])[0]
        if len(query) != 0:
            index = query[0]
        else:
            print "Omitted data: ", i
            continue
        if not np.isnan(i[1]):
            try:
                tmp = func(i[1])
                summary.loc[index, renamed_col] = tmp
            except Exception as e:
                # report the offending pair but keep processing the rest
                print i
                print e.message
        else:
            summary.loc[index, renamed_col] = '.'
    store['DataBases_Summary'] = summary
    store.close()
def update_exchanges():
    """Update data for exchanges such as NYSE.

    For every configured symbol, pulls Yahoo Finance daily history plus
    advances/declines data, then writes the merged frame to the exchange
    HDF5 database (after backing the database up).
    """
    # load date range and symbols from config
    start_date = Config(CFG).get('Exchange Data Start Date', 'default_start_date')
    end_date = datetime.datetime.now().strftime('%Y-%m-%d')
    # NOTE(review): this wraps the raw config value in a one-element list;
    # if the config returns a comma-separated string it is NOT split here --
    # confirm against the config format.
    symbols = [Config(CFG).get('Symbol List', 'list')]
    # back up the database before touching it
    filename = Config(CFG).get("DB Locations", 'exchange_data')
    backup = Config(CFG).get("DB Locations", 'exchange_data_backup')
    file_update_backup(filename, backup)
    operator = HDFStore(filename)
    try:
        for symbol in symbols:
            # Yahoo Finance daily history
            data = get_daily_history(symbol, start_date, end_date)
            # merge advances/declines on the date index, keeping all dates
            data = data.merge(update_unicorn(symbol), left_index=True,
                              right_index=True, how='outer')
            operator[symbol] = data
    finally:
        # close even if a download or merge fails so the handle is not leaked
        operator.close()
def hdf():
    """Fetch daily history for stock 000875 and write it to an HDF5 file."""
    df = ts.get_hist_data('000875')
    # df.to_hdf('c:/day/store.h5','table')
    store = HDFStore('c:/day/store.h5')
    try:
        store['000875'] = df
    finally:
        # close even if the write fails so the file handle is not leaked
        store.close()
def drop_with_low_probability(storename, df_name, loc_probability_colname, threshold=0.95): print 'Filtering by low probability in', df_name store = HDFStore(storename) df = store[df_name] if loc_probability_colname is not None: df = df[df[loc_probability_colname] >= threshold] store[df_name] = df store.close()
def test_legacy_table_read(self):
    # smoke-test: legacy table nodes must still be selectable
    legacy_path = os.path.join(curpath(), 'legacy_table.h5')
    store = HDFStore(legacy_path, 'r')
    for key in ('df1', 'df2', 'wp1'):
        store.select(key)
    store.close()
def reindex_summary(): store = HDFStore('_data_/ProteinDataStore.h5') data_summary = store['DataBases_Summary'] range_index = [x for x in np.arange(len(data_summary))] print "Reindexing..." data_summary = data_summary.set_index([range_index]) store['DataBases_Summary'] = data_summary store.close()
def test_legacy_read(self):
    # smoke-test: every node of the legacy (pre-table) file must be readable
    store = HDFStore(os.path.join(curpath(), 'legacy.h5'), 'r')
    for key in ('a', 'b', 'c', 'd'):
        store[key]
    store.close()
def _check_roundtrip(self, obj, comparator):
    """Write ``obj`` to a scratch store, read it back and compare."""
    store = HDFStore(self.scratchpath, 'w')
    try:
        store['obj'] = obj
        comparator(store['obj'], obj)
    finally:
        store.close()
        os.remove(self.scratchpath)
def colorful_dump_summary_to_excel(output_filename, range_label='L1:U36229'):
    """Dump the summary table to Excel with a five-band conditional color scale.

    Bands applied over ``range_label``:
        < -2      blue
        -2 .. -1  coral
        -1 ..  1  yellow
         1 ..  2  orange
        >  2      red

    :param output_filename: target workbook name, without the '.xlsx' suffix
    :param range_label: Excel range the conditional formats apply to
    """
    store = HDFStore('_data_/ProteinDataStore.h5')
    try:
        data_summary = store['DataBases_Summary']
    finally:
        # close even if the key is missing (the handle leaked before)
        store.close()

    writer = ExcelWriter(output_filename + '.xlsx', engine='xlsxwriter')
    data_summary.to_excel(writer, 'DataBases_Summary', index=True)
    workbook = writer.book
    worksheet = writer.sheets['DataBases_Summary']

    # using pallete http://www.colourlovers.com/palette/3687876/
    def _fmt(bg_color):
        # uniform black-text-on-color cell format
        return workbook.add_format({'bg_color': bg_color, 'font_color': '#000000'})

    # (criteria spec, color) pairs replace five near-identical calls
    bands = [
        ({'type': 'cell', 'criteria': '<', 'value': -2}, '#69D2E7'),                          # blue
        ({'type': 'cell', 'criteria': 'between', 'minimum': -2, 'maximum': -1}, '#A7DBD8'),   # coral
        ({'type': 'cell', 'criteria': 'between', 'minimum': -1, 'maximum': 1}, '#EAE319'),    # yellow
        ({'type': 'cell', 'criteria': 'between', 'minimum': 1, 'maximum': 2}, '#FA6900'),     # orange
        ({'type': 'cell', 'criteria': '>', 'value': 2}, '#E2434B'),                           # red
    ]
    for spec, color in bands:
        worksheet.conditional_format(range_label, dict(spec, format=_fmt(color)))

    writer.save()
def _check_roundtrip_table(self, obj, comparator):
    """Round-trip ``obj`` through a table-format node; compare to the sorted original."""
    store = HDFStore(self.scratchpath, 'w')
    try:
        store.put('obj', obj, table=True)
        comparator(store['obj'], _test_sort(obj))
    finally:
        store.close()
        os.remove(self.scratchpath)
def parse_one_and_save(input_file, output_store_name): sheet_name = 'All sites' skip_rows = [0] store = HDFStore(output_store_name) df = pd.ExcelFile(input_file).parse(sheetname=sheet_name, skiprows=skip_rows) name = (input_file.split('/')[1]).split('.')[0] print "Parsing ", name store[name] = df store.close()
def load(self, format='csv'):
    """Load ``self.frame`` from the save file in the given format.

    :param format: 'csv' or 'hdf'; anything else is silently ignored
    """
    savefile = self.__savefile()
    if format == "csv":
        # BUG FIX: from_csv returns a new DataFrame; the previous code
        # discarded the result, so self.frame was never updated.
        self.frame = self.frame.from_csv(savefile + ".csv")
    elif format == "hdf":
        store = HDFStore(savefile + ".hdf")
        try:
            self.frame = store['data']
        finally:
            store.close()
def load_exchange_data(symbol):
    """Return stored data for a specific exchange symbol.

    :param symbol: key under which the exchange frame was stored
    :return: the stored DataFrame
    :raises KeyError: if the symbol has never been stored
    """
    filename = Config(CFG).get("DB Locations", 'exchange_data')
    operator = HDFStore(filename)
    try:
        data = operator[symbol]
    finally:
        # close even when the key is missing so the handle is not leaked
        operator.close()
    return data
def parse_list_and_save(list_of_files, output_store_name): sheet_name = 'All sites' skip_rows = [0] store = HDFStore(output_store_name) for _file_ in list_of_files: df = pd.ExcelFile(_file_).parse(sheetname=sheet_name, skiprows=skip_rows) name = (_file_.split('/')[2]).split('.')[0] print "Parsing ", name store[name] = df store.close()
def test_store_index_name(self):
    # the frame's index name must survive a store round-trip
    frame = tm.makeDataFrame()
    frame.index.name = 'foo'
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = frame
        assert(store['frame'].index.name == 'foo')
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_store_series_name(self):
    # a Series' name must survive a store round-trip
    series = tm.makeDataFrame()['A']
    try:
        store = HDFStore(self.scratchpath)
        store['series'] = series
        assert(store['series'].name == 'A')
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_wide_table_dups(self):
    # putting the same panel twice with append=True creates duplicate rows;
    # the read-back must still equal the original panel
    panel = tm.makePanel()
    try:
        store = HDFStore(self.scratchpath)
        store._quiet = True  # suppress duplicate-row warnings
        store.put('panel', panel, table=True)
        store.put('panel', panel, table=True, append=True)
        tm.assert_panel_equal(store['panel'], panel)
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_legacy_table_write(self):
    # appending to legacy table nodes is unsupported and must raise
    store = HDFStore(os.path.join(curpath(), 'legacy_table.h5'), 'a')
    self.assertRaises(Exception, store.append, 'df1', tm.makeDataFrame())
    self.assertRaises(Exception, store.append, 'wp1', tm.makePanel())
    store.close()
def test_timezones(self):
    # a tz-aware DatetimeIndex must round-trip with its timezone intact
    rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
    frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = frame
        roundtripped = store['frame']
        self.assertEquals(rng.tz, roundtripped.index.tz)
        self.assert_(roundtripped.index.equals(rng))
    finally:
        store.close()
        os.remove(self.scratchpath)
def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
    """Write ``obj`` to a scratch store (optionally compressed), reload and compare."""
    options = {'complib': _default_compressor} if compression else {}
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store['obj'] = obj
        comparator(store['obj'], obj, **kwargs)
    finally:
        store.close()
        os.remove(self.scratchpath)
def hload(self, fname):
    """Load every node of an HDF5 file into this mapping.

    Keys are node names with the leading '/' stripped; a node named
    '_MISSING' is routed to ``self._missing`` instead of the mapping.
    """
    from pandas.io.pytables import HDFStore
    store = HDFStore(fname, mode='r')
    self.clear()
    # names actually loaded into the mapping (currently unused afterwards)
    read = []
    for k in store.keys():
        # NOTE(review): HDFStore.keys() normally yields '/name' (hence the
        # '^/' strip below), so '^_MISSING' may never match a '/_MISSING'
        # key written by hsave() -- confirm against the pandas version in use.
        if re.match('^_MISSING', k):
            v = store.get(k).to_dict().values()
            self._missing = v
            continue
        name = re.sub('^/', '', k)
        self[name] = store[k]
        read.append(name)
    store.close()
def _check_roundtrip_table(self, obj, comparator, compression=False):
    """Round-trip ``obj`` as a table node and compare against the sorted original."""
    options = {}
    if compression:
        options['complib'] = _default_compressor
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store.put('obj', obj, table=True)
        comparator(store['obj'], _test_sort(obj))
    finally:
        store.close()
        os.remove(self.scratchpath)
def hsave(self, fname):
    """Write every public item of this mapping to an HDF5 file.

    Dunder-prefixed keys are skipped; ndarrays are wrapped as Series;
    ``self._missing`` (if any) is stored under the '_MISSING' node.
    """
    from pandas.io.pytables import HDFStore
    LOGGER.debug('Saving HDF in %s\n%s', fname, self.report())
    store = HDFStore(fname, mode='w')
    for key, value in self.items():
        # dunder-prefixed entries are private bookkeeping, not data
        if re.match('^__', key):
            continue
        # HDFStore cannot hold raw ndarrays; wrap them as a Series
        if isinstance(value, np.ndarray):
            value = Series(value)
        LOGGER.debug('Saving HDF for %s', key)
        store.put(key, value)
    if self._missing:
        store['_MISSING'] = Series(self._missing)
    store.close()
def _check_roundtrip_table(self, obj, comparator, compression=False):
    """Store ``obj`` in table format, reload and compare to the sorted original."""
    options = {'complib': _default_compressor} if compression else {}
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store.put('obj', obj, table=True)
        retrieved = store['obj']
        comparator(retrieved, _test_sort(obj))
    finally:
        store.close()
        os.remove(self.scratchpath)
def load_historical_data(start=datetime(2010, 1, 1), end=datetime.today(), filename='stock_data.h5'): store = HDFStore(filename) with open('companylist.csv', 'rb') as csvfile: spamreader = csv.reader(csvfile, delimiter=',', quotechar='"') for row in spamreader: print row[0] try: stock_info = web.DataReader(row[0], "yahoo", start, end) store[row[0]] = stock_info except: print "Error on", row[0] store.close()
def hdfWrite(self, path, excode, symbol, indata, kind1, kind2, kind3):
    """Write futures data into an HDF5 store under a hierarchical key.

    kind1: 'Rawdata', 'Stitch' or 'Indicator'
    kind2: series id ('00'/'01') for Stitch, or the indicator name
    kind3: bar frequency ('1d','60m','30m','15m','5m','1m'), or the
           indicator parameter string for Indicator

    Usage:
      - raw data per frequency: kind1='Rawdata',   kind2=None, kind3='1d'
      - stitch rule:            kind1='Stitch',    kind2='00', kind3=None
      - stitch data:            kind1='Stitch',    kind2='00', kind3='1d'
      - indicator:              kind1='Indicator', kind2=name, kind3=params
    """
    store = HDFStore(path, mode='a')
    if kind1 == EXT_Rawdata:
        key = kind1 + '/' + excode + '/' + symbol + '/' + kind3
    elif kind1 == EXT_Stitch:
        # rule nodes live under .../Rule/<series>; bar data under .../Period/<freq>/<series>
        key = kind1 + '/' + excode + '/' + symbol + '/' + EXT_Rule + '/' + kind2 if kind3 == None else kind1 + '/' + excode + '/' + symbol + '/' + EXT_Period + '/' + kind3 + '/' + kind2
    elif kind1 == EXT_Indicator:
        # NOTE(review): no '/' between symbol and kind2 here, unlike the
        # other branches -- confirm this key shape is intended
        key = kind1 + '/' + excode + '/' + symbol + kind2
    else:
        print("kind not supported")
        return
    if kind1 == EXT_Indicator:
        # h5py handle used to read/write the 'Params' attribute, which
        # HDFStore itself does not expose
        f = h5py.File(path, 'a')
        try:
            store[key]
        except KeyError:
            # node does not exist yet: create it and tag its params
            store[key] = indata
            f[key].attrs['Params'] = kind3
        else:
            if f[key].attrs['Params'] == kind3:
                # params match: append only rows whose index is not stored yet
                adddata = indata[~indata.index.isin(store[key].index)]
                store.append(key, adddata)
            else:
                # params differ: overwrite the node and its Params attribute
                store[key] = indata
                f[key].attrs['Params'] = kind3
        f.close()
        store.close()
    else:
        try:
            store[key]
        except KeyError:
            # node does not exist yet: create it
            store[key] = indata
        else:
            # append only rows whose index is not stored yet
            adddata = indata[~indata.index.isin(store[key].index)]
            if kind2 in [EXT_Series_00, EXT_Series_01]:
                # re-base the adjustment factor so the appended chunk chains
                # continuously onto the last stored factor value
                adddata[EXT_Out_AdjFactor] = adddata[
                    EXT_Out_AdjFactor] * store[key][EXT_Out_AdjFactor].iloc[
                        -1] / adddata[EXT_Out_AdjFactor].iloc[0]
            store.append(key, adddata)
        store.close()
def read_archive(hdf_path, items=('train_x', 'valid_x', 'test_x',
                                  'train_y', 'valid_y', 'test_y')):
    '''Convenience function used for retrieving data within a hdf archive.

    Args:
        hdf_path (str): full path of the file the data is stored in
        items (sequence of str, optional): keys to be retrieved
            default: ('train_x', 'valid_x', 'test_x', 'train_y', 'valid_y', 'test_y')

    Returns:
        list: the stored objects, in the order given by ``items``.
    '''
    # default changed from a mutable list to a tuple (read-only use, same values)
    hdf = HDFStore(hdf_path)
    try:
        # materialize eagerly: a lazy map object would only be consumed
        # after the store is closed, failing under Python 3
        output = [hdf[item] for item in items]
    finally:
        # close even when a key is missing so the handle is not leaked
        hdf.close()
    return output
def make_summary(newcols):
    """
    Build the master 'DataBases_Summary' table and persist it to the HDF store.

    Cleans the GlyGly sequences of the base dataset, aligns them against
    SwissProt Human and Rodents via blastp, and assembles the results into
    a new summary DataFrame.

    :param newcols: column names in the main summary table
    :return: none
    """
    print "Making summary..."
    # open store and read base dataframe
    store = HDFStore('_data_/ProteinDataStore.h5')
    df1 = store['Mol_Cell_Proteomics_2011_Epub_2011_September1Supp2']
    # clean sequences
    LEN = len(df1)
    positions = [0] * LEN
    real_glygly = [0] * LEN
    clean_glygly = [0] * LEN
    for i in np.arange(LEN):
        positions[i] = df1['Position'].values[i]
        # project-level cleaning helper, then strip to uppercase letters only
        real_glygly[i] = clear_sequence(df1['GlyGly (K) Probabilities'].values[i])
        clean_glygly[i] = re.sub(r'[^A-Z]', '', real_glygly[i])
    # align with SwissProt Human and Rodents using blastp
    blastpID_HUMAN, blastpID_RODENTS = fetch_indentity_from_local_batch(clean_glygly)
    # free the base frame; it is no longer needed
    del df1
    # sanity check: all intermediate lists must have the same length
    print "Length test", len(positions) == len(real_glygly) == len(clean_glygly) == len(blastpID_HUMAN) == len(
        blastpID_RODENTS)
    # convert to pandas series
    clean_glygly = pd.Series(clean_glygly)
    blastpID_HUMAN = pd.Series(blastpID_HUMAN)
    blastpID_RODENTS = pd.Series(blastpID_RODENTS)
    # Create empty dataframe
    data_summary = pd.DataFrame(columns=newcols)
    # Combine everything required in dataframe
    data_summary['Position'] = positions
    data_summary['GlyGly (K) Probabilities'] = real_glygly
    data_summary['GlyGly Probabilities'] = clean_glygly
    data_summary['SP_ID_BLASTP_HUMAN'] = blastpID_HUMAN
    data_summary['SP_ID_BLASTP_RODENTS'] = blastpID_RODENTS
    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
def _check_double_roundtrip(self, obj, comparator, compression=False,
                            **kwargs):
    """Round-trip ``obj`` twice through the same node; both reads must compare equal."""
    options = {'complib': _default_compressor} if compression else {}
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store['obj'] = obj
        first = store['obj']
        comparator(first, obj, **kwargs)
        # write the retrieved copy back and read it once more
        store['obj'] = first
        comparator(store['obj'], obj, **kwargs)
    finally:
        store.close()
        os.remove(self.scratchpath)
def hdfRead(self, path, excode, symbol, kind1, kind2, kind3,
            startdate=EXT_Start, enddate=EXT_End, is_stitch=True):
    """Read futures data from an HDF5 store.

    kind1: 'Rawdata', 'Stitch' or 'Indicator'
    kind2: series id ('00'/'01') for Stitch, or the indicator name
    kind3: bar frequency ('1d','60m','30m','15m','5m','1m'), or None

    Usage:
      - raw data per frequency: kind1='Rawdata',   kind2=None, kind3='1d'
      - stitch rule:            kind1='Stitch',    kind2='00', kind3=None
      - stitch data:            kind1='Stitch',    kind2='00', kind3='1d'
      - indicator:              kind1='Indicator', kind2=name, kind3=None

    Returns the DataFrame sliced to [startdate, enddate]; for indicators
    returns ``(data, params)``; returns None for an unsupported kind1.
    When ``is_stitch`` is true, stitched OHLC bars are multiplied by the
    adjustment factor.
    """
    store = HDFStore(path, mode='r')
    if kind1 == EXT_Rawdata:
        key = '/'.join([kind1, excode, symbol, kind3])
    elif kind1 == EXT_Stitch:
        # rule nodes live under .../Rule/<series>; bar data under .../Period/<freq>/<series>
        key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2
                        ]) if kind3 == None else '/'.join([
                            kind1, excode, symbol, EXT_Period, kind3, kind2
                        ])
    elif kind1 == EXT_Indicator:
        key = '/'.join([kind1, excode, symbol, kind2])
    else:
        print("kind not supported")
        store.close()  # BUG FIX: the read handle leaked on this early return
        return
    # slice rows whose first index level falls inside [startdate, enddate]
    data = store[key].ix[(
        (store[key].index.get_level_values(0) >= pd.to_datetime(startdate)) &
        (store[key].index.get_level_values(0) <= pd.to_datetime(enddate))
    ), :]
    if kind1 == EXT_Stitch and is_stitch == True and kind3 != None:
        # apply the adjustment factor to the OHLC columns
        data[EXT_Bar_Open] = data[EXT_AdjFactor] * data[EXT_Bar_Open]
        data[EXT_Bar_High] = data[EXT_AdjFactor] * data[EXT_Bar_High]
        data[EXT_Bar_Low] = data[EXT_AdjFactor] * data[EXT_Bar_Low]
        data[EXT_Bar_Close] = data[EXT_AdjFactor] * data[EXT_Bar_Close]
    store.close()
    if kind1 == EXT_Indicator:
        # indicator params are stored as an HDF5 attribute, read via h5py
        f = h5py.File(path, 'r')
        params = f[key].attrs['Params']
        f.close()
        return data, params
    return data
def test_big_table(self): raise nose.SkipTest('no big table') # create and write a big table wp = Panel(np.random.randn(20, 1000, 1000), items= [ 'Item%s' % i for i in xrange(20) ], major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%s' % i for i in xrange(1000) ]) wp.ix[:,100:200,300:400] = np.nan try: store = HDFStore(self.scratchpath) store._debug_memory = True store.append('wp',wp) recons = store.select('wp') finally: store.close() os.remove(self.scratchpath)
def test_append(self):
    """Appends to frames, panels and Panel4D nodes must reproduce the originals."""
    pth = '__test_append__.h5'
    # Create the store before the try block so `finally` never references
    # an unbound name if HDFStore(pth) itself raises; the former
    # `except: raise` was a no-op and is removed.
    store = HDFStore(pth)
    try:
        df = tm.makeTimeDataFrame()
        store.append('df1', df[:10])
        store.append('df1', df[10:])
        tm.assert_frame_equal(store['df1'], df)

        store.put('df2', df[:10], table=True)
        store.append('df2', df[10:])
        tm.assert_frame_equal(store['df2'], df)

        store.append('/df3', df[:10])
        store.append('/df3', df[10:])
        tm.assert_frame_equal(store['df3'], df)

        # this is allowed by almost always don't want to do it
        import warnings
        import tables
        warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
        store.append('/df3 foo', df[:10])
        store.append('/df3 foo', df[10:])
        tm.assert_frame_equal(store['df3 foo'], df)
        warnings.filterwarnings('always', category=tables.NaturalNameWarning)

        # panel
        wp = tm.makePanel()
        store.append('wp1', wp.ix[:, :10, :])
        store.append('wp1', wp.ix[:, 10:, :])
        tm.assert_panel_equal(store['wp1'], wp)

        # ndim
        p4d = tm.makePanel4D()
        store.append('p4d', p4d.ix[:, :, :10, :])
        store.append('p4d', p4d.ix[:, :, 10:, :])
        tm.assert_panel4d_equal(store['p4d'], p4d)
    finally:
        store.close()
        os.remove(pth)
def test_legacy_table_read(self):
    # smoke-test: legacy table nodes must still be selectable
    store = HDFStore(os.path.join(curpath(), 'legacy_table.h5'), 'r')
    for key in ('df1', 'df2', 'wp1'):
        store.select(key)

    # force the frame
    store.select('df2', typ='legacy_frame')

    # old version (this still throws an exception though)
    import warnings
    warnings.filterwarnings('ignore', category=IncompatibilityWarning)
    self.assertRaises(Exception, store.select, 'wp1',
                      Term('minor_axis', '=', 'B'))
    warnings.filterwarnings('always', category=IncompatibilityWarning)

    store.close()
def test_big_table(self): raise nose.SkipTest('no big table') # create and write a big table wp = Panel(np.random.randn(20, 1000, 1000), items=['Item%s' % i for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=['E%s' % i for i in xrange(1000)]) wp.ix[:, 100:200, 300:400] = np.nan try: store = HDFStore(self.scratchpath) store._debug_memory = True store.append('wp', wp) recons = store.select('wp') finally: store.close() os.remove(self.scratchpath)
def analyze_existence(storename_to_append, gly_gly_seq_colname):
    """Mark, in the summary table, which GlyGly sequences occur in another dataset.

    Adds a 0/1 column named after ``storename_to_append`` to
    'DataBases_Summary' and writes the summary back to the store.

    :param storename_to_append: store key of the dataset to check against
    :param gly_gly_seq_colname: sequence column in that dataset
    """
    print "Analyzing occurence in ", storename_to_append
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    tmp_store_sequences = store[storename_to_append][gly_gly_seq_colname].values
    # normalize every sequence with the shared cleaning helper
    tmp_store_sequences = map(clear_sequence, tmp_store_sequences)
    # Make binary vector which represents existence
    # of the sequence in storename_to_append dataset
    existense_index = data_summary['GlyGly (K) Probabilities'].isin(tmp_store_sequences).values
    existense_index = np.asarray(existense_index, dtype=int)
    # Create new column in summary table
    data_summary[storename_to_append] = existense_index
    # number of matches, printed for eyeballing the run
    print np.sum(data_summary[storename_to_append])
    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
def test_append(self):
    """Appends to frame and panel nodes must reproduce the full objects."""
    pth = '__test_append__.h5'
    # Create the store before the try block so `finally` never references
    # an unbound name if HDFStore(pth) itself raises; the former
    # `except: raise` was a no-op and is removed.
    store = HDFStore(pth)
    try:
        df = tm.makeTimeDataFrame()
        store.append('df1', df[:10])
        store.append('df1', df[10:])
        tm.assert_frame_equal(store['df1'], df)

        store.put('df2', df[:10], table=True)
        store.append('df2', df[10:])
        tm.assert_frame_equal(store['df2'], df)

        store.append('/df3', df[:10])
        store.append('/df3', df[10:])
        tm.assert_frame_equal(store['df3'], df)

        # this is allowed by almost always don't want to do it
        import warnings
        import tables
        warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
        store.append('/df3 foo', df[:10])
        store.append('/df3 foo', df[10:])
        tm.assert_frame_equal(store['df3 foo'], df)
        warnings.filterwarnings('always', category=tables.NaturalNameWarning)

        wp = tm.makePanel()
        store.append('wp1', wp.ix[:, :10, :])
        store.append('wp1', wp.ix[:, 10:, :])
        tm.assert_panel_equal(store['wp1'], wp)
    finally:
        store.close()
        os.remove(pth)
def test_store_hierarchical(self):
    # MultiIndexed frames must round-trip, including index level names
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['foo', 'bar'])
    frame = DataFrame(np.random.randn(10, 3), index=index,
                      columns=['A', 'B', 'C'])

    for obj, check in ((frame, tm.assert_frame_equal),
                       (frame.T, tm.assert_frame_equal),
                       (frame['A'], tm.assert_series_equal)):
        self._check_roundtrip(obj, check)

    # check that the names are stored
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = frame
        recons = store['frame']
        assert(recons.index.names == ['foo', 'bar'])
    finally:
        store.close()
        os.remove(self.scratchpath)
def hdfRead(self, path, excode, symbol, kind1, kind2, kind3,
            startdate=EXT_Start, enddate=EXT_End):
    """Read data from an HDF5 store.

    kind1: 'Rawdata', 'Stitch' or 'Indicator'
    kind2: series id ('00'/'01') for Stitch, or the indicator name
    kind3: bar frequency ('1d','60m','30m','15m','5m','1m'), or None

    Usage:
      - raw data per frequency: kind1='Rawdata',   kind2=None, kind3='1d'
      - stitch rule:            kind1='Stitch',    kind2='00', kind3=None
      - stitch data:            kind1='Stitch',    kind2='00', kind3='1d'
      - indicator:              kind1='Indicator', kind2=name, kind3=None

    Returns the DataFrame sliced to [startdate, enddate]; for indicators
    returns ``(data, params)``; returns None for an unsupported kind1.
    """
    store = HDFStore(path, mode='r')
    if kind1 == EXT_Rawdata:
        key = kind1 + '/' + excode + '/' + symbol + '/' + kind3
    elif kind1 == EXT_Stitch:
        # rule nodes live under .../Rule/<series>; bar data under .../Period/<freq>/<series>
        key = kind1 + '/' + excode + '/' + symbol + '/' + EXT_Rule + '/' + kind2 if kind3 == None else kind1 + '/' + excode + '/' + symbol + '/' + EXT_Period + '/' + kind3 + '/' + kind2
    elif kind1 == EXT_Indicator:
        # NOTE(review): no '/' between symbol and kind2 here -- confirm intended
        key = kind1 + '/' + excode + '/' + symbol + kind2
    else:
        print("kind not supported")
        store.close()  # BUG FIX: the read handle leaked on this early return
        return
    # slice rows whose first index level falls inside [startdate, enddate]
    data = store[key].ix[(
        (store[key].index.get_level_values(0) >= pd.to_datetime(startdate)) &
        (store[key].index.get_level_values(0) <= pd.to_datetime(enddate))
    ), :]
    store.close()
    if kind1 == EXT_Indicator:
        # indicator params are stored as an HDF5 attribute, read via h5py
        f = h5py.File(path, 'r')
        params = f[key].attrs['Params']
        f.close()
        return data, params
    return data
class TestHDFStore(unittest.TestCase):
    """Integration tests for pandas.HDFStore against a real PyTables file.

    setUp opens a fresh store at `path` for every test and tearDown closes
    and deletes it; tests needing an independently managed file use
    `scratchpath`.

    NOTE(review): this suite targets a very old pandas API (Panel, .ix,
    the table=/append= keywords on put(), Term objects, nose skips) --
    confirm the pinned pandas/PyTables versions before changing behavior.
    """
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_factory_fun(self):
        # get_store is a context-manager factory; the file must close even
        # when the body raises.
        try:
            with get_store(self.scratchpath) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass
        with get_store(self.scratchpath) as tbl:
            tbl['a'] = tm.makeDataFrame()
        with get_store(self.scratchpath) as tbl:
            self.assertEquals(len(tbl), 1)
            self.assertEquals(type(tbl['a']), DataFrame)
        os.remove(self.scratchpath)

    def test_keys(self):
        # keys() reports absolute node paths (leading slash)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.assertEquals(len(self.store), 5)
        self.assert_(set(self.store.keys()) ==
                     set(['/a', '/b', '/c', '/d', '/foo/bar']))

    def test_repr(self):
        # repr/str must not raise on an empty or populated store
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.store['foo/bar'] = tm.makePanel()
        self.store.append('e', tm.makePanel())
        repr(self.store)
        str(self.store)

    def test_contains(self):
        # __contains__ accepts both bare keys and absolute paths, but not
        # partial path components
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        self.store['foo/bar'] = tm.makeDataFrame()
        self.assert_('a' in self.store)
        self.assert_('b' in self.store)
        self.assert_('c' not in self.store)
        self.assert_('foo/bar' in self.store)
        self.assert_('/foo/bar' in self.store)
        self.assert_('/foo/b' not in self.store)
        self.assert_('bar' not in self.store)

    def test_reopen_handle(self):
        # re-opening in 'w' mode truncates the file
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)
        left = self.store.get('/a')
        right = self.store['/a']
        tm.assert_series_equal(left, right)
        self.assertRaises(KeyError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store['foo/bar/bah'] = df[:10]
        self.store['foo'] = df[:10]
        self.store['/foo'] = df[:10]
        self.store.put('c', df[:10], table=True)
        # not OK, not a table
        self.assertRaises(ValueError, self.store.put, 'b', df[10:],
                          append=True)
        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError, self.store.put, 'f', df[10:],
                          append=True)
        # OK
        self.store.put('c', df[10:], append=True)
        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)
        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()
        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='blosc')
        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        # uses its own file rather than self.store so append semantics are
        # tested from a clean slate
        pth = '__test_append__.h5'
        try:
            store = HDFStore(pth)
            df = tm.makeTimeDataFrame()
            store.append('df1', df[:10])
            store.append('df1', df[10:])
            tm.assert_frame_equal(store['df1'], df)
            store.put('df2', df[:10], table=True)
            store.append('df2', df[10:])
            tm.assert_frame_equal(store['df2'], df)
            store.append('/df3', df[:10])
            store.append('/df3', df[10:])
            tm.assert_frame_equal(store['df3'], df)
            # this is allowed by almost always
            # don't want to do it
            import warnings
            import tables
            warnings.filterwarnings('ignore',
                                    category=tables.NaturalNameWarning)
            store.append('/df3 foo', df[:10])
            store.append('/df3 foo', df[10:])
            tm.assert_frame_equal(store['df3 foo'], df)
            warnings.filterwarnings('always',
                                    category=tables.NaturalNameWarning)
            wp = tm.makePanel()
            store.append('wp1', wp.ix[:, :10, :])
            store.append('wp1', wp.ix[:, 10:, :])
            tm.assert_panel_equal(store['wp1'], wp)
        except:
            raise
        finally:
            store.close()
            os.remove(pth)

    def test_append_with_strings(self):
        wp = tm.makePanel()
        wp2 = wp.rename_axis(dict([(x, "%s_extra" % x)
                                   for x in wp.minor_axis]), axis=2)
        # min_itemsize reserves string column width for later appends
        self.store.append('s1', wp, min_itemsize=20)
        self.store.append('s1', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s1'], expected)
        # test dict format
        self.store.append('s2', wp, min_itemsize={'column': 20})
        self.store.append('s2', wp2)
        expected = concat([wp, wp2], axis=2)
        expected = expected.reindex(minor_axis=sorted(expected.minor_axis))
        tm.assert_panel_equal(self.store['s2'], expected)
        # apply the wrong field (similar to #1)
        self.store.append('s3', wp, min_itemsize={'index': 20})
        self.assertRaises(Exception, self.store.append, 's3')
        # test truncation of bigger strings
        self.store.append('s4', wp)
        self.assertRaises(Exception, self.store.append, 's4', wp2)

    def test_create_table_index(self):
        wp = tm.makePanel()
        self.store.append('p5', wp)
        self.store.create_table_index('p5')
        assert(self.store.handle.root.p5.table.cols.index.is_indexed == True)
        assert(self.store.handle.root.p5.table.cols.column.is_indexed == False)
        df = tm.makeTimeDataFrame()
        self.store.append('f', df[:10])
        self.store.append('f', df[10:])
        self.store.create_table_index('f')
        # create twice
        self.store.create_table_index('f')
        # try to index a non-table
        self.store.put('f2', df)
        self.assertRaises(Exception, self.store.create_table_index, 'f2')
        # try to change the version supports flag
        from pandas.io import pytables
        pytables._table_supports_index = False
        self.assertRaises(Exception, self.store.create_table_index, 'f')

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]
        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2,
                          append=True)

    def test_table_index_incompatible_dtypes(self):
        df1 = DataFrame({'a': [1, 2, 3]})
        df2 = DataFrame({'a': [4, 5, 6]},
                        index=date_range('1/1/2000', periods=3))
        self.store.put('frame', df1, table=True)
        self.assertRaises(Exception, self.store.put, 'frame', df2,
                          table=True, append=True)

    def test_table_values_dtypes_roundtrip(self):
        df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
        self.store.append('df1', df1)
        assert df1.dtypes == self.store['df1'].dtypes
        df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
        self.store.append('df2', df2)
        assert df2.dtypes == self.store['df2'].dtypes
        # incompatible dtype
        self.assertRaises(Exception, self.store.append, 'df2', df1)

    def test_table_mixed_dtypes(self):
        # frame
        def _make_one_df():
            # frame mixing object, bool and int columns
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['bool3'] = True
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one_df()
        self.store.append('df1_mixed', df1)
        tm.assert_frame_equal(self.store.select('df1_mixed'), df1)

        # panel
        def _make_one_panel():
            wp = tm.makePanel()
            wp['obj1'] = 'foo'
            wp['obj2'] = 'bar'
            wp['bool1'] = wp['ItemA'] > 0
            wp['bool2'] = wp['ItemB'] > 0
            wp['int1'] = 1
            wp['int2'] = 2
            return wp.consolidate()

        p1 = _make_one_panel()
        self.store.append('p1_mixed', p1)
        tm.assert_panel_equal(self.store.select('p1_mixed'), p1)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])
        self.store.remove('b')
        self.assertEquals(len(self.store), 0)
        # pathing
        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('foo')
        self.store.remove('b/foo')
        self.assertEquals(len(self.store), 1)
        self.store['a'] = ts
        self.store['b/foo'] = df
        self.store.remove('b')
        self.assertEquals(len(self.store), 1)
        # __delitem__
        self.store['a'] = ts
        self.store['b'] = df
        del self.store['a']
        del self.store['b']
        self.assertEquals(len(self.store), 0)

    def test_remove_where(self):
        # non-existance
        crit1 = Term('index', '>', 'foo')
        self.store.remove('a', where=[crit1])
        # try to remove non-table (with crit)
        # non-table ok (where = None)
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        self.store.remove('wp', [('column', ['A', 'D'])])
        rs = self.store.select('wp')
        expected = wp.reindex(minor_axis=['B', 'C'])
        tm.assert_panel_equal(rs, expected)
        # empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.store.remove('wp', [])
        # non - empty where
        self.store.remove('wp')
        self.store.put('wp', wp, table=True)
        self.assertRaises(Exception, self.store.remove, 'wp', ['foo'])
        # selectin non-table with a where
        self.store.put('wp2', wp, table=False)
        self.assertRaises(Exception, self.store.remove,
                          'wp2', [('column', ['A', 'D'])])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]
        crit1 = Term('index', '>', date)
        crit2 = Term('column', ['A', 'D'])
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)
        # test non-consecutive row removal
        wp = tm.makePanel()
        self.store.put('wp2', wp, table=True)
        date1 = wp.major_axis[1:3]
        date2 = wp.major_axis[5]
        date3 = [wp.major_axis[7], wp.major_axis[9]]
        crit1 = Term('index', date1)
        crit2 = Term('index', date2)
        crit3 = Term('index', date3)
        self.store.remove('wp2', where=[crit1])
        self.store.remove('wp2', where=[crit2])
        self.store.remove('wp2', where=[crit3])
        result = self.store['wp2']
        # expected panel: original major axis minus every removed date
        ma = list(wp.major_axis)
        for d in date1:
            ma.remove(d)
        ma.remove(date2)
        for d in date3:
            ma.remove(d)
        expected = wp.reindex(major=ma)
        tm.assert_panel_equal(result, expected)

    def test_terms(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        # some invalid terms
        terms = [
            ['minor', ['A', 'B']],
            ['index', ['20121114']],
            ['index', ['20121114', '20121114']],
        ]
        for t in terms:
            self.assertRaises(Exception, self.store.select, 'wp', t)
        self.assertRaises(Exception, Term.__init__)
        self.assertRaises(Exception, Term.__init__, 'blah')
        self.assertRaises(Exception, Term.__init__, 'index')
        self.assertRaises(Exception, Term.__init__, 'index', '==')
        self.assertRaises(Exception, Term.__init__, 'index', '>', 5)
        result = self.store.select(
            'wp', [Term('major_axis<20000108'),
                   Term('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(after='20000108').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)
        # valid terms
        terms = [
            dict(field='index', op='>', value='20121114'),
            ('index', '20121114'),
            ('index', '>', '20121114'),
            (('index', ['20121114', '20121114']), ),
            ('index', datetime(2012, 11, 14)),
            'index>20121114',
            'major>20121114',
            'major_axis>20121114',
            (('minor', ['A', 'B']), ),
            (('minor_axis', ['A', 'B']), ),
            ((('minor_axis', ['A', 'B']), ), ),
            (('column', ['A', 'B']), ),
        ]
        for t in terms:
            self.store.select('wp', t)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)
        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)
        ts2 = Series(ts.index, Index(ts.index, dtype=object))
        self._check_roundtrip(ts2, tm.assert_series_equal)
        ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object),
                                      dtype=object))
        self._check_roundtrip(ts3, tm.assert_series_equal)

    def test_sparse_series(self):
        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)
        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)
        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        s = tm.makeDataFrame()
        s.ix[3:5, 1:3] = np.nan
        s.ix[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_double_roundtrip(ss, tm.assert_frame_equal,
                                     check_frame_type=True)
        ss2 = s.to_sparse(kind='integer')
        self._check_double_roundtrip(ss2, tm.assert_frame_equal,
                                     check_frame_type=True)
        ss3 = s.to_sparse(fill_value=0)
        self._check_double_roundtrip(ss3, tm.assert_frame_equal,
                                     check_frame_type=True)

    def test_sparse_panel(self):
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
        sp = p.to_sparse()
        self._check_double_roundtrip(sp, tm.assert_panel_equal,
                                     check_panel_type=True)
        sp2 = p.to_sparse(kind='integer')
        self._check_double_roundtrip(sp2, tm.assert_panel_equal,
                                     check_panel_type=True)
        sp3 = p.to_sparse(fill_value=0)
        self._check_double_roundtrip(sp3, tm.assert_panel_equal,
                                     check_panel_type=True)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0., 1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_index_types(self):
        # mixed-type indexes must survive a round trip
        values = np.random.randn(2)
        func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)
        ser = Series(values, [0, 'y'])
        self._check_roundtrip(ser, func)
        ser = Series(values, [datetime.today(), 0])
        self._check_roundtrip(ser, func)
        ser = Series(values, ['y', 0])
        self._check_roundtrip(ser, func)
        from datetime import date
        ser = Series(values, [date.today(), 'a'])
        self._check_roundtrip(ser, func)
        ser = Series(values, [1.23, 'b'])
        self._check_roundtrip(ser, func)
        ser = Series(values, [1, 1.53])
        self._check_roundtrip(ser, func)
        ser = Series(values, [1, 5])
        self._check_roundtrip(ser, func)
        ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)])
        self._check_roundtrip(ser, func)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest
        dr = bdate_range('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failer on some windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()
        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan
        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)
        self._check_roundtrip_table(df, tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal, compression=True)
        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True)
        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())
        # empty
        self._check_roundtrip(df[:0], tm.assert_frame_equal)

    def test_empty_series_frame(self):
        s0 = Series()
        s1 = Series(name='myseries')
        df0 = DataFrame()
        df1 = DataFrame(index=['a', 'b', 'c'])
        df2 = DataFrame(columns=['d', 'e', 'f'])
        self._check_roundtrip(s0, tm.assert_series_equal)
        self._check_roundtrip(s1, tm.assert_series_equal)
        self._check_roundtrip(df0, tm.assert_frame_equal)
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_timezones(self):
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_fixed_offset_tz(self):
        rng = date_range('1/1/2000 00:00:00-07:00',
                         '1/30/2000 00:00:00-07:00')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])
        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)
        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert(recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert(recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']
        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert(recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            # frame with object, bool and int columns alongside floats
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)
        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)
        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)
        # try with compression
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal, compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)
        # empty
        # self._check_roundtrip(wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts
        tm.assert_series_equal(self.store['a'], ts)

    def test_select(self):
        wp = tm.makePanel()
        # put/select ok
        self.store.put('wp', wp, table=True)
        self.store.select('wp')
        # non-table ok (where = None)
        self.store.put('wp2', wp, table=False)
        self.store.select('wp2')
        # selectin non-table with a where
        self.assertRaises(Exception, self.store.select,
                          'wp2', ('column', ['A', 'D']))

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]
        crit1 = ('index', '>=', date)
        crit2 = ('column', '=', ['A', 'D'])
        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)
        result = self.store.select(
            'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])])
        expected = wp.truncate(before='20000124').reindex(minor=['A', 'B'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]
        crit1 = ('index', '>=', date)
        crit2 = ('column', ['A', 'D'])
        crit3 = ('column', 'A')
        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)
        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)
        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select,
                          'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)
        crit = Term('column', df.columns[:75])
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    # --- round-trip helpers used by most tests above ---

    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        # write obj to a scratch store, read it back, compare
        options = {}
        if compression:
            options['complib'] = _default_compressor
        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_double_roundtrip(self, obj, comparator, compression=False,
                                **kwargs):
        # write, read, re-write the read copy, read again -- catches
        # serialization that is lossy only on the second pass
        options = {}
        if compression:
            options['complib'] = _default_compressor
        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        # same as _check_roundtrip but stores in table format
        options = {}
        if compression:
            options['complib'] = _default_compressor
        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            # sorted_obj = _test_sort(obj)
            comparator(retrieved, obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        # NOTE(review): these legacy tests need fixture files (legacy.h5,
        # legacy_table.h5) next to the test module -- confirm they ship.
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_legacy_table_read(self):
        # legacy table types
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
        store.select('df1')
        store.select('df2')
        store.select('wp1')
        store.close()

    def test_legacy_table_write(self):
        # legacy table types
        pth = curpath()
        df = tm.makeDataFrame()
        wp = tm.makePanel()
        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')
        self.assertRaises(Exception, store.append, 'df1', df)
        self.assertRaises(Exception, store.append, 'wp1', wp)
        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)

    def test_tseries_indices_series(self):
        idx = tm.makeDateIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']
        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)
        idx = tm.makePeriodIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']
        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

    def test_tseries_indices_frame(self):
        idx = tm.makeDateIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        self.store['a'] = df
        result = self.store['a']
        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)
        idx = tm.makePeriodIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), idx)
        self.store['a'] = df
        result = self.store['a']
        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

    def test_unicode_index(self):
        unicode_values = [u'\u03c3', u'\u03c3\u03c3']
        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_store_datetime_mixed(self):
        df = DataFrame({
            'a': [1, 2, 3],
            'b': [1., 2., 3.],
            'c': ['a', 'b', 'c']
        })
        ts = tm.makeTimeSeries()
        df['d'] = ts.index[:3]
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_cant_write_multiindex_table(self):
        # for now, #1848
        df = DataFrame(
            np.random.randn(10, 4),
            index=[np.arange(5).repeat(2), np.tile(np.arange(2), 5)])
        self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
def del_col_from_summary(colname):
    """Drop column `colname` from the persisted DataBases_Summary table.

    Reads the summary frame from the fixed project store, removes the
    column, and writes the frame back under the same key.

    Raises KeyError (from pandas) if the column does not exist.
    """
    store = HDFStore('_data_/ProteinDataStore.h5')
    # try/finally guarantees the HDF5 handle is released even if the drop
    # or the write raises (the original leaked the handle on error).
    try:
        data_summary = store['DataBases_Summary']
        # keyword form instead of positional axis (`.drop(colname, 1)`),
        # which was removed in pandas 2.0
        data_summary = data_summary.drop(columns=colname)
        store['DataBases_Summary'] = data_summary
    finally:
        store.close()
def filter_columns(storename, dataframe, collist):
    """Restrict the frame stored under key `dataframe` to columns `collist`.

    Parameters:
        storename: path of the HDF5 store to open
        dataframe: store key of the frame to filter
        collist:   list of column labels to keep (order preserved)

    The filtered frame is written back under the same key.
    """
    store = HDFStore(storename)
    # try/finally so the store handle is closed even when the key or one of
    # the requested columns is missing (the original leaked it on error).
    try:
        df = store[dataframe]
        store[dataframe] = df[collist]
    finally:
        store.close()
class Importer:
    """Main Class for importing mat files.

    Data will be stored in the samples and targets of the ds dictionary
    attribute and can be loaded or saved from and to a compressed hdf5
    file.

    Attributes:
        dataroot: directory that contains the mat files
        ds: trivial dictionary containing:
            samples: a pandas dataframe with a MultiIndex composed of the
                session data
            targets: a pandas dataframe with the targets/labels
    """

    def __init__(self, dataroot):
        # dataroot: directory holding the raw .mat files
        self.dataroot = dataroot
        # ds stays None until the first session is added
        self.ds = None
        # HDFStore handle, populated by save()/open()
        self.store = None
        # converted datasets live in <dataroot>/imported
        self.importpath = path.join(self.dataroot, 'imported')

    def __append(self, session):
        """append session to current object"""
        # refuse to merge sessions recorded with different electrode sets
        if (self.ds.samples.columns.get_level_values('channel') !=
                session.ds.samples.columns.get_level_values('channel')).any():
            print(self.ds.samples.columns.get_level_values('channel'))
            print(session.ds.samples.columns.get_level_values('channel'))
            raise InconsistentElectrodes(
                'electrode labels do not match when merging datasets')
        # NOTE(review): DataFrame.append was removed in pandas 2.0 --
        # this class targets an older pandas; confirm the pinned version.
        self.ds.samples = self.ds.samples.append(session.ds.samples,
                                                 verify_integrity=True)
        self.ds.targets = self.ds.targets.append(session.ds.targets,
                                                 verify_integrity=True)

    def __sort(self):
        """MultiIndex Slicing operations require we sort all indices"""
        self.ds.samples.sort_index(level='channel', axis=1, inplace=True)
        #self.ds.samples.sort_index(axis=1, inplace=True)

    def get_session(self, subject, sessionid):
        """Add a single trial as target/samples pair from a mat file.

        Args:
            param1: (string): subject ID
            param2: (string): session ID

        Returns:
            A Session object containing samples and target data.
        """
        # mat files are named <subject>-<sessionid>-*.mat
        trialpath = glob(
            path.join(self.dataroot, subject + '-' + sessionid + '-*.mat'))
        if not trialpath or not path.exists(trialpath[0]):
            raise FileNotFoundError(
                "no file for subject '{0}' and trial '{1}'".format(
                    subject, sessionid))
        session = io.loadmat(trialpath[0])
        """ >> session session = data: [1x1 struct] """
        data = session['data'][0, 0]
        """ >> session.data ans = label: {64x1 cell} time: {1x638 cell} trial: {1x638 cell} elec: [1x1 struct] cfg: [1x1 struct] TrlInfo: {638x16 cell} TrlInfoLabels: {16x1 cell} """
        channels = data[0][:, 0]  # label
        channels = np.array(channels.tolist()).flatten()  # unify dtype
        samples = data[2][0, :]  # trial
        # flatten each trial's channel x time matrix into one float32 row
        samples = np.array([x[:].flatten() for x in samples],
                           dtype='float32')
        cfg = data[4][0][0]
        trlinfo = data[5][:, :]
        trlinfolabels = data[6][:, 0]
        """ >> session.data.cfg ans = method: 'spline' badchannel: {2x1 cell} trials: 'all' lambda: 1.0000e-05 order: 4 elec: [1x1 struct] outputfilepresent: 'overwrite' callinfo: [1x1 struct] version: [1x1 struct] trackconfig: 'off' checkconfig: 'loose' checksize: 100000 showcallinfo: 'yes' debug: 'no' trackcallinfo: 'yes' trackdatainfo: 'no' missingchannel: {0x1 cell} previous: [1x1 struct] """
        # cfg.badchannel may be absent; treat that as "no bad channels"
        try:
            badchannels = cfg[1][0, 0]
        except IndexError:
            badchannels = []
        """ >> session.data.TrlInfoLabels ans = 'time stamp original (EEG)' 'time stamp new (EEG)' 'task' 'data part #' 'trial #' 'stimulus type' 'EEG trigger' 'encoding digit #' 'time stamp original (E-Prime)' 'set size' 'probe type' 'response' 'ACC' 'RT' 'digit/probe presented' 'probe position' """
        # column 4 = 'trial #', column 14 = 'digit/probe presented'
        # (see TrlInfoLabels layout above)
        trials = trlinfo[:, 4].astype('uint8')
        digits = trlinfo[:, 14].astype('uint8')
        return Session(subject, sessionid, samples, digits, trials, channels)

    def add_session(self, subject, sessionid):
        """Concatenate a single Session to the current importer instance
        implicitly using __append().

        Args:
            param1: (string): subject ID
            param2: (string): session ID
        """
        session = self.get_session(subject, sessionid)
        # first session initializes the dataset
        if not self.ds:
            self.ds = dotdict({
                'samples': session.ds.samples,
                'targets': session.ds.targets
            })
            return
        # silently skip duplicates
        if sessionid in self.ds.samples.index.get_level_values('session'):
            warnings.warn("Session already added, doing nothing.")
            return
        # all sessions in one importer must belong to the same subject
        if subject not in self.ds.samples.index.get_level_values('subject'):
            raise UnmatchedSubjects(
                "Subjects don't match, will not add current session")
        # TODO: other checks ?
        self.__append(session)

    def import_all(self, subject):
        """Import all .mat files for a subject ID.

        Args:
            param1: (string): subject ID
        """
        trialpath = path.join(self.dataroot, '*' + subject + '*mat')
        trialfiles = sorted(glob(trialpath))
        if not trialfiles:
            raise FileNotFoundError(trialpath)
        # session id is the numeric middle part of the file name
        sessionid_re = re.compile('.*' + subject + '-([0-9]+)-.*mat')
        sessionids = [
            sessionid_re.match(file).groups()[0] for file in trialfiles
        ]
        for id in sessionids:
            self.add_session(subject, id)
        self.__sort()

    def save(self, filename, force=False):
        """Save the trials and samples arrays from the current importer
        instance to a dataset inside a lzf compressed hdf5 file for later
        use.

        Args:
            param1: (string): filename, will be stored in self.importpath

        Optional Args:
            force: (boolean) Wether or not to overwrite an existing file
                (default: False)
        """
        try:
            mkdir(self.importpath)
        except FileExistsError:
            pass
        filename = path.join(self.importpath, filename)
        if path.exists(filename):
            if force:
                unlink(filename)
            else:
                raise FileExistsError('Import file "' + filename +
                                      '" already exists.')
        self.__sort()
        # NOTE(review): docstring says "lzf" but complib is 'lzo' -- these
        # are different codecs; confirm which one is intended.
        self.store = HDFStore(filename, complib='lzo')
        self.store['samples'] = self.ds.samples
        self.store['targets'] = self.ds.targets
        self.store.close()

    def load(self, name):
        """Load a hdf5 file created with save() and attach the targets and
        samples array to the current importer instance.

        Args:
            param1: (string): a name for the dataset and the hdf5 file name
        """
        self.open(name)
        self.ds = dotdict({'samples': None, 'targets': None})
        self.ds.samples = self.store['samples']
        self.ds.targets = self.store['targets']
        self.store.close()

    def open(self, name):
        # Open an existing imported store read/write and keep the handle
        # on self.store; the caller is responsible for close().
        if not path.exists(self.importpath):
            raise FileNotFoundError(path.join(self.dataroot, 'imported'))
        filename = path.join(self.importpath, name)
        # NOTE(review): raising FileExistsError for a *missing* file looks
        # inverted -- FileNotFoundError seems intended; confirm callers.
        if not path.exists(filename):
            raise FileExistsError(filename)
        self.store = HDFStore(filename)

    def close(self, name):
        # Close the store opened by open()/save().
        # NOTE(review): the `name` parameter is unused -- kept for
        # interface compatibility with existing callers.
        self.store.close()
class TestHDFStore(unittest.TestCase):
    """Round-trip and API tests for pandas' HDFStore (pandas 0.x era).

    Each test runs against a fresh store at ``path``; tests that need a
    second store use ``scratchpath`` and clean it up themselves.

    Fix applied throughout: every ``HDFStore(self.scratchpath)`` that was
    constructed *inside* a ``try`` block is now constructed before it, so a
    constructor failure can no longer raise NameError from
    ``finally: store.close()`` (matching the pattern already used by the
    ``_check_roundtrip*`` helpers).
    """

    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_factory_fun(self):
        # an exception inside the context manager must still close the file
        try:
            with get_store(self.scratchpath) as tbl:
                raise ValueError('blah')
        except ValueError:
            pass
        with get_store(self.scratchpath) as tbl:
            tbl['a'] = tm.makeDataFrame()
        with get_store(self.scratchpath) as tbl:
            self.assertEquals(len(tbl), 1)
            self.assertEquals(type(tbl['a']), DataFrame)
        os.remove(self.scratchpath)

    def test_len_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.assertEquals(len(self.store), 4)
        self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))

    def test_repr(self):
        repr(self.store)  # empty store must still repr cleanly
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        repr(self.store)

    def test_contains(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeDataFrame()
        self.assert_('a' in self.store)
        self.assert_('b' in self.store)
        self.assert_('c' not in self.store)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        # reopening in 'w' mode truncates the file
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)
        self.assertRaises(KeyError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store.put('c', df[:10], table=True)
        # not OK, not a table
        self.assertRaises(ValueError, self.store.put, 'b', df[10:],
                          append=True)
        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError, self.store.put, 'f', df[10:],
                          append=True)
        # OK
        self.store.put('c', df[10:], append=True)
        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)
        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()
        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='blosc')
        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df[:10], table=True)
        self.store.append('c', df[10:])
        tm.assert_frame_equal(self.store['c'], df)

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]
        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2,
                          append=True)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])
        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

    def test_remove_where_not_exist(self):
        # removing with a criterion from a non-existent node must not raise
        crit1 = {'field': 'index', 'op': '>', 'value': 'foo'}
        self.store.remove('a', where=[crit1])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]
        crit1 = {'field': 'index', 'op': '>', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

        ts2 = Series(ts.index, Index(ts.index, dtype=object))
        self._check_roundtrip(ts2, tm.assert_series_equal)

        ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object),
                                      dtype=object))
        self._check_roundtrip(ts3, tm.assert_series_equal)

    def test_sparse_series(self):
        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):
        s = tm.makeDataFrame()
        s.ix[3:5, 1:3] = np.nan
        s.ix[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_double_roundtrip(ss, tm.assert_frame_equal,
                                     check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_double_roundtrip(ss2, tm.assert_frame_equal,
                                     check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_double_roundtrip(ss3, tm.assert_frame_equal,
                                     check_frame_type=True)

    def test_sparse_panel(self):
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame()) for i in items))
        sp = p.to_sparse()
        self._check_double_roundtrip(sp, tm.assert_panel_equal,
                                     check_panel_type=True)

        sp2 = p.to_sparse(kind='integer')
        self._check_double_roundtrip(sp2, tm.assert_panel_equal,
                                     check_panel_type=True)

        sp3 = p.to_sparse(fill_value=0)
        self._check_double_roundtrip(sp3, tm.assert_panel_equal,
                                     check_panel_type=True)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0., 1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_index_types(self):
        values = np.random.randn(2)

        func = lambda l, r: tm.assert_series_equal(l, r, True, True, True)

        ser = Series(values, [0, 'y'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime.today(), 0])
        self._check_roundtrip(ser, func)

        ser = Series(values, ['y', 0])
        self._check_roundtrip(ser, func)

        from datetime import date
        ser = Series(values, [date.today(), 'a'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1.23, 'b'])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 1.53])
        self._check_roundtrip(ser, func)

        ser = Series(values, [1, 5])
        self._check_roundtrip(ser, func)

        ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)])
        self._check_roundtrip(ser, func)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = bdate_range('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failer on some windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df, tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal,
                              compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal,
                              compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self._check_roundtrip(df[:0], tm.assert_frame_equal)

    def test_empty_series_frame(self):
        s0 = Series()
        s1 = Series(name='myseries')
        df0 = DataFrame()
        df1 = DataFrame(index=['a', 'b', 'c'])
        df2 = DataFrame(columns=['d', 'e', 'f'])

        self._check_roundtrip(s0, tm.assert_series_equal)
        self._check_roundtrip(s1, tm.assert_series_equal)
        self._check_roundtrip(df0, tm.assert_frame_equal)
        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_timezones(self):
        rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        # construct outside try so a constructor failure cannot reach the
        # finally clause with 'store' unbound
        store = HDFStore(self.scratchpath)
        try:
            store['frame'] = frame
            recons = store['frame']
            self.assert_(recons.index.equals(rng))
            self.assertEquals(rng.tz, recons.index.tz)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        store = HDFStore(self.scratchpath)
        try:
            store['frame'] = frame
            recons = store['frame']
            assert(recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        store = HDFStore(self.scratchpath)
        try:
            store['frame'] = df
            recons = store['frame']
            assert(recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']
        store = HDFStore(self.scratchpath)
        try:
            store['series'] = series
            recons = store['series']
            assert(recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # storing in Table not yet supported
        self.assertRaises(Exception, self.store.put, 'foo', df1, table=True)

        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal,
                              compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        store = HDFStore(self.scratchpath)
        try:
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        # self._check_roundtrip(wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts
        tm.assert_series_equal(self.store['a'], ts)

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {'field': 'index', 'op': '>=', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = {'field': 'index', 'op': '>=', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}
        crit3 = {'field': 'column', 'value': 'A'}

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select,
                          'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = {'field': 'column', 'value': df.columns[:75]}
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
        # write obj, read it back and compare
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_double_roundtrip(self, obj, comparator, compression=False,
                                **kwargs):
        # like _check_roundtrip, but write the retrieved object again
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj, **kwargs)
            store['obj'] = retrieved
            again = store['obj']
            comparator(again, obj, **kwargs)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        # round-trip through the table format (which may reorder rows)
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)

    def test_tseries_indices_series(self):
        idx = tm.makeDateIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

        idx = tm.makePeriodIndex(10)
        ser = Series(np.random.randn(len(idx)), idx)
        self.store['a'] = ser
        result = self.store['a']

        assert_series_equal(result, ser)
        self.assertEquals(type(result.index), type(ser.index))
        self.assertEquals(result.index.freq, ser.index.freq)

    def test_tseries_indices_frame(self):
        idx = tm.makeDateIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), index=idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

        idx = tm.makePeriodIndex(10)
        df = DataFrame(np.random.randn(len(idx), 3), idx)
        self.store['a'] = df
        result = self.store['a']

        assert_frame_equal(result, df)
        self.assertEquals(type(result.index), type(df.index))
        self.assertEquals(result.index.freq, df.index.freq)

    def test_unicode_index(self):
        unicode_values = [u'\u03c3', u'\u03c3\u03c3']
        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_store_datetime_mixed(self):
        df = DataFrame({'a': [1, 2, 3], 'b': [1., 2., 3.],
                        'c': ['a', 'b', 'c']})
        ts = tm.makeTimeSeries()
        df['d'] = ts.index[:3]
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_cant_write_multiindex_table(self):
        # for now, #1848
        df = DataFrame(np.random.randn(10, 4),
                       index=[np.arange(5).repeat(2),
                              np.tile(np.arange(2), 5)])

        self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
def hdfWrite(self, path, excode, symbol, indata, kind1, kind2, kind3):
    """Write a DataFrame into the HDF5 store under a key derived from its kind.

    Args:
        path: HDF5 file path.
        excode: exchange code used in the key.
        symbol: instrument symbol used in the key.
        indata: DataFrame to persist.
        kind1: top-level kind -- 'Rawdata', 'Stitch' or 'Indicator'.
        kind2: series id ('00'/'01'), or the indicator name for indicators.
        kind3: frequency ('1d', '60m', '30m', '15m', '5m', '1m') for
            raw/stitched data, or a dict of indicator parameters
            ({name: value}) for indicators.

    Key layouts (translated from the original Chinese comments):
        raw data:     kind1='Rawdata',   kind2=None,             kind3='1d'...
        stitch rule:  kind1='Stitch',    kind2='00'/'01',        kind3=None
        stitch data:  kind1='Stitch',    kind2='00'/'01',        kind3='1d'...
        indicator:    kind1='Indicator', kind2=<indicator name>, kind3=<params>
    """
    store = HDFStore(path, mode='a')

    # Build the node key for the requested kind.
    if kind1 == EXT_Rawdata:
        key = '/'.join([kind1, excode, symbol, kind3])
    elif kind1 == EXT_Stitch:
        if kind3 is None:  # rule table vs. stitched period data
            key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2])
        else:
            key = '/'.join([kind1, excode, symbol, EXT_Period, kind3, kind2])
    elif kind1 == EXT_Indicator:
        key = '/'.join([kind1, excode, symbol, kind2])
    else:
        print("kind not supported")
        store.close()  # don't leak the open handle on the error path
        return

    if kind1 == EXT_Indicator:
        # Indicator parameters live in HDF5 node attributes, which pandas'
        # HDFStore does not expose -- access them through h5py directly.
        f = h5py.File(path, 'a')

        def _write_params():
            # BUG FIX: the original wrote every value under the literal
            # attribute name 'param_names', so parameters overwrote each
            # other and the per-name read-back below could never match.
            # Store each parameter under its own name instead.
            for param_name, value in kind3.items():
                f[key].attrs[param_name] = value

        try:
            store[key]
        except KeyError:
            # Node does not exist yet: create it and record the parameters.
            store[key] = indata
            _write_params()
        else:
            try:
                # Probe one parameter attribute; a KeyError means the node
                # was written without parameters.
                f[key].attrs[[i for i in kind3.keys()][0]]
            except KeyError:
                store[key] = indata
                _write_params()
            else:
                mismatches = 0
                for param_name, value in kind3.items():
                    mismatches = (f[key].attrs[param_name] != value) + mismatches
                if mismatches == 0:
                    # Parameters match: append only the new rows.
                    adddata = indata[~indata.index.isin(store[key].index)]
                    store.append(key, adddata)
                else:
                    # Parameters differ: overwrite data and attributes.
                    store[key] = indata
                    _write_params()
        f.close()
        store.close()
    else:
        try:
            store[key]
        except KeyError:
            store[key] = indata  # node does not exist yet: create it
        else:
            # Append only rows not already stored.
            adddata = indata[~indata.index.isin(store[key].index)]
            if kind2 in [EXT_Series_00, EXT_Series_01]:
                # Re-base the adjustment factor so it chains continuously
                # from the last stored value.
                adddata[EXT_AdjFactor] = adddata[EXT_AdjFactor] * store[
                    key][EXT_AdjFactor].iloc[-1] / adddata[
                        EXT_AdjFactor].iloc[0]
            store.append(key, adddata)
        store.close()
class TestHDFStore(unittest.TestCase): path = '__test__.h5' scratchpath = '__scratch__.h5' def setUp(self): self.store = HDFStore(self.path) def tearDown(self): self.store.close() os.remove(self.path) def test_factory_fun(self): try: with get_store(self.scratchpath) as tbl: raise ValueError('blah') except ValueError: pass with get_store(self.scratchpath) as tbl: tbl['a'] = tm.makeDataFrame() with get_store(self.scratchpath) as tbl: self.assertEquals(len(tbl), 1) self.assertEquals(type(tbl['a']), DataFrame) os.remove(self.scratchpath) def test_keys(self): self.store['a'] = tm.makeTimeSeries() self.store['b'] = tm.makeStringSeries() self.store['c'] = tm.makeDataFrame() self.store['d'] = tm.makePanel() self.store['foo/bar'] = tm.makePanel() self.assertEquals(len(self.store), 5) self.assert_( set(self.store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar' ])) def test_repr(self): repr(self.store) self.store['a'] = tm.makeTimeSeries() self.store['b'] = tm.makeStringSeries() self.store['c'] = tm.makeDataFrame() self.store['d'] = tm.makePanel() self.store['foo/bar'] = tm.makePanel() self.store.append('e', tm.makePanel()) repr(self.store) str(self.store) def test_contains(self): self.store['a'] = tm.makeTimeSeries() self.store['b'] = tm.makeDataFrame() self.store['foo/bar'] = tm.makeDataFrame() self.assert_('a' in self.store) self.assert_('b' in self.store) self.assert_('c' not in self.store) self.assert_('foo/bar' in self.store) self.assert_('/foo/bar' in self.store) self.assert_('/foo/b' not in self.store) self.assert_('bar' not in self.store) def test_versioning(self): self.store['a'] = tm.makeTimeSeries() self.store['b'] = tm.makeDataFrame() df = tm.makeTimeDataFrame() self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) self.assert_(self.store.root.a._v_attrs.pandas_version == '0.10') self.assert_(self.store.root.b._v_attrs.pandas_version == '0.10') self.assert_(self.store.root.df1._v_attrs.pandas_version == '0.10') # write a 
file and wipe its versioning self.store.remove('df2') self.store.append('df2', df) self.store.get_node('df2')._v_attrs.pandas_version = None self.store.select('df2') self.store.select('df2', [Term('index', '>', df.index[2])]) def test_meta(self): raise nose.SkipTest('no meta') meta = {'foo': ['I love pandas ']} s = tm.makeTimeSeries() s.meta = meta self.store['a'] = s self.assert_(self.store['a'].meta == meta) df = tm.makeDataFrame() df.meta = meta self.store['b'] = df self.assert_(self.store['b'].meta == meta) # this should work, but because slicing doesn't propgate meta it doesn self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) results = self.store['df1'] #self.assert_(getattr(results,'meta',None) == meta) # no meta df = tm.makeDataFrame() self.store['b'] = df self.assert_(hasattr(self.store['b'], 'meta') == False) def test_reopen_handle(self): self.store['a'] = tm.makeTimeSeries() self.store.open('w', warn=False) self.assert_(self.store.handle.isopen) self.assertEquals(len(self.store), 0) def test_flush(self): self.store['a'] = tm.makeTimeSeries() self.store.flush() def test_get(self): self.store['a'] = tm.makeTimeSeries() left = self.store.get('a') right = self.store['a'] tm.assert_series_equal(left, right) left = self.store.get('/a') right = self.store['/a'] tm.assert_series_equal(left, right) self.assertRaises(KeyError, self.store.get, 'b') def test_put(self): ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() self.store['a'] = ts self.store['b'] = df[:10] self.store['foo/bar/bah'] = df[:10] self.store['foo'] = df[:10] self.store['/foo'] = df[:10] self.store.put('c', df[:10], table=True) # not OK, not a table self.assertRaises(ValueError, self.store.put, 'b', df[10:], append=True) # node does not currently exist, test _is_table_type returns False in # this case self.assertRaises(ValueError, self.store.put, 'f', df[10:], append=True) # OK self.store.put('c', df[10:], append=True) # overwrite table 
self.store.put('c', df[:10], table=True, append=False) tm.assert_frame_equal(df[:10], self.store['c']) def test_put_string_index(self): index = Index( ["I am a very long string index: %s" % i for i in range(20)]) s = Series(np.arange(20), index=index) df = DataFrame({'A': s, 'B': s}) self.store['a'] = s tm.assert_series_equal(self.store['a'], s) self.store['b'] = df tm.assert_frame_equal(self.store['b'], df) # mixed length index = Index( ['abcdefghijklmnopqrstuvwxyz1234567890'] + ["I am a very long string index: %s" % i for i in range(20)]) s = Series(np.arange(21), index=index) df = DataFrame({'A': s, 'B': s}) self.store['a'] = s tm.assert_series_equal(self.store['a'], s) self.store['b'] = df tm.assert_frame_equal(self.store['b'], df) def test_put_compression(self): df = tm.makeTimeDataFrame() self.store.put('c', df, table=True, compression='zlib') tm.assert_frame_equal(self.store['c'], df) # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, table=False, compression='zlib') def test_put_compression_blosc(self): tm.skip_if_no_package('tables', '2.2', app='blosc support') df = tm.makeTimeDataFrame() # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, table=False, compression='blosc') self.store.put('c', df, table=True, compression='blosc') tm.assert_frame_equal(self.store['c'], df) def test_put_integer(self): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal) def test_append(self): df = tm.makeTimeDataFrame() self.store.remove('df1') self.store.append('df1', df[:10]) self.store.append('df1', df[10:]) tm.assert_frame_equal(self.store['df1'], df) self.store.remove('df2') self.store.put('df2', df[:10], table=True) self.store.append('df2', df[10:]) tm.assert_frame_equal(self.store['df2'], df) self.store.remove('df3') self.store.append('/df3', df[:10]) self.store.append('/df3', df[10:]) tm.assert_frame_equal(self.store['df3'], df) # 
this is allowed by almost always don't want to do it warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) self.store.remove('/df3 foo') self.store.append('/df3 foo', df[:10]) self.store.append('/df3 foo', df[10:]) tm.assert_frame_equal(self.store['df3 foo'], df) warnings.filterwarnings('always', category=tables.NaturalNameWarning) # panel wp = tm.makePanel() self.store.remove('wp1') self.store.append('wp1', wp.ix[:, :10, :]) self.store.append('wp1', wp.ix[:, 10:, :]) tm.assert_panel_equal(self.store['wp1'], wp) # ndim p4d = tm.makePanel4D() self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :]) self.store.append('p4d', p4d.ix[:, :, 10:, :]) tm.assert_panel4d_equal(self.store['p4d'], p4d) # test using axis labels self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :], axes=['items', 'major_axis', 'minor_axis']) self.store.append('p4d', p4d.ix[:, :, 10:, :], axes=['items', 'major_axis', 'minor_axis']) tm.assert_panel4d_equal(self.store['p4d'], p4d) # test using differnt number of items on each axis p4d2 = p4d.copy() p4d2['l4'] = p4d['l1'] p4d2['l5'] = p4d['l1'] self.store.remove('p4d2') self.store.append('p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) tm.assert_panel4d_equal(self.store['p4d2'], p4d2) # test using differt order of items on the non-index axes self.store.remove('wp1') wp_append1 = wp.ix[:, :10, :] self.store.append('wp1', wp_append1) wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1]) self.store.append('wp1', wp_append2) tm.assert_panel_equal(self.store['wp1'], wp) def test_append_frame_column_oriented(self): # column oriented df = tm.makeTimeDataFrame() self.store.remove('df1') self.store.append('df1', df.ix[:, :2], axes=['columns']) self.store.append('df1', df.ix[:, 2:]) tm.assert_frame_equal(self.store['df1'], df) result = self.store.select('df1', 'columns=A') expected = df.reindex(columns=['A']) tm.assert_frame_equal(expected, result) # this isn't supported 
self.assertRaises(Exception, self.store.select, 'df1', ('columns=A', Term('index', '>', df.index[4]))) # selection on the non-indexable result = self.store.select( 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) expected = df.reindex(columns=['A'], index=df.index[0:4]) tm.assert_frame_equal(expected, result) def test_ndim_indexables(self): """ test using ndim tables in new ways""" p4d = tm.makePanel4D() def check_indexers(key, indexers): for i, idx in enumerate(indexers): self.assert_( getattr( getattr(self.store.root, key).table.description, idx)._v_pos == i) # append then change (will take existing schema) indexers = ['items', 'major_axis', 'minor_axis'] self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) self.store.append('p4d', p4d.ix[:, :, 10:, :]) tm.assert_panel4d_equal(self.store.select('p4d'), p4d) check_indexers('p4d', indexers) # same as above, but try to append with differnt axes self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) self.store.append('p4d', p4d.ix[:, :, 10:, :], axes=['labels', 'items', 'major_axis']) tm.assert_panel4d_equal(self.store.select('p4d'), p4d) check_indexers('p4d', indexers) # pass incorrect number of axes self.store.remove('p4d') self.assertRaises(Exception, self.store.append, 'p4d', p4d.ix[:, :, :10, :], axes=['major_axis', 'minor_axis']) # different than default indexables #1 indexers = ['labels', 'major_axis', 'minor_axis'] self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) self.store.append('p4d', p4d.ix[:, :, 10:, :]) tm.assert_panel4d_equal(self.store['p4d'], p4d) check_indexers('p4d', indexers) # different than default indexables #2 indexers = ['major_axis', 'labels', 'minor_axis'] self.store.remove('p4d') self.store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) self.store.append('p4d', p4d.ix[:, :, 10:, :]) tm.assert_panel4d_equal(self.store['p4d'], p4d) check_indexers('p4d', indexers) # partial 
selection result = self.store.select('p4d', ['labels=l1']) expected = p4d.reindex(labels=['l1']) tm.assert_panel4d_equal(result, expected) # partial selection2 result = self.store.select( 'p4d', [Term('labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) expected = p4d.reindex(labels=['l1'], items=['ItemA'], minor_axis=['B']) tm.assert_panel4d_equal(result, expected) # non-existant partial selection result = self.store.select( 'p4d', [Term('labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) tm.assert_panel4d_equal(result, expected) def test_append_with_strings(self): wp = tm.makePanel() wp2 = wp.rename_axis(dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) self.store.append('s1', wp, min_itemsize=20) self.store.append('s1', wp2) expected = concat([wp, wp2], axis=2) expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s1'], expected) # test dict format self.store.append('s2', wp, min_itemsize={'minor_axis': 20}) self.store.append('s2', wp2) expected = concat([wp, wp2], axis=2) expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) tm.assert_panel_equal(self.store['s2'], expected) # apply the wrong field (similar to #1) self.store.append('s3', wp, min_itemsize={'major_axis': 20}) self.assertRaises(Exception, self.store.append, 's3') # test truncation of bigger strings self.store.append('s4', wp) self.assertRaises(Exception, self.store.append, 's4', wp2) # avoid truncation on elements df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) self.store.append('df_big', df, min_itemsize={'values': 1024}) tm.assert_frame_equal(self.store.select('df_big'), df) # appending smaller string ok df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) self.store.append('df_big', df2) expected = concat([df, df2]) tm.assert_frame_equal(self.store.select('df_big'), expected) # avoid truncation on elements df = DataFrame([[123, 
'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) self.store.append('df_big2', df, min_itemsize={'values': 10}) tm.assert_frame_equal(self.store.select('df_big2'), df) # bigger string on next append self.store.append('df_new', df, min_itemsize={'values': 16}) df_new = DataFrame([[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) self.assertRaises(Exception, self.store.append, 'df_new', df_new) def test_create_table_index(self): wp = tm.makePanel() self.store.append('p5', wp) self.store.create_table_index('p5') assert ( self.store.handle.root.p5.table.cols.major_axis.is_indexed == True) assert (self.store.handle.root.p5.table.cols.minor_axis.is_indexed == False) # default optlevels assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) assert (self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') # let's change the indexing scheme self.store.create_table_index('p5') assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 6) assert (self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') self.store.create_table_index('p5', optlevel=9) assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) assert (self.store.handle.root.p5.table.cols.major_axis.index.kind == 'medium') self.store.create_table_index('p5', kind='full') assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 9) assert (self.store.handle.root.p5.table.cols.major_axis.index.kind == 'full') self.store.create_table_index('p5', optlevel=1, kind='light') assert (self.store.handle.root.p5.table.cols.major_axis.index.optlevel == 1) assert (self.store.handle.root.p5.table.cols.major_axis.index.kind == 'light') df = tm.makeTimeDataFrame() self.store.append('f', df[:10]) self.store.append('f', df[10:]) self.store.create_table_index('f') # try to index a non-table self.store.put('f2', df) self.assertRaises(Exception, self.store.create_table_index, 'f2') # try to change the version supports flag from 
pandas.io import pytables pytables._table_supports_index = False self.assertRaises(Exception, self.store.create_table_index, 'f') # test out some versions original = tables.__version__ for v in ['2.2', '2.2b']: pytables._table_mod = None pytables._table_supports_index = False tables.__version__ = v self.assertRaises(Exception, self.store.create_table_index, 'f') for v in ['2.3.1', '2.3.1b', '2.4dev', '2.4', original]: pytables._table_mod = None pytables._table_supports_index = False tables.__version__ = v self.store.create_table_index('f') pytables._table_mod = None pytables._table_supports_index = False tables.__version__ = original def test_big_table(self): raise nose.SkipTest('no big table') # create and write a big table wp = Panel(np.random.randn(20, 1000, 1000), items=['Item%s' % i for i in xrange(20)], major_axis=date_range('1/1/2000', periods=1000), minor_axis=['E%s' % i for i in xrange(1000)]) wp.ix[:, 100:200, 300:400] = np.nan try: store = HDFStore(self.scratchpath) store._debug_memory = True store.append('wp', wp) recons = store.select('wp') finally: store.close() os.remove(self.scratchpath) def test_append_diff_item_order(self): raise nose.SkipTest('append diff item order') wp = tm.makePanel() wp1 = wp.ix[:, :10, :] wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :] self.store.put('panel', wp1, table=True) self.assertRaises(Exception, self.store.put, 'panel', wp2, append=True) def test_table_index_incompatible_dtypes(self): df1 = DataFrame({'a': [1, 2, 3]}) df2 = DataFrame({'a': [4, 5, 6]}, index=date_range('1/1/2000', periods=3)) self.store.put('frame', df1, table=True) self.assertRaises(Exception, self.store.put, 'frame', df2, table=True, append=True) def test_table_values_dtypes_roundtrip(self): df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') self.store.append('df1', df1) assert df1.dtypes == self.store['df1'].dtypes df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') self.store.append('df2', df2) assert df2.dtypes == self.store['df2'].dtypes # incompatible 
dtype self.assertRaises(Exception, self.store.append, 'df2', df1) def test_table_mixed_dtypes(self): # frame def _make_one_df(): df = tm.makeDataFrame() df['obj1'] = 'foo' df['obj2'] = 'bar' df['bool1'] = df['A'] > 0 df['bool2'] = df['B'] > 0 df['bool3'] = True df['int1'] = 1 df['int2'] = 2 return df.consolidate() df1 = _make_one_df() self.store.append('df1_mixed', df1) tm.assert_frame_equal(self.store.select('df1_mixed'), df1) # panel def _make_one_panel(): wp = tm.makePanel() wp['obj1'] = 'foo' wp['obj2'] = 'bar' wp['bool1'] = wp['ItemA'] > 0 wp['bool2'] = wp['ItemB'] > 0 wp['int1'] = 1 wp['int2'] = 2 return wp.consolidate() p1 = _make_one_panel() self.store.append('p1_mixed', p1) tm.assert_panel_equal(self.store.select('p1_mixed'), p1) # ndim def _make_one_p4d(): wp = tm.makePanel4D() wp['obj1'] = 'foo' wp['obj2'] = 'bar' wp['bool1'] = wp['l1'] > 0 wp['bool2'] = wp['l2'] > 0 wp['int1'] = 1 wp['int2'] = 2 return wp.consolidate() p4d = _make_one_p4d() self.store.append('p4d_mixed', p4d) tm.assert_panel4d_equal(self.store.select('p4d_mixed'), p4d) def test_remove(self): ts = tm.makeTimeSeries() df = tm.makeDataFrame() self.store['a'] = ts self.store['b'] = df self.store.remove('a') self.assertEquals(len(self.store), 1) tm.assert_frame_equal(df, self.store['b']) self.store.remove('b') self.assertEquals(len(self.store), 0) # pathing self.store['a'] = ts self.store['b/foo'] = df self.store.remove('foo') self.store.remove('b/foo') self.assertEquals(len(self.store), 1) self.store['a'] = ts self.store['b/foo'] = df self.store.remove('b') self.assertEquals(len(self.store), 1) # __delitem__ self.store['a'] = ts self.store['b'] = df del self.store['a'] del self.store['b'] self.assertEquals(len(self.store), 0) def test_remove_where(self): # non-existance crit1 = Term('index', '>', 'foo') self.store.remove('a', where=[crit1]) # try to remove non-table (with crit) # non-table ok (where = None) wp = tm.makePanel() self.store.put('wp', wp, table=True) self.store.remove('wp', 
[('minor_axis', ['A', 'D'])]) rs = self.store.select('wp') expected = wp.reindex(minor_axis=['B', 'C']) tm.assert_panel_equal(rs, expected) # empty where self.store.remove('wp') self.store.put('wp', wp, table=True) # deleted number (entire table) n = self.store.remove('wp', []) assert (n == 120) # non - empty where self.store.remove('wp') self.store.put('wp', wp, table=True) self.assertRaises(Exception, self.store.remove, 'wp', ['foo']) # selectin non-table with a where #self.store.put('wp2', wp, table=False) #self.assertRaises(Exception, self.store.remove, # 'wp2', [('column', ['A', 'D'])]) def test_remove_crit(self): wp = tm.makePanel() # group row removal date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) crit4 = Term('major_axis', date4) self.store.put('wp3', wp, table=True) n = self.store.remove('wp3', where=[crit4]) assert (n == 36) result = self.store.select('wp3') expected = wp.reindex(major_axis=wp.major_axis - date4) tm.assert_panel_equal(result, expected) # upper half self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] crit1 = Term('major_axis', '>', date) crit2 = Term('minor_axis', ['A', 'D']) n = self.store.remove('wp', where=[crit1]) assert (n == 56) n = self.store.remove('wp', where=[crit2]) assert (n == 32) result = self.store['wp'] expected = wp.truncate(after=date).reindex(minor=['B', 'C']) tm.assert_panel_equal(result, expected) # individual row elements self.store.put('wp2', wp, table=True) date1 = wp.major_axis[1:3] crit1 = Term('major_axis', date1) self.store.remove('wp2', where=[crit1]) result = self.store.select('wp2') expected = wp.reindex(major_axis=wp.major_axis - date1) tm.assert_panel_equal(result, expected) date2 = wp.major_axis[5] crit2 = Term('major_axis', date2) self.store.remove('wp2', where=[crit2]) result = self.store['wp2'] expected = wp.reindex(major_axis=wp.major_axis - date1 - Index([date2])) tm.assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] crit3 = 
Term('major_axis', date3) self.store.remove('wp2', where=[crit3]) result = self.store['wp2'] expected = wp.reindex(major_axis=wp.major_axis - date1 - Index([date2]) - Index(date3)) tm.assert_panel_equal(result, expected) # corners self.store.put('wp4', wp, table=True) n = self.store.remove( 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) result = self.store.select('wp4') tm.assert_panel_equal(result, wp) def test_terms(self): wp = tm.makePanel() p4d = tm.makePanel4D() self.store.put('wp', wp, table=True) self.store.put('p4d', p4d, table=True) # some invalid terms terms = [ ['minor', ['A', 'B']], ['index', ['20121114']], ['index', ['20121114', '20121114']], ] for t in terms: self.assertRaises(Exception, self.store.select, 'wp', t) self.assertRaises(Exception, Term.__init__) self.assertRaises(Exception, Term.__init__, 'blah') self.assertRaises(Exception, Term.__init__, 'index') self.assertRaises(Exception, Term.__init__, 'index', '==') self.assertRaises(Exception, Term.__init__, 'index', '>', 5) # panel result = self.store.select( 'wp', [Term('major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) # p4d result = self.store.select('p4d', [ Term('major_axis<20000108'), Term('minor_axis', '=', ['A', 'B']), Term('items', '=', ['ItemA', 'ItemB']) ]) expected = p4d.truncate(after='20000108').reindex( minor=['A', 'B'], items=['ItemA', 'ItemB']) tm.assert_panel4d_equal(result, expected) # valid terms terms = [ dict(field='major_axis', op='>', value='20121114'), ('major_axis', '20121114'), ('major_axis', '>', '20121114'), (('major_axis', ['20121114', '20121114']), ), ('major_axis', datetime(2012, 11, 14)), 'major_axis>20121114', 'major_axis>20121114', 'major_axis>20121114', (('minor_axis', ['A', 'B']), ), (('minor_axis', ['A', 'B']), ), ((('minor_axis', ['A', 'B']), ), ), (('items', ['ItemA', 'ItemB']), ), ('items=ItemA'), ] for t in terms: 
self.store.select('wp', t) self.store.select('p4d', t) # valid for p4d only terms = [ (('labels', '=', ['l1', 'l2']), ), Term('labels', '=', ['l1', 'l2']), ] for t in terms: self.store.select('p4d', t) def test_series(self): s = tm.makeStringSeries() self._check_roundtrip(s, tm.assert_series_equal) ts = tm.makeTimeSeries() self._check_roundtrip(ts, tm.assert_series_equal) ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal) ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) self._check_roundtrip(ts3, tm.assert_series_equal) def test_sparse_series(self): s = tm.makeStringSeries() s[3:5] = np.nan ss = s.to_sparse() self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) ss2 = s.to_sparse(kind='integer') self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) ss3 = s.to_sparse(fill_value=0) self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) def test_sparse_frame(self): s = tm.makeDataFrame() s.ix[3:5, 1:3] = np.nan s.ix[8:10, -2] = np.nan ss = s.to_sparse() self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) ss2 = s.to_sparse(kind='integer') self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) ss3 = s.to_sparse(fill_value=0) self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) def test_sparse_panel(self): items = ['x', 'y', 'z'] p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) sp = p.to_sparse() self._check_double_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True) sp2 = p.to_sparse(kind='integer') self._check_double_roundtrip(sp2, tm.assert_panel_equal, check_panel_type=True) sp3 = p.to_sparse(fill_value=0) self._check_double_roundtrip(sp3, tm.assert_panel_equal, check_panel_type=True) def test_float_index(self): # GH #454 index = np.random.randn(10) s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, 
tm.assert_series_equal) def test_tuple_index(self): # GH #492 col = np.arange(10) idx = [(0., 1.), (2., 3.), (4., 5.)] data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) self._check_roundtrip(DF, tm.assert_frame_equal) def test_index_types(self): values = np.random.randn(2) func = lambda l, r: tm.assert_series_equal(l, r, True, True, True) ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) ser = Series(values, [datetime.today(), 0]) self._check_roundtrip(ser, func) ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) from datetime import date ser = Series(values, [date.today(), 'a']) self._check_roundtrip(ser, func) ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) ser = Series(values, [1, 1.53]) self._check_roundtrip(ser, func) ser = Series(values, [1, 5]) self._check_roundtrip(ser, func) ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): if sys.version_info[0] == 2 and sys.version_info[1] < 7: raise nose.SkipTest dr = bdate_range('1/1/1940', '1/1/1960') ts = Series(np.random.randn(len(dr)), index=dr) try: self._check_roundtrip(ts, tm.assert_series_equal) except OverflowError: raise nose.SkipTest('known failer on some windows platforms') def test_frame(self): df = tm.makeDataFrame() # put in some random NAs df.values[0, 0] = np.nan df.values[5, 3] = np.nan self._check_roundtrip_table(df, tm.assert_frame_equal) self._check_roundtrip(df, tm.assert_frame_equal) self._check_roundtrip_table(df, tm.assert_frame_equal, compression=True) self._check_roundtrip(df, tm.assert_frame_equal, compression=True) tdf = tm.makeTimeDataFrame() self._check_roundtrip(tdf, tm.assert_frame_equal) self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True) # not consolidated df['foo'] = np.random.randn(len(df)) self.store['df'] = df recons = self.store['df'] self.assert_(recons._data.is_consolidated()) # empty 
self._check_roundtrip(df[:0], tm.assert_frame_equal) def test_empty_series_frame(self): s0 = Series() s1 = Series(name='myseries') df0 = DataFrame() df1 = DataFrame(index=['a', 'b', 'c']) df2 = DataFrame(columns=['d', 'e', 'f']) self._check_roundtrip(s0, tm.assert_series_equal) self._check_roundtrip(s1, tm.assert_series_equal) self._check_roundtrip(df0, tm.assert_frame_equal) self._check_roundtrip(df1, tm.assert_frame_equal) self._check_roundtrip(df2, tm.assert_frame_equal) def test_can_serialize_dates(self): rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) self._check_roundtrip(frame, tm.assert_frame_equal) def test_timezones(self): rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') frame = DataFrame(np.random.randn(len(rng), 4), index=rng) try: store = HDFStore(self.scratchpath) store['frame'] = frame recons = store['frame'] self.assert_(recons.index.equals(rng)) self.assertEquals(rng.tz, recons.index.tz) finally: store.close() os.remove(self.scratchpath) def test_fixed_offset_tz(self): rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') frame = DataFrame(np.random.randn(len(rng), 4), index=rng) try: store = HDFStore(self.scratchpath) store['frame'] = frame recons = store['frame'] self.assert_(recons.index.equals(rng)) self.assertEquals(rng.tz, recons.index.tz) finally: store.close() os.remove(self.scratchpath) def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) frame = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self._check_roundtrip(frame, tm.assert_frame_equal) self._check_roundtrip(frame.T, tm.assert_frame_equal) self._check_roundtrip(frame['A'], tm.assert_series_equal) # check that the names are stored try: store = HDFStore(self.scratchpath) store['frame'] = frame recons = 
store['frame'] assert (recons.index.names == ['foo', 'bar']) finally: store.close() os.remove(self.scratchpath) def test_store_index_name(self): df = tm.makeDataFrame() df.index.name = 'foo' try: store = HDFStore(self.scratchpath) store['frame'] = df recons = store['frame'] assert (recons.index.name == 'foo') finally: store.close() os.remove(self.scratchpath) def test_store_series_name(self): df = tm.makeDataFrame() series = df['A'] try: store = HDFStore(self.scratchpath) store['series'] = series recons = store['series'] assert (recons.name == 'A') finally: store.close() os.remove(self.scratchpath) def test_store_mixed(self): def _make_one(): df = tm.makeDataFrame() df['obj1'] = 'foo' df['obj2'] = 'bar' df['bool1'] = df['A'] > 0 df['bool2'] = df['B'] > 0 df['int1'] = 1 df['int2'] = 2 return df.consolidate() df1 = _make_one() df2 = _make_one() self._check_roundtrip(df1, tm.assert_frame_equal) self._check_roundtrip(df2, tm.assert_frame_equal) self.store['obj'] = df1 tm.assert_frame_equal(self.store['obj'], df1) self.store['obj'] = df2 tm.assert_frame_equal(self.store['obj'], df2) # check that can store Series of all of these types self._check_roundtrip(df1['obj1'], tm.assert_series_equal) self._check_roundtrip(df1['bool1'], tm.assert_series_equal) self._check_roundtrip(df1['int1'], tm.assert_series_equal) # try with compression self._check_roundtrip(df1['obj1'], tm.assert_series_equal, compression=True) self._check_roundtrip(df1['bool1'], tm.assert_series_equal, compression=True) self._check_roundtrip(df1['int1'], tm.assert_series_equal, compression=True) self._check_roundtrip(df1, tm.assert_frame_equal, compression=True) def test_wide(self): wp = tm.makePanel() self._check_roundtrip(wp, tm.assert_panel_equal) def test_wide_table(self): wp = tm.makePanel() self._check_roundtrip_table(wp, tm.assert_panel_equal) def test_wide_table_dups(self): wp = tm.makePanel() try: store = HDFStore(self.scratchpath) store._quiet = True store.put('panel', wp, table=True) 
store.put('panel', wp, table=True, append=True) recons = store['panel'] tm.assert_panel_equal(recons, wp) finally: store.close() os.remove(self.scratchpath) def test_long(self): def _check(left, right): tm.assert_panel_equal(left.to_panel(), right.to_panel()) wp = tm.makePanel() self._check_roundtrip(wp.to_frame(), _check) # empty # self._check_roundtrip(wp.to_frame()[:0], _check) def test_longpanel(self): pass def test_overwrite_node(self): self.store['a'] = tm.makeTimeDataFrame() ts = tm.makeTimeSeries() self.store['a'] = ts tm.assert_series_equal(self.store['a'], ts) def test_select(self): wp = tm.makePanel() # put/select ok self.store.remove('wp') self.store.put('wp', wp, table=True) self.store.select('wp') # non-table ok (where = None) self.store.remove('wp') self.store.put('wp2', wp, table=False) self.store.select('wp2') # selection on the non-indexable with a large number of columns wp = Panel(np.random.randn(100, 100, 100), items=['Item%03d' % i for i in xrange(100)], major_axis=date_range('1/1/2000', periods=100), minor_axis=['E%03d' % i for i in xrange(100)]) self.store.remove('wp') self.store.append('wp', wp) items = ['Item%03d' % i for i in xrange(80)] result = self.store.select('wp', Term('items', items)) expected = wp.reindex(items=items) tm.assert_panel_equal(expected, result) # selectin non-table with a where #self.assertRaises(Exception, self.store.select, # 'wp2', ('column', ['A', 'D'])) def test_panel_select(self): wp = tm.makePanel() self.store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] crit1 = ('major_axis', '>=', date) crit2 = ('minor_axis', '=', ['A', 'D']) result = self.store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) tm.assert_panel_equal(result, expected) result = self.store.select( 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])]) expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) def 
test_frame_select(self): df = tm.makeTimeDataFrame() self.store.put('frame', df, table=True) date = df.index[len(df) // 2] crit1 = ('index', '>=', date) crit2 = ('columns', ['A', 'D']) crit3 = ('columns', 'A') result = self.store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] tm.assert_frame_equal(result, expected) result = self.store.select('frame', [crit3]) expected = df.ix[:, ['A']] tm.assert_frame_equal(result, expected) # other indicies for a frame # integer df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) self.store.append('df_int', df) self.store.select( 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) df = DataFrame( dict(A=np.random.rand(20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) self.store.append('df_float', df) self.store.select('df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) # can't select if not written as table #self.store['frame'] = df #self.assertRaises(Exception, self.store.select, # 'frame', [crit1, crit2]) def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) df.index = ['%.3d' % c for c in df.index] df.columns = ['%.3d' % c for c in df.columns] self.store.put('frame', df, table=True) crit = Term('columns', df.columns[:75]) result = self.store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: options['complib'] = _default_compressor store = HDFStore(self.scratchpath, 'w', **options) try: store['obj'] = obj retrieved = store['obj'] comparator(retrieved, obj, **kwargs) finally: store.close() os.remove(self.scratchpath) def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: options['complib'] = _default_compressor store = HDFStore(self.scratchpath, 'w', **options) try: store['obj'] = obj retrieved = store['obj'] comparator(retrieved, obj, **kwargs) store['obj'] 
= retrieved again = store['obj'] comparator(again, obj, **kwargs) finally: store.close() os.remove(self.scratchpath) def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: options['complib'] = _default_compressor store = HDFStore(self.scratchpath, 'w', **options) try: store.put('obj', obj, table=True) retrieved = store['obj'] # sorted_obj = _test_sort(obj) comparator(retrieved, obj) finally: store.close() os.remove(self.scratchpath) def test_legacy_read(self): pth = curpath() store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r') store['a'] store['b'] store['c'] store['d'] store.close() def test_legacy_table_read(self): # legacy table types pth = curpath() store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r') store.select('df1') store.select('df2') store.select('wp1') # force the frame store.select('df2', typ='legacy_frame') # old version (this still throws an exception though) import warnings warnings.filterwarnings('ignore', category=IncompatibilityWarning) self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) warnings.filterwarnings('always', category=IncompatibilityWarning) store.close() def test_legacy_table_write(self): # legacy table types pth = curpath() df = tm.makeDataFrame() wp = tm.makePanel() store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a') self.assertRaises(Exception, store.append, 'df1', df) self.assertRaises(Exception, store.append, 'wp1', wp) store.close() def test_store_datetime_fractional_secs(self): dt = datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) self.store['a'] = series self.assertEquals(self.store['a'].index[0], dt) def test_tseries_indices_series(self): idx = tm.makeDateIndex(10) ser = Series(np.random.randn(len(idx)), idx) self.store['a'] = ser result = self.store['a'] assert_series_equal(result, ser) self.assertEquals(type(result.index), type(ser.index)) self.assertEquals(result.index.freq, ser.index.freq) idx = tm.makePeriodIndex(10) 
ser = Series(np.random.randn(len(idx)), idx) self.store['a'] = ser result = self.store['a'] assert_series_equal(result, ser) self.assertEquals(type(result.index), type(ser.index)) self.assertEquals(result.index.freq, ser.index.freq) def test_tseries_indices_frame(self): idx = tm.makeDateIndex(10) df = DataFrame(np.random.randn(len(idx), 3), index=idx) self.store['a'] = df result = self.store['a'] assert_frame_equal(result, df) self.assertEquals(type(result.index), type(df.index)) self.assertEquals(result.index.freq, df.index.freq) idx = tm.makePeriodIndex(10) df = DataFrame(np.random.randn(len(idx), 3), idx) self.store['a'] = df result = self.store['a'] assert_frame_equal(result, df) self.assertEquals(type(result.index), type(df.index)) self.assertEquals(result.index.freq, df.index.freq) def test_unicode_index(self): unicode_values = [u'\u03c3', u'\u03c3\u03c3'] s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) def test_store_datetime_mixed(self): df = DataFrame({ 'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c'] }) ts = tm.makeTimeSeries() df['d'] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) def test_cant_write_multiindex_table(self): # for now, #1848 df = DataFrame( np.random.randn(10, 4), index=[np.arange(5).repeat(2), np.tile(np.arange(2), 5)]) self.assertRaises(Exception, self.store.put, 'foo', df, table=True)
def from_hdf5(cls, fname):
    """Restore the attributes named in ``cls._store_attributes`` from an
    HDF5 file previously written by ``to_hdf5``.

    NOTE(review): the attributes are set on *cls* itself (class-level
    state), not on a new instance — mirrors the per-attribute keys that
    ``to_hdf5`` writes. Confirm this class-level mutation is intended.

    :param cls: class whose ``_store_attributes`` values are restored
    :param fname: path of the HDF5 file to read
    """
    history_store = HDFStore(fname)
    try:
        # a missing key raises KeyError from the store lookup
        for attribute in cls._store_attributes:
            setattr(cls, attribute, history_store[attribute])
    finally:
        # fix: always release the file handle, even when a lookup fails
        history_store.close()
# NOTE(review): reconstructed from a whitespace-mangled (single-line) paste;
# behavior is unchanged. The class name 'TesttHDFStore' (double 't') is kept
# as-is since it is the public name. This is an older variant of the HDFStore
# test suite that uses dict-based selection criteria and DateRange.
class TesttHDFStore(unittest.TestCase):
    path = '__test__.h5'
    scratchpath = '__scratch__.h5'

    def setUp(self):
        self.store = HDFStore(self.path)

    def tearDown(self):
        self.store.close()
        os.remove(self.path)

    def test_len_keys(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        self.assertEquals(len(self.store), 4)
        self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))

    def test_repr(self):
        repr(self.store)
        self.store['a'] = tm.makeTimeSeries()
        self.store['b'] = tm.makeStringSeries()
        self.store['c'] = tm.makeDataFrame()
        self.store['d'] = tm.makePanel()
        repr(self.store)

    def test_reopen_handle(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.open('w', warn=False)
        self.assert_(self.store.handle.isopen)
        self.assertEquals(len(self.store), 0)

    def test_flush(self):
        self.store['a'] = tm.makeTimeSeries()
        self.store.flush()

    def test_get(self):
        self.store['a'] = tm.makeTimeSeries()
        left = self.store.get('a')
        right = self.store['a']
        tm.assert_series_equal(left, right)

        self.assertRaises(AttributeError, self.store.get, 'b')

    def test_put(self):
        ts = tm.makeTimeSeries()
        df = tm.makeTimeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df[:10]
        self.store.put('c', df[:10], table=True)

        # not OK, not a table
        self.assertRaises(ValueError, self.store.put, 'b', df[10:],
                          append=True)

        # node does not currently exist, test _is_table_type returns False in
        # this case
        self.assertRaises(ValueError, self.store.put, 'f', df[10:],
                          append=True)

        # OK
        self.store.put('c', df[10:], append=True)

        # overwrite table
        self.store.put('c', df[:10], table=True, append=False)
        tm.assert_frame_equal(df[:10], self.store['c'])

    def test_put_compression(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df, table=True, compression='zlib')
        tm.assert_frame_equal(self.store['c'], df)

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='zlib')

    def test_put_compression_blosc(self):
        tm.skip_if_no_package('tables', '2.2', app='blosc support')
        df = tm.makeTimeDataFrame()

        # can't compress if table=False
        self.assertRaises(ValueError, self.store.put, 'b', df,
                          table=False, compression='blosc')

        self.store.put('c', df, table=True, compression='blosc')
        tm.assert_frame_equal(self.store['c'], df)

    def test_put_integer(self):
        # non-date, non-string index
        df = DataFrame(np.random.randn(50, 100))
        self._check_roundtrip(df, tm.assert_frame_equal)

    def test_append(self):
        df = tm.makeTimeDataFrame()
        self.store.put('c', df[:10], table=True)
        self.store.append('c', df[10:])
        tm.assert_frame_equal(self.store['c'], df)

    def test_append_diff_item_order(self):
        wp = tm.makePanel()
        wp1 = wp.ix[:, :10, :]
        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]

        self.store.put('panel', wp1, table=True)
        self.assertRaises(Exception, self.store.put, 'panel', wp2,
                          append=True)

    def test_remove(self):
        ts = tm.makeTimeSeries()
        df = tm.makeDataFrame()
        self.store['a'] = ts
        self.store['b'] = df
        self.store.remove('a')
        self.assertEquals(len(self.store), 1)
        tm.assert_frame_equal(df, self.store['b'])

        self.store.remove('b')
        self.assertEquals(len(self.store), 0)

    def test_remove_where_not_exist(self):
        crit1 = {'field': 'index', 'op': '>', 'value': 'foo'}
        self.store.remove('a', where=[crit1])

    def test_remove_crit(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]
        crit1 = {'field': 'index', 'op': '>', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}
        self.store.remove('wp', where=[crit1])
        self.store.remove('wp', where=[crit2])
        result = self.store['wp']
        expected = wp.truncate(after=date).reindex(minor=['B', 'C'])
        tm.assert_panel_equal(result, expected)

    def test_series(self):
        s = tm.makeStringSeries()
        self._check_roundtrip(s, tm.assert_series_equal)

        ts = tm.makeTimeSeries()
        self._check_roundtrip(ts, tm.assert_series_equal)

    def test_float_index(self):
        # GH #454
        index = np.random.randn(10)
        s = Series(np.random.randn(10), index=index)
        self._check_roundtrip(s, tm.assert_series_equal)

    def test_tuple_index(self):
        # GH #492
        col = np.arange(10)
        idx = [(0., 1.), (2., 3.), (4., 5.)]
        data = np.random.randn(30).reshape((3, 10))
        DF = DataFrame(data, index=idx, columns=col)
        self._check_roundtrip(DF, tm.assert_frame_equal)

    def test_timeseries_preepoch(self):
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            raise nose.SkipTest

        dr = DateRange('1/1/1940', '1/1/1960')
        ts = Series(np.random.randn(len(dr)), index=dr)
        try:
            self._check_roundtrip(ts, tm.assert_series_equal)
        except OverflowError:
            raise nose.SkipTest('known failer on some windows platforms')

    def test_frame(self):
        df = tm.makeDataFrame()

        # put in some random NAs
        df.values[0, 0] = np.nan
        df.values[5, 3] = np.nan

        self._check_roundtrip_table(df, tm.assert_frame_equal)
        self._check_roundtrip(df, tm.assert_frame_equal)

        self._check_roundtrip_table(df, tm.assert_frame_equal,
                                    compression=True)
        self._check_roundtrip(df, tm.assert_frame_equal, compression=True)

        tdf = tm.makeTimeDataFrame()
        self._check_roundtrip(tdf, tm.assert_frame_equal)
        self._check_roundtrip(tdf, tm.assert_frame_equal, compression=True)

        # not consolidated
        df['foo'] = np.random.randn(len(df))
        self.store['df'] = df
        recons = self.store['df']
        self.assert_(recons._data.is_consolidated())

        # empty
        self.assertRaises(ValueError, self._check_roundtrip, df[:0],
                          tm.assert_frame_equal)

    def test_can_serialize_dates(self):
        rng = [x.date() for x in DateRange('1/1/2000', '1/30/2000')]
        frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
        self._check_roundtrip(frame, tm.assert_frame_equal)

    def test_store_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=['A', 'B', 'C'])

        self._check_roundtrip(frame, tm.assert_frame_equal)
        self._check_roundtrip(frame.T, tm.assert_frame_equal)
        self._check_roundtrip(frame['A'], tm.assert_series_equal)

        # check that the names are stored
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = frame
            recons = store['frame']
            assert (recons.index.names == ['foo', 'bar'])
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_index_name(self):
        df = tm.makeDataFrame()
        df.index.name = 'foo'
        try:
            store = HDFStore(self.scratchpath)
            store['frame'] = df
            recons = store['frame']
            assert (recons.index.name == 'foo')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_series_name(self):
        df = tm.makeDataFrame()
        series = df['A']
        try:
            store = HDFStore(self.scratchpath)
            store['series'] = series
            recons = store['series']
            assert (recons.name == 'A')
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_store_mixed(self):
        def _make_one():
            df = tm.makeDataFrame()
            df['obj1'] = 'foo'
            df['obj2'] = 'bar'
            df['bool1'] = df['A'] > 0
            df['bool2'] = df['B'] > 0
            df['int1'] = 1
            df['int2'] = 2
            return df.consolidate()

        df1 = _make_one()
        df2 = _make_one()

        self._check_roundtrip(df1, tm.assert_frame_equal)
        self._check_roundtrip(df2, tm.assert_frame_equal)

        self.store['obj'] = df1
        tm.assert_frame_equal(self.store['obj'], df1)
        self.store['obj'] = df2
        tm.assert_frame_equal(self.store['obj'], df2)

        # storing in Table not yet supported
        self.assertRaises(Exception, self.store.put, 'foo', df1, table=True)

        # check that can store Series of all of these types
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal)

        # try with compression
        self._check_roundtrip(df1['obj1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['bool1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1['int1'], tm.assert_series_equal,
                              compression=True)
        self._check_roundtrip(df1, tm.assert_frame_equal, compression=True)

    def test_wide(self):
        wp = tm.makePanel()
        self._check_roundtrip(wp, tm.assert_panel_equal)

    def test_wide_table(self):
        wp = tm.makePanel()
        self._check_roundtrip_table(wp, tm.assert_panel_equal)

    def test_wide_table_dups(self):
        wp = tm.makePanel()
        try:
            store = HDFStore(self.scratchpath)
            store._quiet = True
            store.put('panel', wp, table=True)
            store.put('panel', wp, table=True, append=True)
            recons = store['panel']
            tm.assert_panel_equal(recons, wp)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_long(self):
        def _check(left, right):
            tm.assert_panel_equal(left.to_panel(), right.to_panel())

        wp = tm.makePanel()
        self._check_roundtrip(wp.to_frame(), _check)

        # empty
        self.assertRaises(ValueError, self._check_roundtrip,
                          wp.to_frame()[:0], _check)

    def test_longpanel(self):
        pass

    def test_overwrite_node(self):
        self.store['a'] = tm.makeTimeDataFrame()
        ts = tm.makeTimeSeries()
        self.store['a'] = ts

        tm.assert_series_equal(self.store['a'], ts)

    def test_panel_select(self):
        wp = tm.makePanel()
        self.store.put('wp', wp, table=True)
        date = wp.major_axis[len(wp.major_axis) // 2]

        crit1 = {'field': 'index', 'op': '>=', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}

        result = self.store.select('wp', [crit1, crit2])
        expected = wp.truncate(before=date).reindex(minor=['A', 'D'])
        tm.assert_panel_equal(result, expected)

    def test_frame_select(self):
        df = tm.makeTimeDataFrame()
        self.store.put('frame', df, table=True)
        date = df.index[len(df) // 2]

        crit1 = {'field': 'index', 'op': '>=', 'value': date}
        crit2 = {'field': 'column', 'value': ['A', 'D']}
        crit3 = {'field': 'column', 'value': 'A'}

        result = self.store.select('frame', [crit1, crit2])
        expected = df.ix[date:, ['A', 'D']]
        tm.assert_frame_equal(result, expected)

        result = self.store.select('frame', [crit3])
        expected = df.ix[:, ['A']]
        tm.assert_frame_equal(result, expected)

        # can't select if not written as table
        self.store['frame'] = df
        self.assertRaises(Exception, self.store.select,
                          'frame', [crit1, crit2])

    def test_select_filter_corner(self):
        df = DataFrame(np.random.randn(50, 100))
        df.index = ['%.3d' % c for c in df.index]
        df.columns = ['%.3d' % c for c in df.columns]
        self.store.put('frame', df, table=True)

        crit = {'field': 'column', 'value': df.columns[:75]}
        result = self.store.select('frame', [crit])
        tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

    def _check_roundtrip(self, obj, comparator, compression=False):
        # write obj to a fresh store, read it back, and compare
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store['obj'] = obj
            retrieved = store['obj']
            comparator(retrieved, obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def _check_roundtrip_table(self, obj, comparator, compression=False):
        # roundtrip via the table format; compares against the sorted object
        options = {}
        if compression:
            options['complib'] = _default_compressor

        store = HDFStore(self.scratchpath, 'w', **options)
        try:
            store.put('obj', obj, table=True)
            retrieved = store['obj']
            sorted_obj = _test_sort(obj)
            comparator(retrieved, sorted_obj)
        finally:
            store.close()
            os.remove(self.scratchpath)

    def test_legacy_read(self):
        pth = curpath()
        store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
        store['a']
        store['b']
        store['c']
        store['d']
        store.close()

    def test_store_datetime_fractional_secs(self):
        dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
        series = Series([0], [dt])
        self.store['a'] = series
        self.assertEquals(self.store['a'].index[0], dt)
def append_main_summary(newcols, storename_to_append, new_store_position_colname, new_store_glyglyseq_colname): """ This functions only appends data into summary dataframe. Data - comliant columns to main dataframe but with sequences which are not present in main dataframe :param newcols: column names in the main summary :param storename_to_append: store name to open and analyze :param new_store_position_colname: Position values column name in the new store :param new_store_glyglyseq_colname: Sequences values column name in the new store :return: appends main summary table with data, saves changes in the same dataframe in HDF store """ # Open summary from hdf store = HDFStore('_data_/ProteinDataStore.h5') data_summary = store['DataBases_Summary'] what_to_append = store[storename_to_append] # Get GlyGly values all_glygly_in_newstore = what_to_append[new_store_glyglyseq_colname].values all_glygly_in_summary = data_summary['GlyGly (K) Probabilities'].values # Find new sequences to add to the summary newcomer_seqs = [] for x in all_glygly_in_newstore: clear_seq = clear_sequence(x) if clear_seq not in all_glygly_in_summary: newcomer_seqs.append(clear_seq) print len(newcomer_seqs), ' new sequences were found in ', storename_to_append # Clean them as well clean_newcomer_seqs = map(lambda x: re.sub(r'[^A-Z]', '', x), newcomer_seqs) # Find comliant positions subset_index = what_to_append[new_store_glyglyseq_colname].isin( newcomer_seqs) # fetch subset where newcomers presents positions = what_to_append[subset_index][new_store_position_colname].values # BlastP query results blastpID_HUMAN, blastpID_RODENTS = fetch_indentity_from_local_batch(clean_newcomer_seqs) # convert to pandas series positions = pd.Series(positions) newcomer_seqs = pd.Series(newcomer_seqs) clean_newcomer_seqs = pd.Series(clean_newcomer_seqs) blastpID_HUMAN = pd.Series(blastpID_HUMAN) blastpID_RODENTS = pd.Series(blastpID_RODENTS) # Create empty dataframe to be appended data_summary_appendix = 
pd.DataFrame(columns=newcols) # Combine everything required in dataframe data_summary_appendix['Position'] = positions data_summary_appendix['GlyGly (K) Probabilities'] = newcomer_seqs data_summary_appendix['GlyGly Probabilities'] = clean_newcomer_seqs data_summary_appendix['SP_ID_BLASTP_HUMAN'] = blastpID_HUMAN data_summary_appendix['SP_ID_BLASTP_RODENTS'] = blastpID_RODENTS # Append main DataBases_Summary data_summary = data_summary.append(data_summary_appendix) # Save to HDF store store['DataBases_Summary'] = data_summary store.close()
def downloadCSV(self, startdate, waiting_time, download_dir, DaysTillStore):
    """Download daily patent CSV exports from Google Patents into an HDF store.

    Walks backwards one filing day at a time from ``startdate``, downloads the
    CSV export for each day through a Selenium-driven Chrome, accumulates the
    rows in a DataFrame, and flushes the accumulated rows into the
    ``patents.h5`` store every ``DaysTillStore`` days.

    :param startdate: date/datetime to start walking backwards from
    :param waiting_time: scale for the randomized wait between downloads
    :param download_dir: Chrome download directory (Windows paths assumed)
    :param DaysTillStore: flush interval, in days, for the HDF store writes
    """
    # Configure Chrome to download into download_dir without prompting.
    chrome_options = webdriver.ChromeOptions()
    preferences = {
        "download.default_directory": download_dir,
        "directory_upgrade": True,
        "safebrowsing.enabled": True,
        "extensions_to_open": ""
    }
    chrome_options.add_experimental_option("prefs", preferences)
    driver = webdriver.Chrome(chrome_options=chrome_options)
    patentfile = self.cwd + '\\PatentScraper\\patents.h5'
    # Accumulator for rows harvested since the last flush to the store.
    Patent_info = pd.DataFrame(data=None, columns=[
        "id", "title", "assignee", "inventor/author", "priority date",
        "filing/creation date", "publication date", "grant date",
        "result link"
    ])
    for i in range(100000):
        # Filing-day window [startdate - i - 1, startdate - i] as YYYYMMDD.
        time_delta_before = datetime.timedelta(days=i)
        time_delta_after = datetime.timedelta(days=i + 1)
        Date_before = startdate - time_delta_before
        Date_before = Date_before.strftime("%Y%m%d")
        Date_after = startdate - time_delta_after
        Date_after = Date_after.strftime("%Y%m%d")
        # I leave this in because OUR download speed is not the limiting
        # factor.  (The bare string below is a no-op statement used as a
        # comment by the original author; kept unchanged.)
        'check internet connection; only continue if connection is given'
        connection = 0
        while connection < 1:
            try:
                urllib.request.urlopen('http://www.python.org/')
                #return True
                connection = 1
            except:
                connection = 0
                time.sleep(3)
                print('no connection')
                continue
        driver_connection = 0
        'check if webdriver is still running; only continue if functionality is given'
        while driver_connection < 1:
            try:
                driver.get('file:///C:/')
                driver_connection = 1
            except:
                driver_connection = 0
                time.sleep(3)
                print('driver crashed')
                # NOTE(review): the restarted driver loses the download
                # preferences configured above -- confirm this is intended.
                driver = webdriver.Chrome()
                continue
        # "download=true" makes Google Patents serve the CSV export directly.
        url = 'https://patents.google.com/xhr/query?url=before%3Dfiling%3A' + str(
            Date_before) + '%26after%3Dfiling%3A' + str(
                Date_after) + '&exp=&download=true'
        print(i, Date_before)
        driver.get(url)
        # Randomized politeness delay; randNR is a 1x1 numpy matrix that
        # time.sleep coerces to a scalar.
        randNR = 1 + numpy.matlib.rand(1, 1) * waiting_time
        time.sleep(randNR)
        # Assume the newest file in the download dir is the fresh CSV.
        filename = max(
            [download_dir + "\\" + f for f in os.listdir(download_dir)],
            key=os.path.getctime)
        shutil.move(os.path.join(download_dir, filename),
                    str(Date_before) + "_filing.csv")
        # skiprows=1 drops the banner line of the export, header=0 reads
        # the real column names from the next line.
        Data = pd.read_csv(str(Date_before) + "_filing.csv",
                           skiprows=(1), header=(0))
        # Project each column into its own frame, then glue them back
        # together in a fixed column order.  (NOTE(review): the local name
        # `id` shadows the builtin within this loop body.)
        id = pd.DataFrame(Data, columns=["id"])
        title = pd.DataFrame(Data, columns=["title"])
        assignee = pd.DataFrame(Data, columns=["assignee"])
        inventor_author = pd.DataFrame(Data, columns=["inventor/author"])
        priority_date = pd.DataFrame(Data, columns=["priority date"])
        filing_creation_date = pd.DataFrame(
            Data, columns=["filing/creation date"])
        publication_date = pd.DataFrame(Data, columns=["publication date"])
        grant_date = pd.DataFrame(Data, columns=["grant date"])
        result_link = pd.DataFrame(Data, columns=["result link"])
        Current_Patent_Content = np.concatenate(
            (id, title, assignee, inventor_author, priority_date,
             filing_creation_date, publication_date, grant_date,
             result_link), axis=1)
        df = pd.DataFrame(Current_Patent_Content, columns=[
            "id", "title", "assignee", "inventor/author", "priority date",
            "filing/creation date", "publication date", "grant date",
            "result link"
        ])
        Patent_info = Patent_info.append(df)
        # Flush condition: true whenever i is a multiple of DaysTillStore
        # (including i == 0).
        if i == math.trunc(i / DaysTillStore) * DaysTillStore:
            store = HDFStore(patentfile, complevel=4)
            Patent_info = Patent_info[[
                "id", "title", "assignee", "inventor/author", "priority date",
                "filing/creation date", "publication date", "grant date",
                "result link"
            ]]
            print("cumm", Patent_info.shape)
            # Append to the stored frame if present, otherwise create it.
            try:
                Patent_info_store = store['Patent_info']
                Patent_info_store = Patent_info_store.append(Patent_info)
            except KeyError:
                Patent_info_store = Patent_info
            print(Patent_info_store.shape)
            store['Patent_info'] = Patent_info_store
            # compress file... otherwise it will by 100ds of GB large -
            # Compressed already in store without command file
            # store.close()
            # outfilename = self.cwd +'\\PatentScraper\\out.h5'
            # command = ["ptrepack", "-o", "--chunkshape=auto", "--propindexes", patentfile, outfilename]
            # print('Size of %s is %.2fMB' % (patentfile, float(os.stat(patentfile).st_size)/1024**2))
            # if call(command) != 0:
            #     print('Error')
            # else:
            #     print('Size of %s is %.2fMB' % (outfilename, float(os.stat(outfilename).st_size)/1024**2))
            #     os.remove(patentfile)
            #     os.renames(outfilename, patentfile)
            store.close()
            print(
                'Size of %s is %.2fMB' %
                (patentfile, float(os.stat(patentfile).st_size) / 1024**2))
            # Reset DataFrame
            Patent_info = pd.DataFrame(
                data=None,
                columns=[
                    "id", "title", "assignee", "inventor/author",
                    "priority date", "filing/creation date",
                    "publication date", "grant date", "result link"
                ])
    print("done")
    driver.close()
print "Loading ABC" reader = StataReader(paths.abc) abcd = reader.read(convert_dates=False, convert_categoricals=False) abcd.id.fillna(9999, inplace=True) abcd = abcd.set_index('id') abcd.drop(abcd.loc[(abcd.RV == 1) & (abcd.R == 0)].index, inplace=True) inc = abcd.filter(regex='^inc_labor[0-9][0-9]') along = pd.wide_to_long(abcd[inc.columns].reset_index(), ['inc_labor'], i='id', j='age').sort_index() along = along.interpolate(limit=1) awide = along.unstack() awide.columns = awide.columns.droplevel(0) awide.columns = ['{}{}'.format('inc_labor', a) for a in awide.columns] abcd[awide.columns] = awide abcd = abcd.loc[:, unique_list(['R'] + cols.interpABC.keep)] print abcd print "Storing Datasets in HDF5 Format" datasets = [('psid-labor', psid), ('nlsy-labor', nlsy), ('extrap-labor', extrap), ('abc-mini', abcd)] store = HDFStore(os.path.join(paths.data, 'data.h5')) for name, d in datasets: d.to_hdf(os.path.join(paths.data, 'data.h5'), key=name) store.close()