def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first
    # (the first partitions are slowed down and complete last)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files, with the same artificial delays
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                             'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8,
                             9, 10, 11, 12, 13, 14, 15, 16]},
                      index=[1., 2., 3., 4., 5., 6., 7., 8.,
                             9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
def test_hdf_globbing():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
                      index=[1.0, 2.0, 3.0, 4.0])

    tdir = tempfile.mkdtemp()
    try:
        df.to_hdf(os.path.join(tdir, "one.h5"), "/foo/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/bar/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/foo/data", format="table")

        res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2
        tm.assert_frame_equal(res.compute(), df)

        res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data",
                          chunksize=2, start=1, stop=3)
        expected = pd.read_hdf(os.path.join(tdir, "one.h5"), "/foo/data",
                               start=1, stop=3)
        tm.assert_frame_equal(res.compute(), expected)

        res = dd.read_hdf(os.path.join(tdir, "two.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/foo/data", chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2 + 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
    finally:
        shutil.rmtree(tdir)
def test_read_hdf_multiply_open():
    """Test that we can read from a file that's already opened elsewhere in
    read-only mode."""
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        with pd.HDFStore(fn, mode='r'):
            dd.read_hdf(fn, '/data', chunksize=2, mode='r')
def test_read_hdf(data, compare):
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2, mode='r')
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), data)

        compare(dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3,
                            mode='r').compute(),
                pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data', mode='r').dask) ==
                sorted(dd.read_hdf(fn, '/data', mode='r').dask))

    with tmpfile('h5') as fn:
        sorted_data = data.sort_index()
        sorted_data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, sorted_index=True, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), sorted_data)
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            assert_eq(df, out)
def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data1')
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a', append=False)
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)
def test_read_hdf_doesnt_segfault():
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        N = 40
        df = pd.DataFrame(np.random.randn(N, 3))
        with pd.HDFStore(fn, mode='w') as store:
            store.append('/x', df)

        ddf = dd.read_hdf(fn, '/x', chunksize=2)
        assert len(ddf) == N
def test_read_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2)
        assert a.npartitions == 2
        assert a._known_dtype

        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
            dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3).compute(),
            pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data').dask) ==
                sorted(dd.read_hdf(fn, '/data').dask))
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'), 'dataframe',
                           format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'), 'dataframe',
                           format='table')

        with dask.set_options(get=dask.get):
            input_files = [os.path.join(tdir, 'one.h5'),
                           os.path.join(tdir, 'two.h5')]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
def test_read_hdf_multiple():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                             'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8,
                             9, 10, 11, 12, 13, 14, 15, 16]},
                      index=[1., 2., 3., 4., 5., 6., 7., 8.,
                             9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        r = dd.read_hdf(fn, '/data*', sorted_index=True)
        assert a.npartitions == r.npartitions
        assert a.divisions == r.divisions
        eq(a, r)
def load_datasets(self, outofcore):
    data_path = self.config['file']
    print('Loading Data from {}...'.format(data_path))

    if not path.isabs(data_path):
        config_dir = path.split(self.config_path)[0]
        data_path = path.join(config_dir, data_path)

    if not path.exists(data_path):
        raise IOError('Unable to find input dataset: "{}"'.format(data_path))

    axes_fields = []
    for f in self.axes.values():
        axes_fields += [f[1], f[2]]

    load_fields = [f for f in self.fields.values() if f is not None] + axes_fields

    if data_path.endswith(".csv"):
        self.df = pd.read_csv(data_path, usecols=load_fields)

        # parse categorical fields
        for f in self.categorical_fields:
            self.df[f] = self.df[f].astype('category')

    elif data_path.endswith(".castra"):
        import dask.dataframe as dd
        self.df = dd.from_castra(data_path)
        if not outofcore:
            self.df = self.df.cache(cache=dict)

    elif data_path.endswith(".hdf"):
        import dask.dataframe as dd
        self.df = dd.read_hdf(data_path, key="census")
        if not outofcore:
            self.df = self.df.cache(cache=dict)

    else:
        raise IOError("Unknown data file type; .csv, .castra and .hdf currently supported")
def test_read_hdf_start_stop_values():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', stop=10)
        assert 'number of rows' in str(e)

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', start=10)
        assert 'is above or equal to' in str(e)

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', chunksize=-1)
        assert 'positive integer' in str(e)
def test_read_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
                      index=[1.0, 2.0, 3.0, 4.0])
    with tmpfile("h5") as fn:
        df.to_hdf(fn, "/data")
        try:
            dd.read_hdf(fn, "/data", chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile("h5") as fn:
        df.to_hdf(fn, "/data", format="table")
        a = dd.read_hdf(fn, "/data", chunksize=2)
        assert a.npartitions == 2

        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
            dd.read_hdf(fn, "/data", chunksize=2, start=1, stop=3).compute(),
            pd.read_hdf(fn, "/data", start=1, stop=3),
        )

        assert sorted(dd.read_hdf(fn, "/data").dask) == sorted(
            dd.read_hdf(fn, "/data").dask)
def au_data_frame(directory: str) -> df:
    return df.read_hdf(os.path.join(directory, 'hdfs', 'au_*.hdf'), '/data')
                    help='print debugging information',
                    action='store_true',
                    required=False)
args = parser.parse_args()
finput = args.paired_data
reg = args.regulatory
startdate = args.startdate
enddate = args.enddate
species = args.species
subset_giorgi = args.subset_giorgi
giorgi_regions = args.giorgi_regions
verbose = args.verbose

for ee in giorgi_regions:
    df = dd.read_hdf(finput, '/*').compute()

    mapping_table = {'pm25_ugm3': 'sfc_pm25', 'pm10_ugm3': 'sfc_pm10'}
    sub_map = {i: mapping_table[i] for i in species if i in mapping_table}

    # subsetting data for dates, regulatory calc, and/or giorgi regions
    if startdate != None and enddate != None:
        mask = (df['time'] >= startdate) & (df['time'] <= enddate)
        df = df.loc[mask]
        import datetime
        startdatename_obj = datetime.datetime.strptime(
            startdate, '%Y-%m-%d %H:%M:%S')
        enddatename_obj = datetime.datetime.strptime(
            enddate, '%Y-%m-%d %H:%M:%S')
        startdatename = str(
            datetime.datetime.strftime(startdatename_obj, '%Y-%m-%d_%H'))
        enddatename = str(
            datetime.datetime.strftime(enddatename_obj, '%Y-%m-%d_%H'))
def time_read_hdf5(self, scheduler):
    (dd.read_hdf('{}/*.hdf5'.format(self.data_dir), 'key')
       .compute(scheduler=scheduler))
        del data_test  # drop the test frame, no longer needed
        return test_transformed


if __name__ == "__main__":
    # init class
    start_time = time.time()
    data = LoadBigCsvFile(train, test).read_data()
    gc.collect()
    print('class loaded in %s seconds' % (time.time() - start_time))
    time.sleep(1)  # set some time gap

    # save to hdf for later use or modification
    start_time = time.time()
    data.to_hdf('test_proc.hdf', key='df1')
    print('file saved in hdf in %s seconds' % (time.time() - start_time))
    time.sleep(1)  # set some time gap
    print()

    # check the file and its content, reading it back lazily with dask
    # (assumes dask.dataframe is imported as dd)
    start_time = time.time()
    hdf_read = dd.read_hdf('test_proc.hdf', key='df1', mode='r', chunksize=10000)
    print('file load into system in %s seconds' % (time.time() - start_time))
    print(hdf_read.head(3))
def find_scores(patient_dir: str, refresh=True): """ Finds the scores for a specific patient directory :param patient_dir: Directory to look in """ if not refresh and 'au_w_anno.hdf' in os.listdir( os.path.join(patient_dir, 'hdfs')): return try: patient, day, session = patient_day_session(patient_dir) try: au_frame = df.read_hdf(os.path.join(patient_dir, 'hdfs', 'au.hdf'), '/data') except ValueError as e: print(e) return # except ValueError as e: # print(e) # return if 'frame' not in au_frame.columns: return annotated_values = ["N/A" for _ in range(len(au_frame.index))] # here are the hand annotations csv_path = os.path.join('/home/emil/emotion_annotations', patient_dir.replace('cropped', 'emotions.csv')) # video length is the same as length of the corresponding AU file num_frames = len(annotated_values) if num_frames != len(au_frame): print('this is wrong') print(num_frames) print(len(au_frame)) exit() # find annotations, if exist. Else just leave the nans if os.path.exists(csv_path): csv_dict = csv_emotion_reader(csv_path) if csv_dict: annotated_ratio = int(num_frames / len(csv_dict)) if annotated_ratio > 1: print('HELLO HERE IS SUCH A CASE:', patient_dir) print('num_frames:', num_frames) print('len of annots:', len(csv_dict)) if annotated_ratio == 0: annotated_ratio = 1 csv_dict = { i * annotated_ratio: c for i, c in csv_dict.items() } for i in [ x for x in csv_dict.keys() if 'None' not in csv_dict[x] ]: to_write = clean_to_write(csv_dict[i]) if i in range(len(annotated_values)): annotated_values[i] = to_write # au_frame = au_frame.assign(annotated=annotated_values) # au_frame = au_frame.set_index('frame') # au_frame["annotated"] = df.from_array(da.from_array(annotated_values, chunks=5)) annotated_values = da.from_array(annotated_values, chunks='auto').compute() # what we know: au_frame['frame'] starts at 1, goes to (including) 3604 # annotated_values has length we want, but currently (with the +1) a length of 3605 # au_frame has a length of 3604 (makes sense, 1-3604) au_frame = au_frame.compute() au_frame = au_frame.assign( annotated=lambda x: annotated_values[x['frame'] - 1]) au_frame.to_hdf(os.path.join(patient_dir, 'hdfs', 'au_w_anno.hdf'), '/data', format='table') except FileNotFoundError as not_found_error: print(not_found_error) except AttributeError as e: print(e)
import pandas as pd
import dask.dataframe as dd
import numpy as np

fname = 'testfile.h5'
key = '/group/dataset'
dtype = np.dtype([('idx', 'i4'), ('val_a', 'f8'), ('val_b', 'f8')])
N = 10000
cs = 1000

#store = pd.HDFStore(output_fname, mode='w')
with pd.HDFStore(fname, mode='w') as store:
    recarray = np.empty(N, dtype)
    df = pd.DataFrame.from_records(recarray)
    store.append(key, df)

df = pd.read_hdf(fname, key)
ddf = dd.read_hdf(fname, key, chunksize=cs)

print(len(df))
print(len(ddf))
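# A minimal follow-up sketch (not part of the original snippet): with the names
# defined above (N rows, chunksize cs), the lazy dask frame should be split into
# roughly ceil(N / cs) partitions, which can be inspected before calling
# .compute().
import math

expected_parts = math.ceil(N / cs)
print(ddf.npartitions, expected_parts)  # both should be 10 for N=10000, cs=1000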
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # saving to multiple datasets
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple files
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple datasets with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        tm.assert_frame_equal(df, out.compute())
        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())
        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to different datasets in multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
def process_vid_dir(eyebrow_dict: dict, vid_dir: str) -> None: # all_dict_file = os.path.join(vid_dir, 'all_dict.txt') patient_name = vid_dir.split('_')[0] all_dict_folder = ('all_' + patient_name) already_ran_file = os.path.join(vid_dir, 'already_ran.txt') diff_dict = json.load( open(already_ran_file)) if os.path.exists(already_ran_file) else {} if vid_dir not in diff_dict: diff_dict[vid_dir] = {} emotion_frame = df.read_hdf(os.path.join( all_dict_folder, 'hdfs' '*.hdf'), '/data') if os.path.exists( all_dict_folder) else AUScorer.au_data_frame(vid_dir) # emotion_dict = AUScorer.convert_dict_to_int( # json.load(open(all_dict_file))) if os.path.exists( # all_dict_file) else AUScorer.AUScorer(vid_dir).presence_dict include_eyebrows = eyebrow_dict and vid_dir in eyebrow_dict['Eyebrows'] pre_func_list = [(re_crop_vid_dir, 're_crop'), (throw_vid_in_reverse, 'reverse'), (reverse_re_crop_vid_dir, 'reverse_re_crop')] post_func_list = [(invert_colors, 'invert_colors'), (lower_gamma, 'low_gamma'), (increase_gamma, 'high_gamma')] dir_list = [name for _, name in pre_func_list + post_func_list] to_do_list = [x for _, x in pre_func_list if x not in diff_dict[vid_dir]] for func, name in pre_func_list + post_func_list: if name not in diff_dict[vid_dir]: post_func_frame = func(get_vid_from_dir(vid_dir), vid_dir, include_eyebrows) update_frames(post_func_frame=post_func_frame, emotion_frame=emotion_frame, diff_dict=diff_dict, vid_dir=vid_dir, name=name, func_name='as-is') for pre_dir in to_do_list: if os.path.exists(os.path.join(vid_dir, pre_dir)): if pre_dir not in diff_dict[vid_dir]: diff_dict[vid_dir][pre_dir] = {} for func, name in post_func_list: if name not in diff_dict[vid_dir][pre_dir]: full_path = os.path.join(vid_dir, pre_dir) post_func_frame = func( glob.glob(os.path.join(full_path, '*.avi'))[0], full_path, include_eyebrows) update_frames(post_func_frame, emotion_frame, diff_dict, vid_dir, pre_dir, name) # json.dump(emotion_dict, open(all_dict_file, 'w')) json.dump(diff_dict, open(already_ran_file, 'w')) for pre_dir in dir_list: if os.path.exists(os.path.join(vid_dir, pre_dir)): shutil.rmtree(os.path.join(vid_dir, pre_dir))
    return (all_models_with_postValiMods)


# load ensemble weights
d = pickle.load(
    open(data_intermed_nb_fldrpath + "/postvali_ensemble_weights.p", "rb"))
ranked1_ensemble_weights = d["ranked1_ensemble_weights"]
ranked5_ensemble_weights = d["ranked5_ensemble_weights"]
del d

# create post-validation sample models on the validation sample
## load component models
all_models_dd = dd.read_hdf(all_models_store_path,
                            key="/all_models_Rehol_NotPlugged/valicast",
                            chunksize=dd_chunksize_valicast)

## add ranked 1 and ranked 5 predictions
all_models_with_postValiMods_dd = add_post_vali_mods(
    df_dd=all_models_dd,
    ensemble_weights=ranked1_ensemble_weights,
    ensemble_name="postValiMod_ens_ranked1")
all_models_with_postValiMods_dd = add_post_vali_mods(
    df_dd=all_models_with_postValiMods_dd,
    ensemble_weights=ranked5_ensemble_weights,
    ensemble_name="postValiMod_ens_ranked5")

all_models_with_postValiMods_dd = all_models_with_postValiMods_dd[[
    "daily_level", "daily_untouched", "postValiMod_ens_ranked1",
def test_to_hdf_multiple_datasets():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple datasets making sure order is kept
    with tmpfile('h5') as fn:
        b.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple datasets
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple datasets with custom name_function
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        eq(df, out)
        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)
        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to different datasets in multiple files with custom name_function
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            eq(df, out)
def test_to_hdf_schedulers(scheduler, npartitions):
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df, npartitions=npartitions)

    # test single file single node
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = pd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test single file multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Imputer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from zillow import modelling
import pickle as pkl
import dask.dataframe as dd

test_df = dd.read_hdf("input/test_20172.*.hdf", "data")

with open("input/feat_names.pkl", "rb") as f:
    feat_names = pkl.load(f)

with open("input/feat_names_both.pkl", "rb") as f:
    feat_names_both = pkl.load(f)

test_df = test_df

with open("input/encoders.pkl", "rb") as f:
    encoders = pkl.load(f)

for col, encoder in encoders.items():
    print(encoder.classes_)
    print(test_df[col].head(1000, npartitions=10).unique())
    test_df[col] = test_df.map_partitions(
        lambda x: encoder.transform(
            list(x[col].replace(np.nan, encoder.classes_[0]).values)),
        meta=pd.Series(dtype=np.float32))

with open("input/log_cols.pkl", "rb") as f:
    log_cols = pkl.load(f)
def sumTokenCounts(stores,data): max_str_bytes = 50 chunksize = 100000 batch_limit = 6*10**8 savestore = data + "final/fromnodes-323.h5" for storefile in stores: print(storefile) logging.info("Next store: %s" % storefile) try: # Get Unique languages with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store: langs = set([key.split("/", maxsplit=-1)[-1] for key in store.keys() if 'merged1' in key]) except: logging.exception("Can't read languages from %s" % storefile) continue for lang in langs: batch = False logging.info("Starting lang %s from %s" % (lang, storefile)) print(lang) if not re.match('[a-z]{3}', lang): logging.error("lang '%s' is not three alphanumeric characters. Skipping for now. (%s)" % (lang, storefile)) continue try: ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, mode='r') except: logging.exception("Can't load Dask DF for %s in %s" % (lang, storefile)) continue # Assuming partitions are equally sized, which they should be if read from a single file if ddf.npartitions > np.ceil(batch_limit/chunksize): batch = True niters = np.floor((ddf.npartitions*chunksize)/batch_limit) i = 0 while True: if batch: start = i * batch_limit logging.info("Starting batch %d for %s" % (i, lang)) if i == niters: # Last batch, no stop value ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, start=start) batch = False else: ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize,start=start, stop=(start+batch_limit)) i += 1 try: logging.info("Starting full merge for %s with %d partitions" % (lang, ddf.npartitions)) with ProgressBar(): full_merge = ddf.reset_index().groupby('token').sum().compute() #if lang == 'eng': # For curiosity: see the profiling for English # prof.visualize() logging.info("Success! Saving merged.") # The /fromnodes table is the sum from all the different stores, but will need to be summed one more time with pd.HDFStore(savestore, complevel=9, mode="a", complib='blosc') as store: store.append(lang,full_merge,data_columns=['count'],min_itemsize = {'index': max_str_bytes}) except: logging.exception("Can't compute or save lang for %s in %s" % (lang, storefile)) if batch == False: break
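# A simplified sketch of the batching idea used in sumTokenCounts above (not the
# original code): when a single HDF node is too large to merge in one pass, read
# it in windows with dd.read_hdf's start/stop arguments and aggregate each
# window, then sum the per-window results once more. The store path, key, and
# sizes below are hypothetical.
import pandas as pd
import dask.dataframe as dd

chunksize = 100000
batch_limit = 6 * 10**8
storefile, lang_key = 'tokencounts.h5', '/merged1/eng'   # hypothetical

with pd.HDFStore(storefile, mode='r') as store:
    nrows = store.get_storer(lang_key).nrows

partials = []
for start in range(0, nrows, batch_limit):
    ddf = dd.read_hdf(storefile, lang_key, chunksize=chunksize, mode='r',
                      start=start, stop=min(start + batch_limit, nrows))
    partials.append(ddf.reset_index().groupby('token').sum().compute())

# one more sum over the per-batch results, mirroring the final merge step
full_merge = pd.concat(partials).groupby(level='token').sum()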
          'axes.labelsize': 'xx-large',
          'axes.titlesize': 'xx-large',
          'xtick.labelsize': 'xx-large',
          'ytick.labelsize': 'xx-large'}
pylab.rcParams.update(params)

w_dir = 'D:/BIG DATA'
directory_load = 'Merged Files'
directory_save = 'Strategy Reversal'
os.chdir(w_dir)

ccys = ['EURUSD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURAUD']

df = dd.read_hdf(directory_load + '/' + ccys[1] + '-' + str(2014) + '.h5',
                 ccys[1] + str(2014) + '05')

plt.figure()
plt.plot(df.compute())

#%%
with open(directory_save + '/Report.txt', 'r') as fout:
    file = fout.read()

#%%
lines = file.split()
read_ccy = False
read_PL = False
performance_usd = []
performance_chf = []
def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, reverse_index=True, table_count=1): import time t0 = time.time() db = self.db ngramname = "unigrams" tablenameroot = "master_bookcounts" # If you are splitting the input into multiple tables # to be joined as a merge table, come up with multiple # table names and we'll cycle through. if table_count == 1: tablenames = [tablenameroot] elif table_count > 1: tablenames = [ "%s_p%d" % (tablenameroot, i) for i in range(1, table_count + 1) ] else: logging.error("You need a positive integer for table_count") raise grampath = ".bookworm/texts/encoded/%s" % ngramname tmpdir = "%s/tmp" % grampath if (len(grampath) == 0) or (grampath == "/"): logging.error( "Woah! Don't set the ngram path to your system root!") raise if newtable: if os.path.exists(tmpdir): import shutil shutil.rmtree(tmpdir) logging.info("Dropping older %s table, if it exists" % ngramname) for tablename in tablenames: db.query("DROP TABLE IF EXISTS " + tablename) logging.info("Making a SQL table to hold the %s" % ngramname) reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else "" for tablename in tablenames: db.query( "CREATE TABLE IF NOT EXISTS " + tablename + " (" "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " "count MEDIUMINT UNSIGNED NOT NULL);") if ingest: for tablename in tablenames: db.query("ALTER TABLE " + tablename + " DISABLE KEYS") db.query("set NAMES utf8;") db.query("set CHARACTER SET utf8;") logging.info("loading data using LOAD DATA LOCAL INFILE") files = os.listdir(grampath) for i, filename in enumerate(files): if filename.endswith('.txt'): # With each input file, cycle through each table in tablenames tablename = tablenames[i % len(tablenames)] logging.debug("Importing txt file, %s (%d/%d)" % (filename, i, len(files))) try: db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" + filename + "' INTO TABLE " + tablename + " CHARACTER SET utf8 (bookid,wordid,count);") except KeyboardInterrupt: raise except: logging.debug( "Falling back on insert without LOCAL DATA INFILE. Slower." ) try: import pandas as pd df = pd.read_csv(grampath + "/" + filename, sep='\t', header=None) to_insert = df.apply(tuple, axis=1).tolist() db.query("INSERT INTO " + tablename + " (bookid,wordid,count) " "VALUES (%s, %s, %s);" "", many_params=to_insert) except KeyboardInterrupt: raise except: logging.exception("Error inserting %s from %s" % (ngramname, filename)) continue elif filename.endswith('.h5'): logging.info("Importing h5 file, %s (%d/%d)" % (filename, i, len(files))) try: # When encountering an .h5 file, this looks for ngram information # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to # temporary TSV files. # Dask is used here simply because it's a dead simple way to multithread # the TSV writing and lower the overhead versus having a TSV already staged. import csv import pandas as pd try: import dask.dataframe as dd except: logging.exception( "Ingesting h5 files requires dask") try: os.makedirs(tmpdir) except OSError: if not os.path.isdir(tmpdir): raise # Dask will use #{n_cores-1} threads when saving CSVs. # Ingest and key reload times are identical to txt import, so the only # additional overhead is reading the file (small effect) and writing the csv. 
ddf = dd.read_hdf(grampath + "/" + filename, ngramname, mode='r', chunksize=2000000) ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv', index=False, sep='\t', header=False, quoting=csv.QUOTE_NONNUMERIC) logging.info( "CSV written from H5. Time passed: %.2f s" % (time.time() - t0)) for j, tmpfile in enumerate(os.listdir(tmpdir)): # With each input file, cycle through each table in tablenames tablename = tablenames[j % len(tablenames)] path = "%s/%s" % (tmpdir, tmpfile) db.query( "LOAD DATA LOCAL INFILE '" + path + "' " "INTO TABLE " + tablename + " " "CHARACTER SET utf8 (bookid,wordid,count);") try: os.remove(path) except: pass logging.info("CSVs input. Time passed: %.2f s" % (time.time() - t0)) except KeyboardInterrupt: raise except: logging.exception("Error inserting %s from %s" % (ngramname, filename)) continue else: continue if index: logging.info("Creating Unigram Indexes. Time passed: %.2f s" % (time.time() - t0)) for tablename in tablenames: db.query("ALTER TABLE " + tablename + " ENABLE KEYS") if table_count > 1: logging.info("Creating a merge table for " + ",".join(tablenames)) db.query( "CREATE TABLE IF NOT EXISTS " + tablenameroot + " (" "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " "count MEDIUMINT UNSIGNED NOT NULL) " "ENGINE=MERGE UNION=(" + ",".join(tablenames) + ") INSERT_METHOD=LAST;") logging.info("Unigram index created in: %.2f s" % ((time.time() - t0)))
def score(self, spaceagg, timeagg, store_minimum=False, pp_model=None, quantile=''): """ Read the obs and clim. make a comparison object which computes the scores in the dask dataframe. This dask dataframe is exported. Returns a list with intermediate filenames of the raw, climatological and corrected scores. Has a post-processing step if the pp_model is supplied. Fit is the same regardless of the quantile, so done only once. If there are no quantiles to predict or binary variable, we force equidistant sampling (random = True led to overestimations of the crps) """ alignment = ForecastToObsAlignment(season=self.season, cycle=self.cycle) alignment.recollect(booksname=self.log.loc[(spaceagg, timeagg), ('booksname', '')]) climatology = Climatology( self.basevar, **{ 'name': self.log.loc[(spaceagg, timeagg), ('climname', quantile)] }) climatology.localclim( ) # loading in this case. Creation was done in the makeclim method. if not self.log.loc[(spaceagg, timeagg), ( 'modelclimname', [quantile] )].isna().any( ): # Supply model quantile climatology if that was computed earlier. Will be preferred for the raw briescoring in the comparison Class modelclimatology = ModelClimatology( cycle=self.cycle, variable=self.basevar, **{ 'name': self.log.loc[(spaceagg, timeagg), ('modelclimname', quantile)] }) modelclimatology.local_clim() assert self.newvar == 'anom', 'This modelclimatology has likely no adapted units, only when anomalies the quantiles in Kelvin will be compatible with the aligned forecast anomalies in Celsius.' else: modelclimatology = None comp = Comparison(alignment=alignment, climatology=climatology, modelclimatology=modelclimatology) # Fitting or accepting external fits (meaning the column is already filled): if not pp_model is None: if not isinstance( self.log.loc[(spaceagg, timeagg), ('externalfits', quantile)], str): comp.fit_pp_models(pp_model=pp_model, groupers=['leadtime', 'clustid']) firstfitname = comp.export(fits=True, frame=False) self.log.loc[(spaceagg, timeagg), ( 'externalfits', slice(None) )] = firstfitname # Specifically useful for the looping over quantiles. else: fitname = self.log.loc[(spaceagg, timeagg), ('externalfits', quantile)] print('loading fit from:', fitname) comp.fits = dd.read_hdf( comp.basedir + fitname + '.h5', key='fits') # Loading of the fits of the first quantile. comp.fitgroupers = ['leadtime', 'clustid'] # Going to the scoring. if isinstance(quantile, float): if not pp_model is None: comp.make_pp_forecast(pp_model=pp_model) comp.brierscore() else: if not pp_model is None: comp.make_pp_forecast(pp_model=pp_model, random=False, n_members=self.ndraws if isinstance( pp_model, NGR) else None) comp.export(fits=False, frame=False, preds=True) if (self.newvar is None) or (self.newvar == 'anom'): comp.crpsscore() else: # Meaning a custom binary predictand comp.brierscore() scorefile = comp.export(fits=False, frame=True, store_minimum=store_minimum) return (scorefile)
def h5_load_range_by_coord(db_path, table,
                           range_coordinates: Optional[Sequence] = None,
                           columns=None, chunksize=None, sorted_index=None,
                           **kwargs) -> dd.DataFrame:
    """
    Load a range (by integer indexes) of hdf5 data into a dask dataframe.
    :param db_path: str
    :param table: str
    :param range_coordinates: controls/limits the range of data loaded:
        tuple of int (start and end indexes) - limit the returned dask dataframe to this range
        empty tuple - raise Ex_nothing_done
        None - load all data
    dask.read_hdf() parameters:
    :param chunksize:
    :param sorted_index (optional): bool, default True
    :param columns: passed without change to dask.read_hdf()
    """
    if sorted_index is None:
        sorted_index = True

    if range_coordinates is None:  # start and stop not specified
        print("h5_load_range_by_coord(all)")
        # ?! This is the only option in dask to load a sorted index
        ddpart = dd.read_hdf(db_path, table,
                             chunksize=chunksize,
                             lock=True,
                             mode='r',
                             columns=columns,
                             sorted_index=sorted_index)
    elif not len(range_coordinates):
        raise Ex_nothing_done('no data')
    else:
        ddpart_size = -np.subtract(*range_coordinates)
        if not ddpart_size:
            return dd.from_array(
                np.zeros(0, dtype=[('name', 'O'),
                                   ('index', 'M8')]))  # DataFrame({},'NoData', {}, []) # None
        # if ddpart_size < chunksize:
        #     chunksize = ddpart_size  # !? needed to not load more data than needed
        # else:
        chunksize = ddpart_size  # !? else loads more data than needed. Adjust chunksize to divide ddpart_size into equal parts?
        # 'sorted_index' does not work together with start/stop, so load without it
        for c in [False, True]:
            # try with the specified columns first
            try:
                ddpart = dd.read_hdf(db_path, table,
                                     chunksize=chunksize,
                                     lock=True,
                                     mode='r',
                                     columns=columns,
                                     start=range_coordinates[0],
                                     stop=range_coordinates[-1])
                break
            except KeyError:  # some of the specified columns do not exist
                # use only the columns that do exist
                with pd.HDFStore(db_path, mode='r') as store:
                    columns = store[table].columns.join(columns, how='inner')
                print('found columns:', columns.values)
        # because 'sorted_index' could not be used we need:
        ddpart = ddpart.reset_index().set_index(ddpart.index.name or 'index',
                                                sorted=sorted_index)  # 'Time'
    return ddpart
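# A minimal usage sketch for the function above (not from the original source).
# 'example.h5', 'table_name' and the column names are hypothetical, and the
# store is assumed to have been written with format='table' so that start/stop
# slicing works.
ddf_all = h5_load_range_by_coord('example.h5', 'table_name', chunksize=100000)
ddf_part = h5_load_range_by_coord('example.h5', 'table_name',
                                  range_coordinates=(1000, 2000),
                                  columns=['Temp', 'Pres'])
print(ddf_part.compute().shape)  # roughly 1000 rows, if that range exists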
import dask
from dask import dataframe as dd

from zillow.data_utils import add_features, add_date_features

if __name__ == "__main__":
    import sys
    arg = sys.argv[1]
    if arg not in ["train", "test", "train_2017", "test_2017"]:
        sys.exit(1)

    with dask.set_options(get=dask.get):
        print(r"input/{}.*.hdf".format(arg))
        df = dd.read_hdf(r"input/{}.*.hdf".format(arg), "data",
                         chunksize=1000000)  # .set_index("ParcelId")
        df = add_features(df)
        df = add_date_features(df)
        print(df.head())
        df.to_hdf(r"input/{}2.*.hdf".format(arg), "data")
os.chdir(w_dir)

ccys = ['EURUSD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURAUD']

import time
total_start_time = time.time()

descriptions = []
for ccy in ccys:
    for year in range(2003, 2017):
        start_time = time.time()
        df = dd.read_hdf(directory_load + '/' + ccy + '-' + str(year) + '.h5',
                         '*')
        with open(directory_load + '/' + 'Data Description.txt', 'a') as fout:
            description = df.describe().compute()
            descriptions.append([ccy, year, description])
            fout.write('\n'.join([
                '\n\n=======================================================',
                'Ccy: ' + ccy + ' Year: ' + str(year),
                str(description)
            ]))
        elapsed = time.time() - start_time
        with open(directory_load + '/' + 'Data Description.txt', 'a') as fout:
            fout.write('\nTime Elapsed: ' + str(np.round(elapsed, 2)))

total_elapsed = time.time() - total_start_time
#x = dataset[:,0:7]  # T, P, x_N2, x_O2, x_NO, x_N, x_O
#y = dataset[:,7:]   # D_cidk upper triangular matrix (Dij | j=>i)

#x = df[:,0:7]  # T, P, x_N2, x_O2, x_NO, x_N, x_O
#y = df[:,7:]   # D_cidk upper triangular matrix (Dij | j=>i)

#dataset.head()
df.head(10)

#import h5py
#import xarray as xr
import os
import time

#filename = os.path.join('data', 'accounts.*.csv')
#filename
#target = os.path.join('data', 'accounts.h5')
#target

df_hdf = dd.read_hdf('myh4file.h5', ' ')
df_hdf.head()

#f = h5py.File(os.path.join('.', 'myh4file.h5'), mode='r')

import time
import sys
sys.path.insert(0, '../../../Utilities/')

from plotting import newfig, savefig

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
def time_read_hdf5_meta(self, scheduler):
    dd.read_hdf('{}/*.hdf5'.format(self.data_dir), 'key')
sarima_ts_backtrans_store_path = data_intermed_nb_fldrpath + "/sarima_ts_backtrans_store.h5"
sarima_agg_backtrans_store_path = data_intermed_nb_fldrpath + "/sarima_agg_backtrans_store.h5"
summary_stats_store_path = data_intermed_nb_fldrpath + "/summary_stats_store.h5"
ts_store_path = data_intermed_nb_fldrpath + "/ts_store.h5"
component_models_store_path = data_intermed_nb_fldrpath + "/component_models_store.h5"
dask_tmpdir = "/media/disk1/forecast_wiki_traffic/data_intermed/tmp"

# compute amount to trim SARIMA estimates at
# (trim at max observed times a multiplier)
TRIM_MULTIPLIER = 10

## read in time series
ts_daily_long = dd.read_hdf(ts_store_path, "/ts_daily_long",
                            chunksize=803 * 2048)

## get max per time series
trimmax = ts_daily_long.groupby("ts_id")["daily_level"].max().compute()
trimmax = trimmax.to_frame("trimmax")

## get max per time series with last 60 truncated
with pd.HDFStore(ts_store_path) as s:
    n = s.get_storer("/ts_daily_long").nrows
    t_max = s.select("/ts_daily_long", start=n - 1, stop=n)
    t_max = t_max.index.values[0][1]
    del n

ts_daily_long_trunc60 = ts_daily_long.reset_index()
ts_daily_long_trunc60 = ts_daily_long_trunc60[ts_daily_long_trunc60.time_d < (
target = os.path.join('data', 'accounts.h5')
target

# In[6]:

get_ipython().magic("time df_csv.to_hdf(target, '/data')")

# In[7]:

df_hdf = dd.read_hdf(target, '/data')
df_hdf.head()

# ### Compare CSV to HDF5 speeds
# We do a simple computation that requires reading a column of our dataset and
# compare performance between CSV files and our newly created HDF5 file.
# Which do you expect to be faster?

# In[8]:

get_ipython().magic('time df_csv.amount.sum().compute()')

# In[9]:
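# A plain-Python sketch of the same timing comparison (not part of the original
# notebook, which uses %time magics). It assumes df_csv and df_hdf from the
# cells above and that both share the 'amount' column.
import time

t0 = time.time()
csv_total = df_csv.amount.sum().compute()
print('CSV sum:  %s in %.2f s' % (csv_total, time.time() - t0))

t0 = time.time()
hdf_total = df_hdf.amount.sum().compute()
print('HDF5 sum: %s in %.2f s' % (hdf_total, time.time() - t0))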
def _load_basic_dataframe(df_file=None, datatype='sim', config='IC86.2012',
                          energy_reco=True, energy_cut_key='reco_log_energy',
                          log_energy_min=None, log_energy_max=None,
                          columns=None, n_jobs=1, verbose=False,
                          compute=True):

    validate_datatype(datatype)

    if df_file is not None:
        files = df_file
    else:
        paths = get_config_paths()
        file_pattern = os.path.join(paths.comp_data_dir, config, datatype,
                                    'processed_hdf',
                                    'nominal' if datatype == 'sim' else '',
                                    '*.hdf')
        files = sorted(glob.glob(file_pattern))

    ddf = dd.read_hdf(files, key='dataframe', mode='r', columns=columns,
                      chunksize=10000)

    # Energy reconstruction
    if energy_reco:
        model_dict = load_trained_model(
            'linearregression_energy_{}'.format(config), return_metadata=True)
        pipeline = model_dict['pipeline']
        feature_list = list(model_dict['training_features'])

        def add_reco_energy(partition):
            partition['reco_log_energy'] = pipeline.predict(
                partition[feature_list])
            partition['reco_energy'] = 10**partition['reco_log_energy']
            return partition

        ddf = ddf.map_partitions(add_reco_energy)

    # Energy range cut
    if log_energy_min is not None and log_energy_max is not None:

        def apply_energy_cut(partition):
            energy_mask = (partition[energy_cut_key] > log_energy_min) & (
                partition[energy_cut_key] < log_energy_max)
            return partition.loc[energy_mask, :]

        ddf = ddf.map_partitions(apply_energy_cut)

    if compute:
        if verbose:
            pbar = ProgressBar()
            pbar.register()
        scheduler = 'processes' if n_jobs > 1 else 'synchronous'
        df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
        df = df.reset_index(drop=True)
    else:
        df = ddf

    return df
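# A minimal usage sketch for the loader above (hypothetical arguments; it only
# runs in an environment where the processed_hdf files and the trained energy
# model referenced in the function actually exist).
df_sim = _load_basic_dataframe(datatype='sim', config='IC86.2012',
                               log_energy_min=6.0, log_energy_max=8.0,
                               n_jobs=4, verbose=True)
print(df_sim.shape)

# Or keep the result lazy and compute later:
ddf_sim = _load_basic_dataframe(datatype='sim', compute=False)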
def prep_final_unadj_component_models(sarima_ts_backtrans_key, sarima_agg_backtrans_key, component_models_intermed_key, trimmax_var, seas_stat_mod_vars, chunksize): ## combine SARIMA estimates for both aggregates and time series levels ### prep time series level model intermediate estimates sarima_ts_backtrans = dd.read_hdf(sarima_ts_backtrans_store_path, sarima_ts_backtrans_key, chunksize=chunksize) sarima_ts_backtrans = sarima_ts_backtrans.reset_index() col_ren_dict = { "daily_level_predbt": "mod_ts_daily_level_Bt", "daily_wowGr_predbt": "mod_ts_daily_wowGr_Bt", "weekly_level_predbt": "mod_ts_weekly_level_Bt", "weekly_wowGr_predbt": "mod_ts_weekly_wowGr_Bt" } sarima_ts_backtrans = sarima_ts_backtrans.rename(columns=col_ren_dict) del col_ren_dict ### prep aggregate level model intermediate estimates sarima_agg_backtrans = dd.read_hdf(sarima_agg_backtrans_store_path, sarima_agg_backtrans_key, chunksize=chunksize) col_ren_dict = { "daily_level_shrtAdj_predbt": "mod_agg_daily_level_Bt", "daily_wowGr_shrtAdj_predbt": "mod_agg_daily_wowGr_Bt", "weekly_level_shrtAdj_predbt": "mod_agg_weekly_level_Bt", "weekly_wowGr_shrtAdj_predbt": "mod_agg_weekly_wowGr_Bt" } sarima_agg_backtrans = sarima_agg_backtrans.rename(columns=col_ren_dict) del col_ren_dict ### put together intermediate SARIMA estimates component_models_intermed = dd.merge(sarima_ts_backtrans, sarima_agg_backtrans, on=["ts_id", "time_d"]) ## trim outliers from SARIMA estimates component_models_intermed = dd.merge( component_models_intermed, trimmax_df[[trimmax_var ]].rename(columns={trimmax_var: "trimmax_this"}), on=["ts_id"], left_index=False, right_index=True) cols = [ "mod_ts_daily_level_Bt", "mod_ts_daily_wowGr_Bt", "mod_ts_weekly_level_Bt", "mod_ts_weekly_wowGr_Bt", "mod_agg_daily_level_Bt", "mod_agg_daily_wowGr_Bt", "mod_agg_weekly_level_Bt", "mod_agg_weekly_wowGr_Bt" ] for v in cols: component_models_intermed[v + "Trim"] = component_models_intermed[[ v, "trimmax_this" ]].min(axis=1, skipna=False) del cols ## combine summary stat estimates with pd.HDFStore(summary_stats_store_path, mode="r") as s: sstat = s.select("/ts_stat_dayofweek", columns=list(seas_stat_mod_vars.keys())) sstat.rename(columns=seas_stat_mod_vars, inplace=True) sstat.reset_index(inplace=True) component_models_intermed[ "day_of_week"] = component_models_intermed.time_d.dt.dayofweek component_models_intermed = dd.merge(component_models_intermed, sstat, on=["ts_id", "day_of_week"]) del component_models_intermed["day_of_week"] # save results to h5 component_models_intermed.to_hdf(component_models_store_path, key=component_models_intermed_key, compute=True, format="table", data_columns=["ts_id", "time_d"]) # check counts with pd.HDFStore(component_models_store_path, mode="r") as s: n0 = s.get_storer(component_models_intermed_key).nrows with pd.HDFStore(sarima_ts_backtrans_store_path, mode="r") as s: n1 = s.get_storer(sarima_ts_backtrans_key).nrows with pd.HDFStore(sarima_agg_backtrans_store_path, mode="r") as s: n2 = s.get_storer(sarima_agg_backtrans_key).nrows assert (n0 == n1) assert (n1 == n2)
def test_to_hdf_lock_delays():
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first
    # (the first partitions are slowed down and complete last)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple hdf files, with the same artificial delays
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
                      index=[1.0, 2.0, 3.0, 4.0])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)
        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)
import icae.tools.plot_data
from icae.tools import EMD
from icae.tools import nn
from icae.tools import status_report
from icae.tools import AE_training
from icae.tools import AE_single as AEs_tools
from icae.tools import data_loader
from icae.models import single_event as AE_models
from icae.tools.config_loader import config
# -

# FIXME: deprecated references (data, optimal_NB).
# Does 02c-single-waveform-torch.py replace this?
df = dd.read_hdf("../" + config.data.retabled + "*.hdf",
                 key=config.data.hdf_key)

model, encoder = AE_models.optimal_NB(3, loss_method=loss_method)
hist = AE_training.train(model, data, verbose=1, epochs=6, batch_size=1000)
status_report.init(model, "BN 1k-bx100e new preprocessor", "-")
status_report.save_plot("loss")

# data = AE_lib.preprocess(AE_lib.load_mc())
AE_training.plot_results(model, data[:100000])
status_report.save_plot("overview-no-translation")

inlier, outlier = AEs_tools.seperate_outliers(model, data)
status_report.save_plot("outlier-seperation")

status_report.save_obj({
    "inlier indices": inlier,
    "outlier indices": outlier
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Imputer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from zillow import modelling
import pickle as pkl
import dask.dataframe as dd

train_df = pd.concat([
    #dd.read_hdf("input/train2.*.hdf", "data").compute(),
    dd.read_hdf("input/train_20172.*.hdf", "data").compute(),
])

month = train_df["yearmonth"]
train_y = train_df['logerror']

tolerance = 100
y = np.clip(train_y, np.median(train_y) - tolerance,
            np.median(train_y) + tolerance)

cv = LeaveOneGroupOut()

train_df = pd.concat([
    pd.read_csv("stack_stage1_{}.csv".format(i), index_col=0)
    for i in [1, 2, 3]
], axis=1)

last_month = month.max()
filt = (month == last_month)
    columns = [f'c{i}' for i in range(table_size[0])]
    df = pd.DataFrame(np.random.rand(*table_size), columns=columns,
                      index=[index] * table_size[0])
    return df


data_file = config.root + config.data.retabled_single

# %%
#f = pd.HDFStore(data_file,'r')
f_raw = h5py.File(data_file, 'w')

# %%
store = f.get_storer('frame')

# %%
type(store)

# %%
ddf = dd.read_hdf(data_file, 'frame')

# %%
ddf['x'].max().compute()

# %%
# def maximum(f):
chunksize = 1000000
frame = 0
col = 'x'
maximum = 0

# %%
while True:
    try:
        df = f.select('frame', f"frame>={frame} & frame < {frame+chunksize}")
        maximum = max([maximum, df[col].max()])