def test_tabular(self):
    """A tabular blaze expression builds a DataSet with the expected name,
    column dtypes, and missing values; datetime fields are rejected; the
    construction is memoized.
    """
    name = "expr"
    expr = bz.Data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    # dtypes and missing values come from the declared dshape.
    self.assertIs(ds.value.dtype, float64_dtype)
    self.assertIs(ds.int_value.dtype, int64_dtype)
    self.assertTrue(np.isnan(ds.value.missing_value))
    self.assertEqual(ds.int_value.missing_value, 0)

    # datetime-typed fields must not become pipeline columns.
    for field in ("asof_date",):
        with self.assertRaises(AttributeError) as ctx:
            getattr(ds, field)
        self.assertIn("'%s'" % field, str(ctx.exception))
        self.assertIn("'datetime'", str(ctx.exception))

    # Constructing from the same expression again returns the same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        ds,
    )
def test_tabular(self):
    """A tabular blaze expression builds a DataSet with the expected name,
    column dtypes, and missing values; construction is memoized.
    """
    name = 'expr'
    expr = bz.data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    # dtypes and missing values come from the declared dshape.
    self.assertIs(ds.value.dtype, float64_dtype)
    self.assertIs(ds.int_value.dtype, int64_dtype)
    self.assertTrue(np.isnan(ds.value.missing_value))
    self.assertEqual(ds.int_value.missing_value, 0)

    # Constructing from the same expression again returns the same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        ds,
    )
def test_tabular(self):
    """A tabular blaze expression builds a DataSet whose columns mirror the
    dshape; datetime fields are rejected; construction is memoized.
    """
    name = 'expr'
    expr = bz.Data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    # Only the non-datetime fields become columns.
    self.assertEqual(
        {column.name: column.dtype for column in ds.columns},
        {'sid': np.int64, 'value': np.float64},
    )

    # datetime-typed fields must not become pipeline columns.
    for field in ('timestamp', 'asof_date'):
        with self.assertRaises(AttributeError) as ctx:
            getattr(ds, field)
        self.assertIn("'%s'" % field, str(ctx.exception))
        self.assertIn("'datetime'", str(ctx.exception))

    # Constructing from the same expression again returns the same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        ),
        ds,
    )
def test_tabular(self):
    """DataSet construction from a tabular expression: correct name, correct
    column set, datetime fields rejected, result memoized.
    """
    name = 'expr'
    expr = bz.Data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    # Only the non-datetime fields become columns.
    actual_columns = {column.name: column.dtype for column in ds.columns}
    self.assertEqual(actual_columns, {'sid': np.int64, 'value': np.float64})

    # datetime-typed fields must not become pipeline columns.
    for field in ('timestamp', 'asof_date'):
        with self.assertRaises(AttributeError) as ctx:
            getattr(ds, field)
        self.assertIn("'%s'" % field, str(ctx.exception))
        self.assertIn("'datetime'", str(ctx.exception))

    # Constructing from the same expression again returns the same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        ),
        ds,
    )
def test_auto_deltas_fail_raise(self):
    """With the ``raise_`` rule, a missing deltas expression raises
    ``ValueError`` naming the offending expression."""
    loader = BlazeLoader()
    expr = bz.Data(self.df, dshape=self.dshape)

    with self.assertRaises(ValueError) as ctx:
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.raise_,
        )

    # The error message should mention the expression it failed on.
    self.assertIn(str(expr), str(ctx.exception))
def test_auto_deltas_fail_warn(self):
    """With the ``warn`` rule, a missing deltas expression emits exactly one
    ``NoDeltasWarning`` naming the expression."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        loader = BlazeLoader()
        expr = bz.Data(self.df, dshape=self.dshape)
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.warn,
            missing_values=self.missing_values,
        )
        self.assertEqual(len(caught), 1)
        warning = caught[0].message
        self.assertIsInstance(warning, NoDeltasWarning)
        self.assertIn(str(expr), str(warning))
def test_auto_deltas_fail_warn(self):
    """With the ``warn`` rule, a missing deltas expression emits exactly one
    ``NoDeltasWarning`` naming the expression."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        loader = BlazeLoader()
        expr = bz.Data(self.df, dshape=self.dshape)
        from_blaze(
            expr,
            loader=loader,
            no_deltas_rule=no_deltas_rules.warn,
        )
        self.assertEqual(len(caught), 1)
        warning = caught[0].message
        self.assertIsInstance(warning, NoDeltasWarning)
        self.assertIn(str(expr), str(warning))
def _test_id(self, df, dshape, expected, finder, add):
    """Run a pipeline over ``df`` pulling the ``latest`` of each column named
    in ``add`` and compare against ``expected``.
    """
    expr = bz.Data(df, name='expr', dshape=dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )

    p = Pipeline()
    for column_name in add:
        p.add(getattr(ds, column_name).latest, column_name)

    dates = self.dates
    # NOTE(review): the ``finder`` parameter is shadowed here by a temporary
    # finder — the argument is never used; confirm this is intentional.
    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    assert_frame_equal(
        result,
        _utc_localize_index_level_0(expected),
        check_dtype=False,
    )
def test_id(self):
    """The ``latest`` of ``value`` over the full date range reproduces the
    input frame indexed by (timestamp, asset)."""
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    # Expected output is the input, minus asof_date, keyed by
    # (timestamp, asset) with sids resolved to Asset objects.
    expected = self.df.drop('asof_date', axis=1).set_index(
        ['timestamp', 'sid'],
    )
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)
def test_id_macro_dataset(self):
    """A macro (sid-less) dataset broadcasts each day's value to every
    asset in the finder."""
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates
    asset_info = asset_infos[0][0]

    with tmp_asset_finder(equities=asset_info) as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    # Each macro value repeats once per asset per day.
    nassets = len(asset_info)
    expected = pd.DataFrame(
        list(concatv([0] * nassets, [1] * nassets, [2] * nassets)),
        index=pd.MultiIndex.from_product((
            self.macro_df.timestamp,
            finder.retrieve_all(asset_info.index),
        )),
        columns=('value',),
    )
    assert_frame_equal(result, expected, check_dtype=False)
def _test_id(self, df, dshape, expected, finder, add):
    """Run a pipeline over ``df`` pulling the ``latest`` of each column named
    in ``add`` and compare against ``expected``.
    """
    expr = bz.data(df, name='expr', dshape=dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )

    p = Pipeline()
    for column_name in add:
        p.add(getattr(ds, column_name).latest, column_name)

    dates = self.dates
    # NOTE(review): the ``finder`` parameter is shadowed here by a temporary
    # finder — the argument is never used; confirm this is intentional.
    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    assert_frame_equal(
        result,
        _utc_localize_index_level_0(expected),
        check_dtype=False,
    )
def test_missing_asof(self):
    """An expression without an ``asof_date`` field is rejected with a
    ``TypeError`` that names the missing field and the actual measure."""
    expr = bz.Data(
        self.df.loc[:, ["sid", "value", "timestamp"]],
        name="expr",
        dshape="""
        var * {
            sid: ?int64,
            value: float64,
            timestamp: datetime,
        }""",
    )

    with self.assertRaises(TypeError) as ctx:
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        )

    self.assertIn("'asof_date'", str(ctx.exception))
    self.assertIn(repr(str(expr.dshape.measure)), str(ctx.exception))
def _run_pipeline(self,
                  expr,
                  deltas,
                  expected_views,
                  expected_output,
                  finder,
                  calendar,
                  start,
                  end,
                  window_length,
                  compute_fn):
    """Run a pipeline with a custom factor over ``expr``/``deltas``,
    checking each day's input window against ``expected_views`` and the
    final frame against ``expected_output``.
    """
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        deltas,
        loader=loader,
        no_deltas_rule=no_deltas_rules.raise_,
        missing_values=self.missing_values,
    )
    p = Pipeline()

    # Bind to a differently-named local so the class body below doesn't
    # trip over class-scope name resolution.
    window_length_ = window_length

    class TestFactor(CustomFactor):
        inputs = (ds.value,)
        window_length = window_length_

        def compute(self, today, assets, out, data):
            # Each day's input window must match the precomputed view.
            assert_array_almost_equal(data, expected_views[today])
            out[:] = compute_fn(data)

    p.add(TestFactor(), "value")

    result = SimplePipelineEngine(
        loader,
        calendar,
        finder,
    ).run_pipeline(p, start, end)

    assert_frame_equal(
        result,
        _utc_localize_index_level_0(expected_output),
        check_dtype=False,
    )
def test_custom_query_time_tz(self):
    """Rows timestamped after the custom 8:45 EST data-query time are only
    visible on the next trading day.

    Shifts the input timestamps to just before the query cutoff, moves rows
    3-5 past it, and checks that exactly those rows surface one day later.
    """
    df = self.df.copy()
    df["timestamp"] = (
        (pd.DatetimeIndex(df["timestamp"], tz="EST") +
         timedelta(hours=8, minutes=44))
        .tz_convert("utc")
        .tz_localize(None)
    )
    # BUG FIX: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in
    # 1.0.  ``.loc`` is the label-based equivalent; on the default integer
    # index the slice 3:5 selects the same (inclusive) rows.
    df.loc[3:5, "timestamp"] = pd.Timestamp("2014-01-01 13:45")

    expr = bz.Data(df, name="expr", dshape=self.dshape)
    loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz="EST")
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )

    p = Pipeline()
    p.add(ds.value.latest, "value")
    p.add(ds.int_value.latest, "int_value")
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    expected = df.drop("asof_date", axis=1)
    expected["timestamp"] = (
        expected["timestamp"]
        .dt.normalize()
        .astype("datetime64[ns]")
        .dt.tz_localize("utc")
    )
    # Rows past the query cutoff become visible one day later.
    expected.loc[3:5, "timestamp"] += timedelta(days=1)
    expected.set_index(["timestamp", "sid"], inplace=True)
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)
def test_custom_query_time_tz(self):
    """Rows timestamped after the custom 8:45 EST data-query time are only
    visible on the next trading day.

    Shifts the input timestamps to just before the query cutoff, moves rows
    3-5 past it, and checks that exactly those rows surface one day later.
    """
    df = self.df.copy()
    df['timestamp'] = (
        pd.DatetimeIndex(df['timestamp'], tz='EST') +
        timedelta(hours=8, minutes=44)).tz_convert('utc').tz_localize(None)
    # BUG FIX: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in
    # 1.0.  ``.loc`` is the label-based equivalent; on the default integer
    # index the slice 3:5 selects the same (inclusive) rows.
    df.loc[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')

    expr = bz.Data(df, name='expr', dshape=self.dshape)
    loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz='EST')
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    expected = df.drop('asof_date', axis=1)
    expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
        'datetime64[ns]',
    )
    # Rows past the query cutoff become visible one day later.
    expected.loc[3:5, 'timestamp'] += timedelta(days=1)
    expected.set_index(['timestamp', 'sid'], inplace=True)
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)
def test_id_macro_dataset(self):
    """A macro (sid-less) dataset broadcasts each day's value to every
    asset in the finder."""
    expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule='ignore',
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates
    asset_info = asset_infos[0][0]

    with tmp_asset_finder(asset_info) as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    # Each macro value repeats once per asset per day.
    nassets = len(asset_info)
    expected = pd.DataFrame(
        list(concatv([0] * nassets, [1] * nassets, [2] * nassets)),
        index=pd.MultiIndex.from_product((
            self.macro_df.timestamp,
            finder.retrieve_all(asset_info.index),
        )),
        columns=('value',),
    )
    assert_frame_equal(result, expected, check_dtype=False)
def test_id(self):
    """The ``latest`` of ``value`` over the full date range reproduces the
    input frame indexed by (timestamp, asset)."""
    expr = bz.Data(self.df, name='expr', dshape=self.dshape)
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule='ignore',
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    # Expected output is the input, minus asof_date, keyed by
    # (timestamp, asset) with sids resolved to Asset objects.
    expected = self.df.drop('asof_date', axis=1).set_index(
        ['timestamp', 'sid'],
    )
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)
def _run_pipeline(self,
                  expr,
                  deltas,
                  expected_views,
                  expected_output,
                  finder,
                  calendar,
                  start,
                  end,
                  window_length,
                  compute_fn):
    """Run a pipeline with a custom factor over ``expr``/``deltas``,
    checking each day's input window against ``expected_views`` and the
    final frame against ``expected_output``.
    """
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        deltas,
        loader=loader,
        no_deltas_rule=no_deltas_rules.raise_,
    )
    p = Pipeline()

    # Bind to a differently-named local so the class body below doesn't
    # trip over class-scope name resolution.
    window_length_ = window_length

    class TestFactor(CustomFactor):
        inputs = (ds.value,)
        window_length = window_length_

        def compute(self, today, assets, out, data):
            # Each day's input window must match the precomputed view.
            assert_array_almost_equal(data, expected_views[today])
            out[:] = compute_fn(data)

    p.add(TestFactor(), 'value')

    result = SimplePipelineEngine(
        loader,
        calendar,
        finder,
    ).run_pipeline(p, start, end)

    assert_frame_equal(
        result,
        expected_output,
        check_dtype=False,
    )
def test_id_macro_dataset_multiple_columns(self):
    """A macro dataset with two columns broadcasts both to every asset.

    input (df):
       asof_date  timestamp  other  value
    0 2014-01-01 2014-01-01      1      0
    3 2014-01-02 2014-01-02      2      1
    6 2014-01-03 2014-01-03      3      2

    output (expected):
                              other  value
    2014-01-01 Equity(65 [A])     1      0
               Equity(66 [B])     1      0
               Equity(67 [C])     1      0
    2014-01-02 Equity(65 [A])     2      1
               Equity(66 [B])     2      1
               Equity(67 [C])     2      1
    2014-01-03 Equity(65 [A])     3      2
               Equity(66 [B])     3      2
               Equity(67 [C])     3      2
    """
    # Derive a second column and extend the dshape to match.
    df = self.macro_df.copy()
    df['other'] = df.value + 1
    fields = OrderedDict(self.macro_dshape.measure.fields)
    fields['other'] = fields['value']

    expr = bz.Data(df, name='expr', dshape=var * Record(fields))
    loader = BlazeLoader()
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    p.add(ds.other.latest, 'other')
    dates = self.dates
    asset_info = asset_infos[0][0]

    with tmp_asset_finder(equities=asset_info) as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    # Each (value, other) pair repeats once per asset per day.
    expected = pd.DataFrame(
        np.array([[0, 1], [1, 2], [2, 3]]).repeat(3, axis=0),
        index=pd.MultiIndex.from_product((
            df.timestamp,
            finder.retrieve_all(asset_info.index),
        )),
        columns=('value', 'other'),
    ).sort_index(axis=1)
    assert_frame_equal(
        result,
        expected.sort_index(axis=1),
        check_dtype=False,
    )
def test_column(self):
    """A single-column blaze expression maps to a BoundColumn; repeated
    construction is memoized and the column walks back to its dataset."""
    exprname = 'expr'
    expr = bz.data(self.df, name=exprname, dshape=self.dshape)
    value = from_blaze(
        expr.value,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(value.name, 'value')
    self.assertIsInstance(value, BoundColumn)
    self.assertIs(value.dtype, float64_dtype)

    # Rebuilding from the column expression returns the same object.
    self.assertIs(
        from_blaze(
            expr.value,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        value,
    )
    # Building the dataset and selecting the column also hits the cache.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ).value,
        value,
    )

    # The column's dataset is the memoized dataset for the parent expr.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        value.dataset,
    )
    self.assertEqual(value.dataset.__name__, exprname)
def test_auto_deltas(self):
    """A sibling ``<name>_deltas`` expression is discovered automatically
    and registered with the loader."""
    expr = bz.Data(
        {"ds": self.df, "ds_deltas": pd.DataFrame(columns=self.df.columns)},
        dshape=var * Record((
            ("ds", self.dshape.measure),
            ("ds_deltas", self.dshape.measure),
        )),
    )
    loader = BlazeLoader()
    ds = from_blaze(
        expr.ds,
        loader=loader,
        missing_values=self.missing_values,
    )

    # The loader holds exactly one entry: our expr with its deltas attached.
    self.assertEqual(len(loader), 1)
    exprdata = loader[ds]
    self.assertTrue(exprdata.expr.isidentical(expr.ds))
    self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def test_missing_asof(self):
    """An expression without an ``asof_date`` field is rejected with a
    ``TypeError`` that names the missing field and the actual measure."""
    expr = bz.Data(
        self.df.loc[:, ['sid', 'value', 'timestamp']],
        name='expr',
        dshape="""
        var * {
            sid: ?int64,
            value: float64,
            timestamp: datetime,
        }""",
    )

    with self.assertRaises(TypeError) as ctx:
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
        )

    self.assertIn("'asof_date'", str(ctx.exception))
    self.assertIn(repr(str(expr.dshape.measure)), str(ctx.exception))
def gen_data_set(table_name):
    """Read bcolz-format data for ``table_name`` and build a DataSet."""
    expr = _gen_expr(table_name)
    return from_blaze(
        expr,
        # loader=None,
        no_deltas_rule='ignore',
        no_checkpoints_rule='ignore',
        # odo_kwargs=gen_odo_kwargs(expr, utc=False),
        missing_values=fillvalue_for_expr(expr),
        domain=CN_EQUITIES,
    )
def gen_data_set(table_name):
    """Read bcolz-format data for ``table_name`` and build a DataSet."""
    expr = _gen_expr(table_name)
    return from_blaze(
        expr,
        # loader=global_loader,
        no_deltas_rule='ignore',
        no_checkpoints_rule='ignore',
        # odo_kwargs=gen_odo_kwargs(expr),
        missing_values=fillvalue_for_expr(expr),
        domain=CN_EQUITIES,
    )
def test_non_numpy_field(self):
    """A field whose type has no numpy representation is exposed as a
    NonNumpyField and raises AttributeError on normal access."""
    expr = bz.Data(
        [],
        dshape="""
        var * {
            a: datetime,
            asof_date: datetime,
            timestamp: datetime,
        }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    with self.assertRaises(AttributeError):
        ds.a
    # The raw attribute (bypassing __getattribute__ hooks) is the sentinel.
    self.assertIsInstance(object.__getattribute__(ds, "a"), NonNumpyField)
def test_tabular(self):
    """A tabular blaze expression builds a DataSet with the expected name,
    column dtypes, and missing values; datetime fields are rejected; the
    construction is memoized.
    """
    name = 'expr'
    expr = bz.Data(self.df, name=name, dshape=self.dshape)
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )
    self.assertEqual(ds.__name__, name)
    self.assertTrue(issubclass(ds, DataSet))

    # dtypes and missing values come from the declared dshape.
    self.assertIs(ds.value.dtype, float64_dtype)
    self.assertIs(ds.int_value.dtype, int64_dtype)
    self.assertTrue(np.isnan(ds.value.missing_value))
    self.assertEqual(ds.int_value.missing_value, 0)

    # datetime-typed fields must not become pipeline columns.
    for field in ('asof_date',):
        with self.assertRaises(AttributeError) as ctx:
            getattr(ds, field)
        self.assertIn("'%s'" % field, str(ctx.exception))
        self.assertIn("'datetime'", str(ctx.exception))

    # Constructing from the same expression again returns the same class.
    self.assertIs(
        from_blaze(
            expr,
            loader=self.garbage_loader,
            no_deltas_rule=no_deltas_rules.ignore,
            missing_values=self.missing_values,
        ),
        ds,
    )
def test_auto_deltas(self):
    """A sibling ``<name>_deltas`` expression is discovered automatically
    and registered with the loader."""
    expr = bz.Data(
        {'ds': self.df, 'ds_deltas': pd.DataFrame(columns=self.df.columns)},
        dshape=var * Record((
            ('ds', self.dshape.measure),
            ('ds_deltas', self.dshape.measure),
        )),
    )
    loader = BlazeLoader()
    ds = from_blaze(expr.ds, loader=loader)

    # The loader holds exactly one entry: our expr with its deltas attached.
    self.assertEqual(len(loader), 1)
    exprdata = loader[ds]
    self.assertTrue(exprdata.expr.isidentical(expr.ds))
    self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def test_non_pipeline_field(self):
    """A field whose dtype the Pipeline API disallows is exposed as a
    NonPipelineField and raises AttributeError on normal access.

    NOTE: This test will fail if we ever allow string types in the
    Pipeline API. If this happens, change the dtype of the `a` field of
    expr to another type we don't allow.
    """
    expr = bz.Data(
        [],
        dshape="""
        var * {
            a: string,
            asof_date: datetime,
            timestamp: datetime,
        }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    with self.assertRaises(AttributeError):
        ds.a
    # The raw attribute (bypassing __getattribute__ hooks) is the sentinel.
    self.assertIsInstance(object.__getattribute__(ds, "a"), NonPipelineField)
def test_auto_deltas(self):
    """A sibling ``<name>_deltas`` expression is discovered automatically
    and registered with the loader."""
    deltas_frame = pd.DataFrame(columns=self.df.columns)
    expr = bz.Data(
        {'ds': self.df, 'ds_deltas': deltas_frame},
        dshape=var * Record((
            ('ds', self.dshape.measure),
            ('ds_deltas', self.dshape.measure),
        )),
    )
    loader = BlazeLoader()
    ds = from_blaze(expr.ds, loader=loader)

    # The loader holds exactly one entry: our expr with its deltas attached.
    self.assertEqual(len(loader), 1)
    exprdata = loader[ds]
    self.assertTrue(exprdata.expr.isidentical(expr.ds))
    self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def test_non_numpy_field(self):
    """A field whose type has no numpy representation is exposed as a
    NonNumpyField and raises AttributeError on normal access."""
    expr = bz.Data(
        [],
        dshape="""
        var * {
            a: datetime,
            asof_date: datetime,
            timestamp: datetime,
        }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    with self.assertRaises(AttributeError):
        ds.a
    # The raw attribute (bypassing __getattribute__ hooks) is the sentinel.
    self.assertIsInstance(object.__getattribute__(ds, 'a'), NonNumpyField)
def test_complex_expr(self):
    """Complex (non-table-leaf) expressions are only allowed when no
    deltas are involved; with deltas, or at column level, they raise
    ``TypeError``."""
    expr = bz.data(self.df, dshape=self.dshape)
    # put an Add in the table
    expr_with_add = bz.transform(expr, value=expr.value + 1)

    # A complex table expression with no deltas is accepted.
    from_blaze(
        expr_with_add,
        deltas=None,
        loader=self.garbage_loader,
        missing_values=self.missing_values,
    )

    # A complex column expression is rejected even without deltas.
    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,  # put an Add in the column
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

    deltas = bz.data(
        pd.DataFrame(columns=self.df.columns),
        dshape=self.dshape,
    )

    # With deltas, complex expressions are rejected at both levels.
    with self.assertRaises(TypeError):
        from_blaze(
            expr_with_add,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )
    with self.assertRaises(TypeError):
        from_blaze(
            expr.value + 1,
            deltas=deltas,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )
def test_non_pipeline_field(self):
    """A field whose dtype the Pipeline API disallows is exposed as a
    NonPipelineField and raises AttributeError on normal access."""
    expr = bz.data(
        [],
        dshape="""
        var * {
            a: complex,
            asof_date: datetime,
            timestamp: datetime,
        }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    with self.assertRaises(AttributeError):
        ds.a
    # The raw attribute (bypassing __getattribute__ hooks) is the sentinel.
    self.assertIsInstance(
        object.__getattribute__(ds, 'a'),
        NonPipelineField,
    )
def test_custom_query_time_tz(self):
    """Rows timestamped after the custom 8:45 EST data-query time are only
    visible on the next trading day.

    Shifts the input timestamps to just before the query cutoff, moves rows
    3-5 past it, and checks that exactly those rows surface one day later.
    """
    df = self.df.copy()
    df['timestamp'] = (
        pd.DatetimeIndex(df['timestamp'], tz='EST') +
        timedelta(hours=8, minutes=44)
    ).tz_convert('utc').tz_localize(None)
    # BUG FIX: ``DataFrame.ix`` was deprecated in pandas 0.20 and removed in
    # 1.0.  ``.loc`` is the label-based equivalent; on the default integer
    # index the slice 3:5 selects the same (inclusive) rows.
    df.loc[3:5, 'timestamp'] = pd.Timestamp('2014-01-01 13:45')

    expr = bz.data(df, name='expr', dshape=self.dshape)
    loader = BlazeLoader(data_query_time=time(8, 45), data_query_tz='EST')
    ds = from_blaze(
        expr,
        loader=loader,
        no_deltas_rule=no_deltas_rules.ignore,
        missing_values=self.missing_values,
    )

    p = Pipeline()
    p.add(ds.value.latest, 'value')
    p.add(ds.int_value.latest, 'int_value')
    dates = self.dates

    with tmp_asset_finder() as finder:
        result = SimplePipelineEngine(
            loader,
            dates,
            finder,
        ).run_pipeline(p, dates[0], dates[-1])

    expected = df.drop('asof_date', axis=1)
    expected['timestamp'] = expected['timestamp'].dt.normalize().astype(
        'datetime64[ns]',
    ).dt.tz_localize('utc')
    # Rows past the query cutoff become visible one day later.
    expected.loc[3:5, 'timestamp'] += timedelta(days=1)
    expected.set_index(['timestamp', 'sid'], inplace=True)
    expected.index = pd.MultiIndex.from_product((
        expected.index.levels[0],
        finder.retrieve_all(expected.index.levels[1]),
    ))
    assert_frame_equal(result, expected, check_dtype=False)
def test_non_pipeline_field(self):
    """A field whose dtype the Pipeline API disallows is exposed as a
    NonPipelineField and raises AttributeError on normal access.

    NOTE: This test will fail if we ever allow string types in the
    Pipeline API. If this happens, change the dtype of the `a` field of
    expr to another type we don't allow.
    """
    expr = bz.Data(
        [],
        dshape="""
        var * {
            a: string,
            asof_date: datetime,
            timestamp: datetime,
        }""",
    )
    ds = from_blaze(
        expr,
        loader=self.garbage_loader,
        no_deltas_rule=no_deltas_rules.ignore,
    )

    with self.assertRaises(AttributeError):
        ds.a
    # The raw attribute (bypassing __getattribute__ hooks) is the sentinel.
    self.assertIsInstance(
        object.__getattribute__(ds, 'a'),
        NonPipelineField,
    )
from datashape import dshape
from alphatools.research import loaders, blaze_loader
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.loaders.blaze import from_blaze
from os import path

# NOTE(review): this chunk uses ``json`` and ``bz`` (blaze) — presumably
# imported earlier in the file; confirm those imports exist.
this_file = path.dirname(__file__)

# Load the data-source registry shipped alongside this module.
with open(path.join(this_file, 'factory/data_sources.json')) as f:
    data_sources = json.load(f)

# Build one DataSet per configured data source, keyed by source name.
Factory = {}
for source in data_sources:
    loc = data_sources[source]['url']
    shape = dshape(data_sources[source]['schema'])
    loc = path.expandvars(loc)
    expr = bz.data(loc, dshape=shape)
    # create the DataSet
    ds = from_blaze(expr,
                    no_deltas_rule='ignore',
                    no_checkpoints_rule='ignore',
                    loader=blaze_loader)
    # BUG FIX: was ``Factory = {source: ds}``, which rebound the dict on
    # every iteration so only the LAST data source survived the loop.
    Factory[source] = ds