def test_add_dataset(self):
    """Adding a dataset to the selector should make it the current dataset.

    Checks that the selector is no longer empty, that the length label of
    the dataset panel (with thousands separators stripped) equals
    ``len(ds)``, and that the app reports the dataset as current.
    """
    app = vx.ui.main.VaexApp()
    ds = vx.example()
    app.dataset_selector.add(ds)
    # ``assert_`` is a deprecated unittest alias that was removed in
    # Python 3.12; ``assertFalse`` expresses the same check.
    self.assertFalse(app.dataset_selector.is_empty())
    shown_length = int(app.dataset_panel.label_length.text().replace(",", ""))
    self.assertEqual(shown_length, len(ds))
    self.assertEqual(ds, app.current_dataset)
def test_column_file():
    """A ``ColumnFile`` column can be wrapped in a dataframe and fingerprinted.

    The example dataset is reopened with ``nommap=True`` and its ``x``
    column is expected to be a ``vaex.file.column.ColumnFile``.
    """
    example_path = vaex.example().dataset.path
    opened = vaex.open(example_path, nommap=True)
    column = opened.dataset._columns['x']
    assert isinstance(column, vaex.file.column.ColumnFile)
    # will trigger fingerprint
    df_from_column = vaex.from_arrays(x=column)
    # just to be sure
    column.fingerprint()
def test_widget_counter_selection(flush_guard):
    """Exercise the selection counter widget in lazy and non-lazy mode.

    For the lazy counter the test asserts that its value stays put after a
    selection change and only matches the new count after another
    computation has run.  For the non-lazy counter it asserts that the
    value matches right after each selection change.
    """
    df = vaex.example()
    counter = df.widget.counter_selection('test', lazy=True)
    assert counter.value == 0

    # we don't want to *cause* an execution
    df.select(df.x > 0, name='test')
    assert counter.value == 0

    # but it will update once an execution happens
    df.sum(df.x)
    n_positive = df.count(selection='test')
    assert counter.value == n_positive

    df.select(df.x < 0, name='test')
    assert counter.value == n_positive
    df.sum(df.x)
    n_negative = df.count(selection='test')
    assert counter.value == n_negative

    # existing selection, and non-lazy
    counter = df.widget.counter_selection('test')
    assert counter.value == n_negative
    df.select(df.x > 0, name='test')
    assert counter.value == n_positive
    df.select(df.x < 0, name='test')
    assert counter.value == n_negative
    flush(all=True)
def test_widget_selection(flush_guard):
    """Exercise the selection expression widget.

    Creating the widget without an existing 'default' selection must raise
    ``ValueError``.  Afterwards the test counts selection-changed signals
    per selection name while creating widgets and editing their text.
    """
    df = vaex.example()
    with pytest.raises(ValueError) as excinfo:
        selection_widget_default = df.widget.selection_expression()
    assert "'default'" in str(excinfo.value)

    # number of selection-changed signals received, per selection name
    signal_counts = {'default': 0, 'pos': 0}

    @df.signal_selection_changed.connect
    def on_selection_changed(df, name):
        nonlocal signal_counts
        signal_counts[name] += 1

    n_positive = df.count(selection=df.x > 0)

    df.select(df.x > 0)
    selection_widget_default = df.widget.selection_expression()
    assert selection_widget_default.value.expression == '(x > 0)'

    selection_widget = df.widget.selection_expression(df.x > 0, name='pos')
    assert selection_widget_default.value.expression == '(x > 0)'
    assert signal_counts == {'default': 2, 'pos': 1}
    assert df.count(selection='pos') == n_positive

    selection_widget.v_model = 'x < 0'
    assert selection_widget.error_messages is None
    assert signal_counts == {'default': 2, 'pos': 2}
    flush(all=True)
def test_widget_heatmap(flush_guard):
    """Exercise the heatmap widget: grid contents, toolbar modes and redraw.

    Reference counts are computed up front with plain selections and then
    compared with the sums of the widget's selection grid.
    """
    df = vaex.example()
    df.select_rectangle('x', 'y', [[-10, 10], [-50, 50]], name='check')
    n_in_rectangle = df.count(selection='check')
    df.select(df.x > 0)
    n_positive = df.count(selection=True)

    heatmap_widget = df.widget.heatmap('x', 'y', selection=[None, True])
    flush()
    assert heatmap_widget.model.grid[1].sum().item() == n_positive - 1

    # switching the toolbar mode swaps the interaction on the figure
    tools = heatmap_widget.toolbar
    tools.interact_value = "pan-zoom"
    assert isinstance(heatmap_widget.plot.figure.interaction, bqplot.interacts.PanZoom)
    tools.interact_value = "select-rect"
    assert isinstance(heatmap_widget.plot.figure.interaction, bqplot.interacts.BrushSelector)

    # brushing a rectangle should select the same rows as select_rectangle
    heatmap_widget.plot.figure.interaction.selected_x = [-10, 10]
    heatmap_widget.plot.figure.interaction.selected_y = [-50, 50]
    assert heatmap_widget.model.grid.shape[0] == 2
    flush()
    assert heatmap_widget.model.grid[1].sum().item() == n_in_rectangle

    # an unknown mode leaves the figure without interaction
    tools.interact_value = "doesnotexit"
    assert heatmap_widget.plot.figure.interaction is None

    image_before = heatmap_widget.plot.mark.image.value
    heatmap_widget.model.x.max = 10
    flush(all=True)
    assert heatmap_widget.plot.mark.image.value != image_before, "image should change"
def test_expression():
    """Exercise the expression widget: validation messages and initial values.

    Covers a valid expression, an invalid one, construction from a dataframe
    expression, and construction from an ``Axis`` model whose expression is
    changed afterwards.
    """
    df = vaex.example()
    widget = df.widget.expression()
    assert widget.value is None

    # valid input: a success message shows up and is gone after a flush
    widget.value = 'x'
    assert widget.value.expression == 'x'
    assert widget.valid
    assert widget.error_messages is None
    assert "good" in widget.success_messages
    flush(all=True)
    assert widget.error_messages is None
    assert widget.success_messages is None

    # invalid input: the error message is still there after a flush
    widget.v_model = 'x+'
    assert not widget.valid
    assert widget.error_messages is not None
    assert widget.success_messages is None
    flush()
    assert widget.error_messages is not None
    assert widget.success_messages is None

    widget = df.widget.expression(df.y)
    assert widget.value == 'y'

    axis = vaex.jupyter.model.Axis(df=df, expression=df.x + 2)
    widget = df.widget.expression(axis)
    assert str(widget.value) == '(x + 2)'
    axis.expression = df.x + 3
    assert str(widget.value) == '(x + 3)'
def test_sklearn_incremental_predictor_serialize(tmpdir):
    """An ``IncrementalPredictor`` survives a state write/load round trip.

    The state of the transformed training frame is written to a JSON file
    in ``tmpdir`` and loaded into the test frame, whose ``pred`` column is
    then compared with in-memory predictions.
    """
    example = vaex.example()
    df_train, df_test = example.ml.train_test_split(test_size=0.1, verbose=False)

    feature_names = df_train.column_names[:6]
    target_name = 'FeH'

    predictor = IncrementalPredictor(
        model=SGDRegressor(),
        features=feature_names,
        batch_size=10 * 1000,
        num_epochs=5,
        shuffle=True,
        prediction_name='pred',
    )
    predictor.fit(df=df_train, target=target_name)
    df_train = predictor.transform(df_train)

    # State transfer - serialization
    df_train.state_write(str(tmpdir.join('test.json')))
    df_test.state_load(str(tmpdir.join('test.json')))

    assert df_train.column_count() == df_test.column_count()
    assert df_test.pred.values.shape == (33000, )

    in_memory_predictions = predictor.predict(df_test)
    np.testing.assert_array_almost_equal(in_memory_predictions, df_test.pred.values, decimal=1)
def test_cache_hash():
    """Hashes are computed once, stored in a sidecar yaml file, then reused.

    A small export of the example data is written next to the tests.  The
    test tracks ``_hash_calculations`` to see when hashing work is done and
    checks for the ``hashes.yaml`` file.
    """
    # TODO: what if the directory is not writable?
    data_file = HERE / 'data' / 'test.hdf5'
    if data_file.exists():
        data_file.unlink()
    hashes_file = HERE / 'data' / 'test.hdf5.d' / 'hashes.yaml'
    if hashes_file.exists():
        hashes_file.unlink()

    small = vaex.example()[:10]
    small.export(str(data_file))

    reopened = vaex.open(str(data_file))
    assert reopened.dataset._hash_calculations == 0
    assert not hashes_file.exists()

    reopened = reopened.hashed()
    assert reopened.dataset._hash_calculations > 0
    assert hashes_file.exists()

    # and pickling
    hashed_dataset = reopened.dataset
    rebuilt_dataset = rebuild(hashed_dataset)
    assert rebuilt_dataset._hash_calculations == 0
    assert hashed_dataset == rebuilt_dataset

    opened_again = vaex.open(str(data_file))
    dataset_again = opened_again.dataset
    assert dataset_again._hash_calculations == 0
    assert dataset_again == rebuilt_dataset
def test_sklearn_incremental_predictor_partial_fit_calls(batch_size, num_epochs):
    """``fit`` feeds every sample once per epoch, in ceil(N / batch_size) batches.

    A counting stand-in model records how many samples and how many
    ``partial_fit`` calls it has seen.
    """
    example = vaex.example()
    df_train, df_test = example.ml.train_test_split(test_size=0.1, verbose=False)

    feature_names = df_train.column_names[:6]
    target_name = 'FeH'

    n_rows = len(df_train)
    # ceiling division: a trailing partial batch still counts as a batch
    expected_batches = (n_rows + batch_size - 1) // batch_size

    class CountingModel():
        """Stand-in model that only counts what ``partial_fit`` receives."""

        def __init__(self):
            self.n_samples_ = 0
            self.n_partial_fit_calls_ = 0

        def partial_fit(self, X, y):
            self.n_samples_ += X.shape[0]
            self.n_partial_fit_calls_ += 1

    predictor = IncrementalPredictor(
        model=CountingModel(),
        features=feature_names,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=False,
        prediction_name='pred',
    )
    predictor.fit(df=df_train, target=target_name)

    assert predictor.model.n_samples_ == n_rows * num_epochs
    assert predictor.model.n_partial_fit_calls_ == expected_batches * num_epochs
def test_plot_widget_bqplot():
    """Smoke test: ``plot_widget`` runs for default, float32 and minmax inputs."""
    # basic coverage for now; nothing is asserted about the output
    example = vaex.example()
    example.plot_widget(example.x, example.y)
    example.plot_widget(
        example.x.astype('float32'),
        example.y.astype('float32'),
    )
    example.plot_widget(
        example.x.astype('float32'),
        example.y.astype('float32'),
        limits='minmax',
    )
def test_df_apply(self):
    """Smoke test: apply a two-argument Python function to ``x`` and ``y``.

    The resulting object is only printed; nothing is asserted.
    """
    example = vaex.example()

    def func(x, y):
        return (x + y) / (x - y)

    applied = example.apply(func, arguments=[example.x, example.y])
    print(applied)
def test_data_array_view(flush_guard):
    """A data array view over two axes has a grid after a full flush."""
    example = vaex.example()
    x_axis = vaex.jupyter.model.Axis(df=example, expression='x')
    y_axis = vaex.jupyter.model.Axis(df=example, expression='y')
    array_view = example.widget.data_array(axes=[x_axis, y_axis])
    flush(all=True)
    assert array_view.model.grid is not None
def test_observe_expression():
    """An observer on 'expression' is called exactly once per reassignment.

    The observer must not have been called before the expression changes.
    """
    observer = MagicMock()
    example = vaex.example()
    widget = SomeWidget(df=example, expression=example.x + 1)
    widget.observe(observer, 'expression')
    observer.assert_not_called()
    widget.expression = '(x + 2)'
    observer.assert_called_once()
def test_add_dataset(self):
    """Adding a dataset to the selector should make it the current dataset.

    Checks that the selector is no longer empty, that the length label of
    the dataset panel (with thousands separators stripped) equals
    ``len(ds)``, and that the app reports the dataset as current.
    """
    app = vx.ui.main.VaexApp()
    ds = vx.example()
    app.dataset_selector.add(ds)
    # ``assert_`` is a deprecated unittest alias that was removed in
    # Python 3.12; ``assertFalse`` expresses the same check.
    self.assertFalse(app.dataset_selector.is_empty())
    shown_length = int(app.dataset_panel.label_length.text().replace(",", ""))
    self.assertEqual(shown_length, len(ds))
    self.assertEqual(ds, app.current_dataset)
def test_to_json():
    """The widget state round-trips the expression as a string.

    ``get_state`` must expose the expression as ``'(x + 1)'``; after a
    modified copy of the state is pushed back with ``set_state`` the widget
    must hold the new expression.
    """
    df = vaex.example()
    w = SomeWidget(df=df, expression=df.x + 1)
    state = w.get_state()
    assert state['expression'] == '(x + 1)'

    # work on a copy so the dict returned by get_state is left untouched
    state = state.copy()
    state['expression'] = '(x + 2)'
    w.set_state(state)
    # The original line was a bare comparison without ``assert`` and so
    # never checked anything.
    assert w.expression.expression == '(x + 2)'
def test_validate_expression():
    """Invalid expressions are rejected and leave the previous value in place.

    A syntactically broken string must raise ``SyntaxError`` and an unknown
    column name must raise ``NameError``.
    """
    example = vaex.example()
    widget = SomeWidget(df=example, expression=example.x + 1)

    widget.expression = '(x + 2)'
    assert widget.expression.expression == '(x + 2)'

    with pytest.raises(SyntaxError):
        widget.expression = 'x + '
    with pytest.raises(NameError):
        widget.expression = 'x2 + 1'

    # the failed assignments must not have changed the value
    assert widget.expression.expression == '(x + 2)'
def test_df_evaluate(self):
    """Smoke test: evaluate an applied function into a new column.

    The min, mean and max of the new column are only printed; nothing is
    asserted.
    """
    example = vaex.example()

    def func(x, y):
        return (x + y) / (x - y)

    applied = example.apply(func, arguments=[example.x, example.y])
    example['new_col'] = example.evaluate(applied)

    for statistic in (example.min, example.mean, example.max):
        print(statistic(example['new_col']))
def test_hashable():
    """Datasets can be used as dict keys, also after a rebuild."""
    x = np.arange(10)
    y = x**2
    array_dataset = dataset.DatasetArrays(x=x, y=y).hashed()
    example = vaex.example()

    lookup = {array_dataset: '1', example.dataset: '2'}

    # the very same objects find their entries ...
    assert lookup[array_dataset] == '1'
    assert lookup[example.dataset] == '2'
    # ... and so do rebuilt copies
    assert lookup[rebuild(array_dataset)] == '1'
    assert lookup[rebuild(example.dataset)] == '2'
def test_column():
    """Exercise the column widget: empty, from an expression, from an ``Axis``.

    When built from an ``Axis`` the widget value is expected to follow
    later changes of the axis expression.
    """
    example = vaex.example()

    empty_widget = example.widget.column()
    assert empty_widget.value is None

    y_widget = example.widget.column(example.y)
    assert y_widget.value == 'y'

    axis = vaex.jupyter.model.Axis(df=example, expression=example.x)
    axis_widget = example.widget.column(axis)
    assert str(axis_widget.value) == 'x'
    axis.expression = example.y
    assert str(axis_widget.value) == 'y'
def test_table(self):
    """Pressing a cell in the table view sets the dataset's current row.

    After emitting ``pressed`` for the index at row 1, column 1, the added
    dataset must report row 1 as current.  The "count from zero" checkbox
    is then checked without any further assertion.
    """
    ds = vx.example()
    self.app.dataset_selector.add(ds)
    self.dataset.set_current_row(3)
    table_panel = self.app.dataset_panel.tableview()
    self.dataset.set_current_row(0)

    cell_index = table_panel.tableModel.createIndex(1, 1)
    table_panel.tableView.pressed.emit(cell_index)
    self.assertEqual(ds.get_current_row(), 1)

    # make sure it gets rendered (is this the good way?)
    QtTest.QTest.qWait(100)
    table_panel.count_from_zero.setCheckState(QtCore.Qt.Checked)
    # make sure it gets rendered (is this the good way?)
    QtTest.QTest.qWait(10000)
def test_column_names(df_arrow):
    """Hidden (double-underscore) columns are only listed with ``hidden=True``.

    Also exercises the ``regex`` filter of ``get_column_names`` on the
    example dataset.
    """
    ds = df_arrow
    names_before = ds.get_column_names(virtual=True)
    ds['__x2'] = ds.x
    # adding a hidden column must not change the default listing
    assert names_before == ds.get_column_names(virtual=True)
    assert '__x2' in ds.get_column_names(virtual=True, hidden=True)

    ds = vaex.example()
    ds['__x'] = ds['x'] + 1
    assert 'FeH' in ds.get_column_names(regex='e*')
    assert 'FeH' not in ds.get_column_names(regex='e')
    assert '__x' not in ds.get_column_names(regex='__x')
    assert '__x' in ds.get_column_names(regex='__x', hidden=True)
def test_correlation():
    """Exercise ``df.correlation`` across its supported argument shapes.

    Pairwise values are computed first and reused to build the expected
    arrays for the list, list-with-target, binned, xarray and list-of-pairs
    variants.
    """
    df = vaex.example()

    # A single column pair, checked against numpy
    xy = df.correlation('x', 'y')
    yx = xy
    xy_expected = np.corrcoef(df.x.values, df.y.values)[0, 1]
    np.testing.assert_array_almost_equal(xy, xy_expected, decimal=5)

    # the argument order must not matter
    np.testing.assert_array_almost_equal(df.correlation('x', 'y'), df.correlation('y', 'x'))

    xx = df.correlation('x', 'x')
    yy = df.correlation('y', 'y')
    zz = df.correlation('z', 'z')

    xz = df.correlation('x', 'z')
    zx = xz
    yz = df.correlation('y', 'z')
    zy = yz

    # A list of columns
    result = df.correlation(x=['x', 'y', 'z'])
    expected = np.array((
        [xx, xy, xz],
        [yx, yy, yz],
        [zx, zy, zz],
    ))
    expected3 = expected
    np.testing.assert_array_almost_equal(result, expected)

    # A list of columns and a single target
    desired = df.correlation(x=['x', 'y', 'z'], y='z')
    expected = np.array([xz, yz, zz])
    np.testing.assert_array_almost_equal(desired, expected)

    # A list of columns and a list of targets
    result = df.correlation(x=['x', 'y', 'z'], y=['y', 'z'])
    assert result.shape == (3, 2)
    expected = np.array((
        [xy, xz],
        [yy, yz],
        [zy, zz],
    ))
    np.testing.assert_array_almost_equal(result, expected)

    result = df.correlation(x=['x', 'y', 'z'], y=['y', 'z'])

    # Binned: the first bin must match an explicit selection of that range
    result = df.correlation(['x', 'y'], binby='x', shape=4, limits=[-2, 2])
    result0 = df.correlation(['x', 'y'], selection=(df.x >= -2) & (df.x < -1))
    np.testing.assert_array_almost_equal(result[0], result0)

    # xarray output
    xar = df.correlation(['x', 'y', 'z'], array_type='xarray')
    np.testing.assert_array_almost_equal(xar.data, expected3)
    assert xar.dims == ("x", "y")
    assert xar.coords['x'].data.tolist() == ['x', 'y', 'z']
    assert xar.coords['y'].data.tolist() == ['x', 'y', 'z']

    # A list of pairs
    dfc = df.correlation([('x', 'y'), ('x', 'z'), ('y', 'z')])
    assert len(dfc) == 3
    assert dfc['x'].tolist() == ['x', 'x', 'y']
    assert dfc['y'].tolist() == ['y', 'z', 'z']
    np.testing.assert_array_almost_equal(dfc['correlation'].tolist(), [xy, xz, yz])
def webserver(request, webserver_fastapi, webserver_tornado, df_server, df_server_huge):
    """Return the web server named by ``request.param``, loaded with datasets.

    The server is looked up by name among this function's arguments.  It is
    given three datasets: a trimmed copy of ``df_server`` named 'test',
    ``df_server_huge``, and the vaex example named 'example'.
    """
    # Must be the first statement: at this point locals() holds exactly the
    # arguments, and request.param names one of them.
    webserver = locals()[request.param]

    example = vaex.example()
    example.name = 'example'

    served = df_server.copy()
    # in the fastapi we drop the state
    served = served.materialize('z')
    for column_name in ('obj', 'datetime', 'timedelta'):
        served.drop(column_name, inplace=True)
    served.name = 'test'

    webserver.set_datasets([served, df_server_huge, example])
    return webserver
def test_column_names(ds_local):
    """Hidden (double-underscore) columns are only listed with ``hidden=True``.

    Also exercises the ``regex`` filter of ``get_column_names`` on the
    example dataset.
    """
    ds = ds_local
    names_before = ds.get_column_names(virtual=True)
    ds['__x'] = ds.x
    # adding a hidden column must not change the default listing
    assert names_before == ds.get_column_names(virtual=True)
    names_with_hidden = ds.get_column_names(virtual=True, hidden=True)
    assert '__x' in names_with_hidden
    # exactly one extra name shows up when hidden columns are included
    assert len(names_before) == len(ds.get_column_names(virtual=True, hidden=True))-1

    ds = vaex.example()
    ds['__x'] = ds['x'] + 1
    assert 'FeH' in ds.get_column_names(regex='e*')
    assert 'FeH' not in ds.get_column_names(regex='e')
    assert '__x' not in ds.get_column_names(regex='__x')
    assert '__x' in ds.get_column_names(regex='__x', hidden=True)
def test_table(self):
    """Pressing a cell in the table view sets the dataset's current row.

    After emitting ``pressed`` for the index at row 1, column 1, the added
    dataset must report row 1 as current.  The "count from zero" checkbox
    is then checked without any further assertion.
    """
    ds = vx.example()
    self.app.dataset_selector.add(ds)
    self.dataset.set_current_row(3)
    table_panel = self.app.dataset_panel.tableview()
    self.dataset.set_current_row(0)

    cell_index = table_panel.tableModel.createIndex(1, 1)
    table_panel.tableView.pressed.emit(cell_index)
    self.assertEqual(ds.get_current_row(), 1)

    # make sure it gets rendered (is this the good way?)
    QtTest.QTest.qWait(100)
    table_panel.count_from_zero.setCheckState(QtCore.Qt.Checked)
    # make sure it gets rendered (is this the good way?)
    QtTest.QTest.qWait(10000)
def test_mutual_information():
    """Exercise ``df.mutual_information`` across its supported argument shapes.

    Pairwise values are computed first and reused to build the expected
    arrays for the list, list-with-target(s) and list-of-pairs variants.
    """
    df = vaex.example()

    # A single pair
    xy = df.mutual_information('x', 'y')
    yx = xy
    expected = np.array(0.068934)
    np.testing.assert_array_almost_equal(xy, expected)

    # the argument order must not matter
    np.testing.assert_array_almost_equal(df.mutual_information('y', 'x'), df.mutual_information('x', 'y'))

    xx = df.mutual_information('x', 'x')
    yy = df.mutual_information('y', 'y')
    zz = df.mutual_information('z', 'z')

    xz = df.mutual_information('x', 'z')
    zx = xz
    yz = df.mutual_information('y', 'z')
    zy = yz

    # A list of columns
    result = df.mutual_information(x=['x', 'y', 'z'])
    expected = np.array((
        [xx, xy, xz],
        [yx, yy, yz],
        [zx, zy, zz],
    ))
    np.testing.assert_array_almost_equal(result, expected)

    # A list of columns and a single target
    result = df.mutual_information(x=['x', 'y', 'z'], y='z')
    expected = np.array([xz, yz, zz])
    np.testing.assert_array_almost_equal(result, expected)

    # A list of columns and targets
    result = df.mutual_information(x=['x', 'y', 'z'], y=['y', 'z'])
    assert result.shape == (3, 2)
    expected = np.array((
        [xy, xz],
        [yy, yz],
        [zy, zz],
    ))
    np.testing.assert_array_almost_equal(result, expected)

    # a list of custom pairs
    result = df.mutual_information(x=[['x', 'y'], ['x', 'z'], ['y', 'z']])
    assert result.shape == (3,)
    expected = np.array([xy, xz, yz])
    np.testing.assert_array_almost_equal(result, expected)

    result = df.mutual_information(x=['x', 'y'], dimension=3, mi_shape=4)
    assert result.shape == (2, 2, 2)
def test_percentile_1d():
    """``median_approx`` lands on the expected side of the middle sample.

    Two five-element arrays are used; the test ends with a binned
    ``percentile_approx`` call on the example data that is only run, not
    checked.
    """
    low_middle = vaex.from_arrays(x=np.array([0, 0, 10, 100, 200]))
    median = low_middle.median_approx(low_middle.x)
    assert median < 10.

    high_middle = vaex.from_arrays(x=np.array([0, 0, 90, 100, 200]))
    median = high_middle.median_approx(high_middle.x)
    assert median > 90.

    # coverage test
    example = vaex.example()
    example.percentile_approx('x', percentage=80, binby=example.z, limits='minmax', shape=100)
def test_accessor_nested():
    """Nested lazy accessors resolve only once their parent is registered.

    While only 'spam.egg' and 'spam.egg.foo' are registered, ``df.spam``
    must raise ``AttributeError``.  After 'spam' itself is registered, each
    level must return the same object on repeated access, be of the
    registered class and point back to its parent.
    """
    df = vaex.example()

    vaex._add_lazy_accessor('spam.egg', lambda: Egg)
    with pytest.raises(expected_exception=AttributeError):
        a = df.spam

    vaex._add_lazy_accessor('spam.egg.foo', lambda: Foo)
    with pytest.raises(expected_exception=AttributeError):
        a = df.spam

    vaex._add_lazy_accessor('spam', lambda: Spam)

    # first level
    assert df.spam is df.spam
    assert df.spam.df is df
    assert isinstance(df.spam, Spam)

    # second level
    assert df.spam.egg is df.spam.egg
    assert df.spam.egg.spam is df.spam
    assert isinstance(df.spam.egg, Egg)

    # third level
    assert df.spam.egg.foo is df.spam.egg.foo
    assert df.spam.egg.foo.df is df.spam.egg  # abuse of foo
    assert isinstance(df.spam.egg.foo, Foo)
def main(argv):
    """Parse the command line, open the datasets and start the web server.

    Args:
        argv: Full argument vector.  ``argv[0]` is passed to
            ``ArgumentParser`` as its first argument; the remaining items
            are parsed as options and dataset filenames.

    Every filename is opened with ``vx.open``.  Files for which that returns
    ``None`` are reported on stdout and skipped.  When no dataset is left,
    ``vx.example()`` is served instead.
    """
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("filename", help="filename for dataset", nargs='*')
    parser.add_argument("--address", help="address to bind the server to (default: %(default)s)", default="0.0.0.0")
    parser.add_argument("--port", help="port to listen on (default: %(default)s)", type=int, default=9000)
    parser.add_argument('--verbose', '-v', action='count', default=2)
    parser.add_argument('--cache', help="cache size in bytes for requests, set to zero to disable (default: %(default)s)", type=int, default=500000000)
    parser.add_argument('--compress', help="compress larger replies (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-compress', dest="compress", action='store_false')
    parser.add_argument('--development', default=False, action='store_true', help="enable development features (auto reloading)")
    parser.add_argument('--threads-per-job', default=4, type=int, help="threads per job (default: %(default)s)")
    config = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    # ``--verbose`` counts upwards from its default of 2, so ``-vv`` yields 4
    # and would index past the end of the list; clamp to the most verbose
    # level instead of raising IndexError.
    level_index = min(config.verbose, len(verbosity) - 1)
    logging.getLogger("vaex").setLevel(verbosity[level_index])

    # NOTE(review): ``settings`` is not used below; the import is kept in
    # case it is needed for its side effects - confirm before removing.
    from vaex.settings import webserver as settings

    datasets = []
    for filename in config.filename:
        ds = vx.open(filename)
        if ds is None:
            print("error opening file: %r" % filename)
        else:
            datasets.append(ds)
    # fall back to the example dataset when nothing could be opened
    datasets = datasets or [vx.example()]

    logger.info("datasets:")
    for dataset in datasets:
        logger.info("\thttp://%s:%d/%s or ws://%s:%d/%s", config.address, config.port, dataset.name, config.address, config.port, dataset.name)

    server = WebServer(datasets=datasets, address=config.address, port=config.port, cache_byte_size=config.cache,
                       compress=config.compress,
                       development=config.development, threads_per_job=config.threads_per_job)
    server.serve()
def test_widget_histogram(flush_guard, no_vaex_cache):
    """Exercise the histogram widget: grid contents, toolbar modes, redraw.

    Reference counts are computed up front with plain selections and then
    compared with the sums of the widget's selection grid.
    """
    df = vaex.example()
    assert df.widget is df.widget

    df.select_box(['x'], [[-10, 20]], name='check')
    n_in_range = df.count(selection='check')
    df.select(df.x > 0)
    n_positive = df.count(selection='default')

    histogram_widget = df.widget.histogram('x', selection=[None, "default"], toolbar=True)
    flush()
    # for some reason, because 'x' it float32, we don't need -1
    assert histogram_widget.model.grid[1].sum() == n_positive

    # switching the toolbar mode swaps the interaction on the figure
    tools = histogram_widget.toolbar
    tools.interact_value = "pan-zoom"
    assert isinstance(histogram_widget.plot.figure.interaction, bqplot.interacts.PanZoom)
    tools.interact_value = "select-x"
    assert isinstance(histogram_widget.plot.figure.interaction, bqplot.interacts.BrushIntervalSelector)

    # brushing an interval should select the same rows as select_box
    histogram_widget.plot.figure.interaction.selected = [-10, 20]
    flush(all=True)
    assert histogram_widget.model.grid.shape[0] == 2
    assert histogram_widget.model.grid[1].sum() == n_in_range

    # an unknown mode leaves the figure without interaction
    tools.interact_value = "doesnotexit"
    assert histogram_widget.plot.figure.interaction is None

    # coverage
    histogram_widget.plot.highlight(0)
    histogram_widget.plot.highlight(None)

    bars_before = histogram_widget.plot.mark.y.tolist()
    histogram_widget.model.x_slice = 10
    assert histogram_widget.plot.mark.y.tolist() == bars_before
    histogram_widget.dimension_groups = 'slice'
    assert histogram_widget.plot.mark.y.tolist() != bars_before
def main(argv):
    """Parse the command line, open the datasets and start the web server.

    Args:
        argv: Full argument vector.  ``argv[0]`` is passed to
            ``ArgumentParser`` as its first argument; the remaining items
            are parsed as options and dataset filenames.

    Every filename is opened with ``vx.open``.  Files for which that returns
    ``None`` are reported on stdout and skipped.  When no dataset is left,
    ``vx.example()`` is served instead.
    """
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("filename", help="filename for dataset", nargs='*')
    parser.add_argument("--address", help="address to bind the server to (default: %(default)s)", default="0.0.0.0")
    parser.add_argument("--port", help="port to listen on (default: %(default)s)", type=int, default=9000)
    parser.add_argument('--verbose', '-v', action='count', default=2)
    parser.add_argument('--cache', help="cache size in bytes for requests, set to zero to disable (default: %(default)s)", type=int, default=500000000)
    parser.add_argument('--compress', help="compress larger replies (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-compress', dest="compress", action='store_false')
    parser.add_argument('--development', default=False, action='store_true', help="enable development features (auto reloading)")
    parser.add_argument('--threads-per-job', default=4, type=int, help="threads per job (default: %(default)s)")
    config = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    # ``--verbose`` counts upwards from its default of 2, so ``-vv`` yields 4
    # and would index past the end of the list; clamp to the most verbose
    # level instead of raising IndexError.
    level_index = min(config.verbose, len(verbosity) - 1)
    logging.getLogger("vaex").setLevel(verbosity[level_index])

    # NOTE(review): ``settings`` is not used below; the import is kept in
    # case it is needed for its side effects - confirm before removing.
    from vaex.settings import webserver as settings

    datasets = []
    for filename in config.filename:
        ds = vx.open(filename)
        if ds is None:
            print("error opening file: %r" % filename)
        else:
            datasets.append(ds)
    # fall back to the example dataset when nothing could be opened
    datasets = datasets or [vx.example()]

    logger.info("datasets:")
    for dataset in datasets:
        logger.info("\thttp://%s:%d/%s or ws://%s:%d/%s", config.address, config.port, dataset.name, config.address, config.port, dataset.name)

    server = WebServer(datasets=datasets, address=config.address, port=config.port, cache_byte_size=config.cache,
                       compress=config.compress,
                       development=config.development, threads_per_job=config.threads_per_job)
    server.serve()
def test_percentile_approx():
    """``percentile_approx`` matches reference values to one decimal.

    Covers a single percentage, a list of percentages and a list of
    expressions, all on the example dataset.
    """
    example = vaex.example()

    # Simple test
    z_99 = example.percentile_approx('z', percentage=99)
    np.testing.assert_almost_equal(z_99, 15.1739, decimal=1)

    # Test for multiple percentages
    x_percentiles = example.percentile_approx(
        'x',
        percentage=[0, 25, 50, 75, 100],
        percentile_shape=65536,
    )
    np.testing.assert_array_almost_equal(
        x_percentiles,
        [-78.133026, -3.5992, -0.0367, 3.4684, 130.49751],
        decimal=1,
    )

    # Test for multiple expressions
    xy_percentiles = example.percentile_approx(['x', 'y'], percentage=[33, 66])
    xy_expected = np.array(([-2.3310, 1.9540], [-2.4313, 2.1021]))
    np.testing.assert_array_almost_equal(xy_percentiles, xy_expected, decimal=1)
def setUp(self):
    """Store the vaex example dataset on the test instance as ``self.dataset``."""
    example = vx.example()
    self.dataset = example
task._result = task.reduce(task._results) task.fulfill(task._result) # remove references task._result = None task._results = None self.signal_end.emit() # if new tasks were added as a result of this, execute them immediately # TODO: we may want to include infinite recursion protection self._is_executing = False if len(self.task_queue) > 0: logger.debug("task queue not empty.. start over!") self.execute() finally: self._is_executing = False if __name__ == "__main__": import vaex import sys vaex.set_log_level_debug() server = vaex.server(sys.argv[1], port=int(sys.argv[2])) datasets = server.datasets() print(datasets) dataset = datasets[0] dataset = vaex.example() print(dataset("x").minmax()) dataset.select("x < 0") print(dataset.selected_length(), len(dataset)) print(dataset("x").selected().is_masked) print(dataset("x").selected().minmax())