def test_export(self):
    path_hdf5 = tempfile.mktemp(".hdf5")
    path_hdf5_ui = tempfile.mktemp(".hdf5")
    path_fits = tempfile.mktemp(".fits")
    path_fits_ui = tempfile.mktemp(".fits")

    for dataset in [self.dataset]:
        self.app.dataset_selector.add(dataset)
        for fraction in [1, 0.5]:
            dataset.set_active_fraction(fraction)
            dataset.select("x > 3")
            length = len(dataset)
            # TODO: gui doesn't export virtual columns, add "z" to this list
            for column_names in [["x", "y"], ["x"], ["y"]]:
                for byteorder in "=<>":
                    for shuffle in [False, True]:
                        for selection in [False, True]:
                            exports = [dataset.export_fits, dataset.export_hdf5] if byteorder == ">" else [dataset.export_hdf5]
                            for export in exports:
                                type = "hdf5" if export == dataset.export_hdf5 else "fits"
                                if shuffle and selection:
                                    continue  # TODO: export should fail on this combination
                                # print(column_names, byteorder, shuffle, selection, type)
                                if export == dataset.export_hdf5:
                                    path = path_hdf5
                                    path_ui = path_hdf5_ui
                                    export(path, column_names=column_names, byteorder=byteorder, shuffle=shuffle, selection=selection)
                                else:
                                    path = path_fits
                                    path_ui = path_fits_ui
                                    export(path, column_names=column_names, shuffle=shuffle, selection=selection)
                                compare_direct = vx.open(path)
                                dialogs.set_choose(1 if selection else 0).then("=<>".index(byteorder))
                                # select columns
                                dialogs.set_select_many(True, [name in column_names for name in dataset.get_column_names()])
                                counter_confirm = CallCounter(return_value=shuffle)
                                counter_info = CallCounter()
                                dialogs.dialog_confirm = counter_confirm
                                dialogs.dialog_info = counter_info
                                dialogs.get_path_save = lambda *args: path_ui
                                dialogs.ProgressExecution = dialogs.FakeProgressExecution
                                import sys
                                sys.stdout.flush()
                                self.app.export(type=type)
                                compare_ui = vx.open(path_ui)
                                column_names = column_names or ["x", "y", "z"]
                                self.assertEqual(compare_direct.get_column_names(), compare_ui.get_column_names())
                                for column_name in column_names:
                                    values_ui = compare_ui.evaluate(column_name)
                                    values = compare_direct.evaluate(column_name)
                                    self.assertEqual(sorted(values), sorted(values_ui))

def stat_main(argv):
    parser = make_stat_parser(argv[0])
    args = parser.parse_args(argv[1:])
    import vaex
    dataset = vaex.open(args.dataset)
    if dataset is None:
        print("Cannot open input: %s" % args.dataset)
        sys.exit(1)
    print("dataset:")
    print(" length: %s" % len(dataset))
    print(" full_length: %s" % dataset.full_length())
    print(" name: %s" % dataset.name)
    print(" path: %s" % dataset.path)
    print(" columns: ")
    desc = dataset.description
    if desc:
        print(" description: %s" % desc)
    for name in dataset.get_column_names():
        print(" - %s: " % name)
        desc = dataset.descriptions.get(name)
        if desc:
            print(" \tdescription: %s" % desc)
        unit = dataset.unit(name)
        if unit:
            print(" \tunit: %s" % unit)
        dtype = dataset.dtype(name)
        print(" \ttype: %s" % dtype.name)

def fetch(self):
    ds = self.fetch_multi()
    if len(self.filenames) > 1:
        if not os.path.exists(self.filename_single):
            ds.export_hdf5(self.filename_single)
        ds = vx.open(self.filename_single)
    return ds

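# A minimal sketch of the same caching pattern using only public vaex calls,
# with hypothetical file names; open_many combines several files, and
# re-opening the exported hdf5 yields a single memory-mapped dataset.
# (Assumes vaex imported as vx, as elsewhere in this code.)
ds = vx.open_many(["part1.hdf5", "part2.hdf5"])
ds.export_hdf5("combined.hdf5")
ds = vx.open("combined.hdf5")
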
def main(argv=sys.argv):
    dataset = vaex.open(argv[1])
    app = QtGui.QApplication(argv)
    table = VariablesTable(None)
    table.set_dataset(dataset)
    table.show()
    table.raise_()
    sys.exit(app.exec_())

def test_open():
    # path = 'tests/data/gassphere_littleendian.dat'  # unused alternative dataset
    path = 'tests/data/galaxy_littleendian.dat'
    # ds = vaex.file.other.MemoryMappedGadget(path)
    ds = vaex.open(path)
    assert ds is not None
    assert not np.isnan(ds.x.min())
    print(ds.x.minmax())

def main(argv):
    global subspace, dataset, limits
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("-N", help="run each batch N times (default: %(default)s)", type=int, default=5)
    parser.add_argument("-f", "--fraction", help="fraction of dataset to use (default: %(default)s)", default=1., type=float)
    parser.add_argument("-b", "--buffer-size", default=vx.execution.buffer_size_default, help="buffer size per thread (default: %(default)s)", type=int)
    parser.add_argument("-r", "--repeat", default=5, help="repeat benchmark X times (default: %(default)s)", type=int)
    parser.add_argument("-c", "--cpu-count", default=vx.multithreading.thread_count_default, help="thread count/cpu count (default: %(default)s)", type=int)
    parser.add_argument("filename", help="filename of dataset to use")
    parser.add_argument("expressions", help="list of expressions to export (or all when empty)", nargs="*")
    args = parser.parse_args(argv[1:])

    vx.execution.buffer_size_default = args.buffer_size
    vx.multithreading.thread_count_default = args.cpu_count
    progressbar = False
    fn = args.filename
    # print(("opening", fn))
    dataset = vx.open(fn)
    dataset.set_active_fraction(args.fraction)
    # dataset = vx.open_many(fn)
    expressions = args.expressions
    # print("subspace", expressions)
    subspace = dataset(*expressions)
    itemsize = dataset.columns[expressions[0]].dtype.itemsize
    byte_size = len(dataset) * len(expressions) * itemsize
    byte_size1 = len(dataset) * itemsize
    # sums = subspace.sum()
    limits = subspace.minmax()
    print(limits)
    N = args.N
    # print("benchmarking minmax")
    # expr = "subspace.minmax()"
    # expr = "dataset.count('{}')".format(expressions[0])
    # times = timeit.repeat(expr, setup="from vaex.benchmark import subspace, dataset, np", repeat=args.repeat, number=N)
    # print("minimum time", min(times) / N)
    # bandwidth = [byte_size1 / 1024.**3 / (time / N) for time in times]
    # print("%f GiB/s" % max(bandwidth))
    # speed = [len(dataset) / (time / N) / 1e9 for time in times]
    # print("%f billion rows/s " % max(speed))

    print()
    print("benchmarking histogram")
    expr = "dataset.count(binby=['{}', '{}'], limits=limits)".format(*expressions)
    times = timeit.repeat(expr, setup="from vaex.benchmark import subspace, dataset, np, limits", repeat=args.repeat, number=N)
    print("minimum time", min(times) / N)
    bandwidth = [byte_size / 1024.**3 / (time / N) for time in times]
    print("%f GiB/s" % max(bandwidth))
    speed = [len(dataset) / (time / N) / 1e9 for time in times]
    print("%f billion rows/s " % max(speed))
    print()

def test_string_operations_from_mmap_file(tmpdir):
    # if we write the file to disk and mmap it read only, we trigger invalid memory writes
    # see https://github.com/vaexio/vaex/pull/459
    x = np.arange(5)
    y = np.array(['This', 'is', 'a', None, 'test'])
    df = vaex.from_arrays(x=x, y=y)
    filename = str(tmpdir / 'test.hdf5')
    df.export_hdf5(filename)
    df_from_file = vaex.open(filename)
    assert df_from_file.y.str.slice(start=0, stop=2).tolist() == ['Th', 'is', 'a', None, 'te']
    assert df_from_file.y.str.upper().tolist() == ['THIS', 'IS', 'A', None, 'TEST']

def test_dtype_unicode_string(tmpdir):
    # CHANGE: before vaex v4 we worked with unicode, now we lazily cast to arrow
    x = np.arange(8, 12)
    s = np.array(list(map(str, x)), dtype='U')
    df = vaex.from_arrays(x=x, s=s)
    assert df.columns['s'].type == pa.string()
    path = str(tmpdir.join('test.arrow'))
    df.export(path)
    df_read = vaex.open(path, as_numpy=False)
    # the data type of x is different (arrow vs numpy)
    assert df_read.compare(df) == ([], [], ['x'], [])

def main():
    logger.info('Starting...')
    logger.info(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')
    df = vaex.open(str(CSV_FILE_PATH), convert=str(HDF_FILE_PATH))
    logger.info(f'HDF5 Stored Size: {HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')
    df.export(str(ARROW_FILE_PATH))
    logger.info(f'ARROW Stored Size: {ARROW_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')
    logger.info('Finished!')

def df_parquet_cache(scope="session"):
    df = create_base_ds()
    df.drop('obj', inplace=True)
    df.drop('timedelta', inplace=True)
    df.drop('z', inplace=True)
    path = HERE / 'data' / 'unittest.parquet'
    pyarrow.parquet.write_table(df.to_arrow_table(), str(path), row_group_size=2)
    df = vaex.open(str(path))
    df.select('(x >= 0) & (x < 10)', name=vaex.dataframe.FILTER_SELECTION_NAME)
    df.add_virtual_column("z", "x+t*y")
    df.set_variable("t", 1.)
    return df

def split(dir):
    data = vaex.open(dir, convert=True).to_pandas_df()
    cv = KFold(n_splits=5)
    i = 1
    for t, v in cv.split(data):
        print(f"{i}th split processing...")
        train = data.iloc[t]
        train.to_csv(f'./train_{i}.csv')
        validation = data.iloc[v]
        validation.to_csv(f'./validation_{i}.csv')
        i += 1

def test_concat(df_file, tmpdir):
    path = tmpdir / 'test2.hdf5'
    df_file[['x']].export(path)
    df_concat = vaex.open(path)
    df = vaex.concat([df_file, df_concat])
    assert len(pickle.dumps(df)) < 2000
    df2 = pickle.loads(pickle.dumps(df))
    assert len(df) == len(df_file) * 2
    assert len(df2) == len(df_file) * 2
    # assert df.compare(df2) == ([], [], [], [])
    assert df2.x.count() == len(df_file) * 2, 'x is repeated'
    assert df2.x.sum() == df_file.x.sum() * 2, 'x is repeated'
    assert df2.y.sum() == df_file.y.sum(), 'y is not repeated'

def test_cloud_dataset_masked(base_url, file_format, cache):
    # For now, caching of arrow & parquet is not supported
    kwargs = {}
    if file_format == 'csv':
        kwargs = dict(dtype={'x': 'Int64', 'y': 'Int64', 's': 'string'})
    df = vaex.open(f'{base_url}/testing/xys-masked.{file_format}?cache={cache}', fs_options=fs_options, **kwargs)
    assert df.x.tolist() == [1, None]
    assert df.y.tolist() == [None, 4]
    assert df.s.tolist() == ['5', None]
    assert df.x.count() == 1
    assert df.s.count() == 1
    assert df.x.sum() == 1

def prepareData(filename):
    df = vaex.open(filename)
    Class = df.relevance.unique()
    # dictionary mapping each class of the target variable to an integer code
    Class_dict = dict(zip(Class, range(len(Class))))
    X = df.copy().drop(['relevance'])
    encoder = vaex.ml.OneHotEncoder(features=['relevance'])
    df = encoder.fit_transform(df)
    y = df[df.get_column_names(regex=r'relevance_.*')]
    train_x, test_x, train_y, test_y = train_test_split(
        X.to_pandas_df(), y.to_pandas_df(), test_size=0.3, random_state=5)
    return train_x, test_x, train_y, test_y, Class_dict

def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x * 2})
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1'] * 2).tolist() == (x * 2).tolist()
    assert (df['class']).tolist() == (x * 2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()

def setup(self, N):
    self.df = vaex.open(generate_numerical())[:N]
    self.df.categorize(self.df.i8_10, min_value=5, max_value=15, inplace=True)
    self.df.categorize(self.df.i4_10, min_value=5, max_value=15, inplace=True)
    self.df.categorize(self.df.i2_10, min_value=5, max_value=15, inplace=True)
    self.df.categorize(self.df.i1_10, min_value=5, max_value=15, inplace=True)
    self.df.categorize(self.df.i8_1K, min_value=5, max_value=1_000 + 5, inplace=True)
    self.df.categorize(self.df.i4_1K, min_value=5, max_value=1_000 + 5, inplace=True)
    self.df.categorize(self.df.i2_1K, min_value=5, max_value=1_000 + 5, inplace=True)
    # self.df.categorize(self.df.i1_1K, min_value=5, max_value=1_000 + 5)
    self.df.categorize(self.df.i8_1M, min_value=5, max_value=1_000_000 + 5, inplace=True)
    self.df.categorize(self.df.i4_1M, min_value=5, max_value=1_000_000 + 5, inplace=True)

def open_main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', default=False, action='store_true', help="give extra output")
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")
    parser.add_argument('--dry-run', '-n', default=False, action='store_true', help="do not actually execute commands (like delete)")
    parser.add_argument('--delete', help="delete file when reading fails", default=False, action='store_true')
    parser.add_argument("input", help="list of files to try to open", nargs="*")
    args = parser.parse_args(argv[1:])

    import vaex
    import vaex.file
    failed = False
    if args.verbose:
        print(f"Checking files {', '.join(args.input)}")
    for path in args.input:
        try:
            vaex.open(path)
        except BaseException as e:
            failed = True
            if not args.quiet:
                print(e)
            if args.delete:
                if not args.quiet:
                    print(f'rm {path}')
                if not args.dry_run:
                    try:
                        vaex.file.remove(path)
                    except FileNotFoundError:
                        pass
    if args.verbose:
        if failed:
            print("Oops, had issues opening some files")
        else:
            print("All files could be opened")
    return 123 if failed else 0

def test_open_several_medium_csv_convert():
    csv_glob = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_*.csv'
    for path in glob.glob(csv_glob):
        os.remove(path + '.hdf5')
    os.remove('/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_0.csv_and_3_more.hdf5')

    start = datetime.now()
    df = vaex.open(csv_glob, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.format(
        duration, df.length(), int(df.length() / duration.total_seconds())))
    assert df.length() == 3_999_999

def test_invalid_name_read(tmpdir):
    # earlier versions of vaex could write invalid names, check that we can read those
    df = vaex.from_dict({'x': x})
    # df.columns['1'] = df.columns.pop('x')
    # df.column_names = ['1']
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    h5 = h5py.File(path, 'r+')
    h5['/table/columns']['1'] = h5['/table/columns']['x']
    del h5['/table/columns']['x']
    h5.close()
    df = vaex.open(path)
    assert df['1'].tolist() == x.tolist()
    assert (df.copy()['1'] * 2).tolist() == (x * 2).tolist()

def test_categorical(tmpdir):
    # based on https://github.com/vaexio/vaex/issues/399
    path = str(tmpdir.join('test.arrow'))
    table = pa.Table.from_pandas(pdf)
    with pa.OSFile(path, 'wb') as sink:
        with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
            writer.write_table(table)
    with pa.OSFile(path, 'rb') as source:
        pdf2 = pa.ipc.open_stream(source).read_pandas()
    df = vaex.open(path)
    assert df.col1.tolist() == ["DEF"]
    assert df.is_category(df.col1)
    assert df.category_labels(df.col1) == ['ABC', 'DEF']

def load_titanic():
    '''Returns the classic Titanic dataset.

    Description of the columns can be found in dataset.description.

    Example:
    ========
    >>> import vaex.ml
    >>> df = vaex.ml.datasets.load_titanic()
    >>> print(df.description)
    >>> df.describe()
    '''
    dirname = os.path.dirname(__file__)
    return vaex.open(os.path.join(dirname, 'titanic.hdf5'))

def test_partitioning_basics_hive():
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_hive', ignore_errors=True)
    pq.write_to_dataset(table, data_path / 'parquet_dataset_partitioned_hive', partition_cols=['year', 'country'])
    ds = pa.dataset.dataset(data_path / 'parquet_dataset_partitioned_hive', partitioning="hive")
    df = vaex.open(data_path / 'parquet_dataset_partitioned_hive', partitioning="hive")
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)

def doCalculate(filename):
    df = vaex.open(f'{filename}.arrow')
    startTime = int(time.time())
    # df = vaex.open('MSLR-WEB10K/Fold1/train.arrow')
    qidList = df.qid.unique()
    ndcgList = []
    for qid in qidList:
        df.select(df.qid == qid)
        true_relevance = df.evaluate(df.relevance, selection=True)
        if len(true_relevance) < 2:
            ndcgList.append(1)
            continue
        scores = np.asarray(range(0, len(true_relevance))[::-1])
        ndcg = ndcg_score(np.asarray([true_relevance]), np.asarray([scores]))
        ndcgList.append(ndcg)
    endTime = int(time.time())
    print(f'time used {endTime - startTime} sec')
    print(np.array(ndcgList).mean())

def read_dataframe_from_file(path: str) -> Optional[vaex.dataframe.DataFrame]:
    """Only read dataframes present in data/processed.

    Args:
        path: path relative to data/processed.

    Returns:
        vaex dataframe.
    """
    path_to_processed_data = os.path.join(get_base_data_path(), "processed", path)
    file_path = f"{path_to_processed_data}.arrow"
    if not os.path.exists(file_path):
        raise OSError(f"File not found: {file_path}")
    return vaex.open(file_path)

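# A minimal usage sketch for read_dataframe_from_file; "events" is a
# hypothetical name, assuming <base>/processed/events.arrow exists.
df = read_dataframe_from_file("events")
print(df.head(5))
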
def test_open_two_big_csv_convert():
    big_and_biggest_csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1*.csv'
    os.remove('/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1_01.csv.hdf5')
    os.remove('/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv.hdf5')
    os.remove('/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv_and_1_more.hdf5')

    start = datetime.now()
    df = vaex.open(big_and_biggest_csv, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.format(
        duration, df.length(), int(df.length() / duration.total_seconds())))

def _from_csv_convert_and_read(filename_or_buffer, path_output, chunk_size, fs_options, fs=None, copy_index=False, progress=None, **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)
    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so we cast progress to something truthy
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' % (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' % (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)
        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate file %s used to convert %s to single file %s: %s',
                          df_path, csv_path, path_output, e)

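# This helper is normally reached through the public API; a minimal sketch
# with hypothetical paths, using the convert/chunk_size pattern that appears
# elsewhere in this code:
df = vaex.from_csv('big.csv', convert=True, chunk_size=1_000_000)  # chunked conversion to big.csv.hdf5
df = vaex.open('big.csv', convert=True)  # later opens reuse the converted hdf5
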
def __init__(self, path2hdf, path2tdf):
    """
    Args:
        path2hdf (str): Path to the hdf5-stored timsTOF dataset.
        path2tdf (str): Path to the 'analysis.tdf' sqlite3 DB.
    """
    import vaex
    self.path2hdf = pathlib.Path(path2hdf)
    self.path2tdf = pathlib.Path(path2tdf)
    self.df = vaex.open(str(self.path2hdf))
    self.columns = tuple(self.df.columns)
    self.frames = self.table2df('frames')
    self.min_frame = self.frames.Id.min()
    self.max_frame = self.frames.Id.max()
    self.frames_no = self.max_frame - self.min_frame + 1
    self._ms1_mask = self.frames.MsMsType.values == 0
    self.ms1_frames = self.frames.Id[self._ms1_mask].values
    self.retention_time = self.frames.Time

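# A hypothetical usage sketch; the class name TimsData is assumed, since only
# __init__ is shown above, and the paths are placeholders.
data = TimsData("run1/data.hdf5", "run1/analysis.tdf")
print(data.frames_no, "frames,", len(data.ms1_frames), "of which are MS1")
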
def test_pick_file(tmpdir, file_extension):
    x = np.arange(N_rows, dtype='i8')
    df = vaex.from_arrays(x=x, x2=-x)
    df['y'] = df.x**2
    data = pickle.dumps(df)
    # if the data is in memory, pickle will be large
    assert len(data) > len(x) * x.itemsize
    xsum = df.x.sum()
    ysum = df.y.sum()
    # but on disk, it should just pickle the file path
    # TODO: arrow is not supported yet
    for ext in 'hdf5 parquet'.split():
        path = tmpdir / f'test.{ext}'
        df.export(path)
        df = vaex.open(path)
        data = pickle.dumps(df)
        assert len(data) < 1000
        assert df.x.sum() == xsum
        assert df.y.sum() == ysum

def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert df.get_column_names() == ['X!1', 'class']
    assert df.get_column_names(alias=False) != ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()
    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())

def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(df['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(df['class']) != 'class', "keyword cannot be an expression"
    assert df.get_column_names() == ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()
    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())

def main(argv):
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("filename", help="filename for dataset", nargs='*')
    parser.add_argument("--address", help="address to bind the server to (default: %(default)s)", default="0.0.0.0")
    parser.add_argument("--port", help="port to listen on (default: %(default)s)", type=int, default=9000)
    parser.add_argument('--verbose', '-v', action='count', default=2)
    parser.add_argument('--cache', help="cache size in bytes for requests, set to zero to disable (default: %(default)s)", type=int, default=500000000)
    parser.add_argument('--compress', help="compress larger replies (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-compress', dest="compress", action='store_false')
    parser.add_argument('--development', default=False, action='store_true', help="enable development features (auto reloading)")
    parser.add_argument('--threads-per-job', default=4, type=int, help="threads per job (default: %(default)s)")
    # config = layeredconfig.LayeredConfig(defaults, env, layeredconfig.Commandline(parser=parser, commandline=argv[1:]))
    config = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[config.verbose])
    # import vaex
    # vaex.set_log_level_debug()
    from vaex.settings import webserver as settings

    filenames = config.filename
    datasets = []
    for filename in filenames:
        ds = vx.open(filename)
        if ds is None:
            print("error opening file: %r" % filename)
        else:
            datasets.append(ds)
    datasets = datasets or [vx.example()]
    logger.info("datasets:")
    for dataset in datasets:
        logger.info("\thttp://%s:%d/%s or ws://%s:%d/%s", config.address, config.port, dataset.name, config.address, config.port, dataset.name)
    server = WebServer(datasets=datasets, address=config.address, port=config.port,
                       cache_byte_size=config.cache, compress=config.compress,
                       development=config.development, threads_per_job=config.threads_per_job)
    server.serve()

def load_iris():
    '''Load and return the iris dataset (classification).

    The iris dataset is a classic and very easy multi-class classification dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Example:
    ========
    >>> import vaex.ml
    >>> df = vaex.ml.datasets.load_iris()
    >>> df.describe()
    '''
    dirname = os.path.dirname(__file__)
    return vaex.open(os.path.join(dirname, 'iris.hdf5'))

def binary_to_df(file: str, filePattern: str):
    """Convert any of the supported binary formats into a vaex dataframe.

    Args:
        file (str): Path to input file.
        filePattern (str): Extension pattern of the file to convert.

    Returns:
        Vaex dataframe.
    """
    binary_patterns = [".*.fits", ".*.arrow", ".*.parquet", ".*.hdf5"]
    logger.info("binary_to_df: Scanning directory for binary file pattern...")
    if filePattern in binary_patterns:
        # vaex.open reads fits/arrow/parquet/hdf5 directly
        df = vaex.open(file)
        return df
    else:
        raise FileNotFoundError(
            "No supported binary file extensions were found in the directory. Please check the file directory again."
        )

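# A minimal usage sketch for binary_to_df; the path and pattern are
# hypothetical and must match one of the binary_patterns above.
df = binary_to_df("data/events.hdf5", ".*.hdf5")
print(df.head(5))
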
def create_scats_ml_model():
    print("starting scats ml modeling")
    # load the existing csv into a vaex dataframe
    if not os.path.exists(finalScatsPath + ".hdf5"):
        vaex.from_csv(finalScatsPath, convert=True, copy_index=False, chunk_size=1_000_000)
    df = vaex.open(finalScatsPath + ".hdf5", shuffle=True)

    # transform the features into more machine-learning-friendly variables
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)
    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)
    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)
    print("dataWrangling done, ready to create model, time: {}s".format(duration()))

    # create a randomForestRegression model
    vaex_model = Predictor(
        features=df.get_column_names(regex=r"pca[\d]") + df.get_column_names(regex=r".*_[xy]"),
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )
    # here we fit and train the model
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
    print("\n\nmodel created, time: {}s".format(duration()))
    dump(value=vaex_model, filename=model_out, compress=3)
    print("model written to output, time: {}s".format(duration()))
    return

def main():
    print(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')
    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )
    TMP_PATH.mkdir(parents=True, exist_ok=True)
    for i, chunk in enumerate(stream):
        print(f'Processing {i + 1}-th chunk containing "{len(chunk)}" rows of data...')
        df_chunk = vaex.from_pandas(chunk, copy_index=False)
        export_path = TMP_PATH / f'part_{i}.hdf5'
        df_chunk.export_hdf5(str(export_path))
    df = vaex.open(str(TMP_PATH / 'part*'))
    df.export_hdf5(str(COLUMNAR_HDF_FILE_PATH))
    print(f'HDF5 Stored Size: {COLUMNAR_HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')
    rmtree(TMP_PATH)

def vaex_read(fn):
    import vaex
    df = vaex.open(fn)
    print(df.schema())
    print(df)

parser.add_argument('--npandas', dest="npandas", type=float, default=7, help="number of rows to use for pandas")
parser.add_argument('--filter', dest="filter", default=None, help="filter for benchmark")
parser.add_argument('--filename', default=default_filename, help='filename to use for benchmark export/reading')
args = parser.parse_args(argv[1:])

use_dask = False
if not os.path.exists(args.filename):
    x = np.arange(0, int(10**args.nmax))
    xs = x.astype(str)
    s = xs  # vaex.string_column(xs)
    df_vaex = vaex.from_arrays(x=s, s=s)
    df_vaex.export(args.filename, progress=True, shuffle=True)

df = vaex.open(args.filename)
df_vaex = df[0:int(10**args.n)]
df_vaex.executor.buffer_size = len(df_vaex) // args.partitions
df_pandas = df[:int(10**args.npandas)].to_pandas_df()
if use_dask:
    df_dask = dd.from_pandas(df_pandas, npartitions=4)
timings = {}


def mytimeit(expr, N, scope):
    times = []
    for i in range(N):
        t0 = time.time()
        eval(expr, scope)
        times.append(time.time() - t0)
    return times


def test(name, expr):

def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--list', '-l', default=False, action='store_true', help="list columns of input")
    subparsers = parser.add_subparsers(help='type of subtask', dest="task")

    parser_export = subparsers.add_parser('export', help='read meta info')
    parser_export.add_argument("input", help="input dataset")
    parser_export.add_argument('output', help='output file (.yaml or .json)')
    parser_export.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_export.add_argument('--all', dest="all", action='store_true', default=False, help="also export missing values (useful for having a template)")

    parser_import = subparsers.add_parser('import', help='read meta info')
    parser_import.add_argument('input', help='input meta file (.yaml or .json)')
    parser_import.add_argument("output", help="output dataset")
    parser_import.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_import.add_argument('--overwrite', help="overwrite existing entries", default=False, action='store_true')
    parser_import.add_argument('--description', help="overwrite description", default=None)

    args = parser.parse_args(argv[1:])
    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    if args.task == "export":
        ds = vaex.open(args.input)
        column_names = ds.get_column_names(strings=True, virtual=True)
        if args.all:
            output_data = dict(
                description=ds.description,
                descriptions={name: ds.descriptions.get(name, "") for name in column_names},
                ucds={name: ds.ucds.get(name, "") for name in column_names},
                units={name: str(ds.units.get(name, "")) for name in column_names},
            )
        else:
            output_data = dict(
                description=ds.description,
                descriptions=ds.descriptions,
                ucds=ds.ucds,
                units={name: str(unit) for name, unit in ds.units.items()},
            )
        if args.output == "-":
            yaml.safe_dump(output_data, sys.stdout, default_flow_style=False)
        else:
            vaex.utils.write_json_or_yaml(args.output, output_data)
            print("wrote %s" % args.output)
    if args.task == "import":
        if args.input == "-":
            data = yaml.safe_load(sys.stdin)
        else:
            data = vaex.utils.read_json_or_yaml(args.input)
        ds = vaex.open(args.output)
        units = data["units"]
        ucds = data["ucds"]
        descriptions = data["descriptions"]
        if args.description:
            ds.description = args.description
        else:
            if ds.description is None or args.overwrite:
                ds.description = data["description"]
        for column_name in ds.get_column_names(strings=True):
            if column_name not in descriptions:
                print(column_name, 'missing description')
            else:
                print('>>>', column_name, descriptions[column_name])
            if (args.overwrite or column_name not in ds.units) and column_name in units:
                ds.units[column_name] = astropy.units.Unit(units[column_name])
            if (args.overwrite or column_name not in ds.ucds) and column_name in ucds:
                ds.ucds[column_name] = ucds[column_name]
            if (args.overwrite or column_name not in ds.descriptions) and column_name in descriptions:
                ds.descriptions[column_name] = descriptions[column_name]
        ds.write_meta()
        print("updated meta data in %s" % args.output)

def fetch(self, force_download=False):
    self.download(force=force_download)
    return vx.open(self.filename)

def open(self):
    if len(self.filenames_vaex) != 1:
        return vx.open_many(self.filenames_vaex)
    return vx.open(self.filenames_vaex[0])

def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true', help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual', dest="virtual", action='store_true', default=False, help="also export virtual columns")
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0, help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None, help="filter to apply before exporting")
    subparsers = parser.add_subparsers(help='type of input source', dest="task")

    parser_soneira = subparsers.add_parser('soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_soneira.add_argument('--dimension', '-d', type=int, help='dimensions', default=4)
    # parser_soneira.add_argument('--eta', '-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level', '-m', type=int, help='dimensions', default=28)
    parser_soneira.add_argument('--lambdas', '-l', type=int, help='lambda values for fractal', default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser('tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_file = subparsers.add_parser('file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_csv = subparsers.add_parser('csv', help='use a csv file as source')
    parser_csv.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_csv.add_argument("output", help="output file (ends in .hdf5)")
    parser_csv.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    args = parser.parse_args(argv[1:])
    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level, vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2, args.max_level, args.lambdas)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            inputs = open(args.input[1:]).readlines()
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)

    if args.list:
        if not args.quiet:
            print("columns names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:
                for line in lines:
                    row_count += 1
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except ValueError:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.file.other.Hdf5MemoryMapped.create(args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value

                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = None
            if columns is None:
                columns = dataset.get_column_names(strings=True, virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True, virtual=True):
                    if not args.quiet:
                        print("column %r does not exist, run with --list or -l to list all columns" % column)
                    return 1
            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print("extension %s not supported, only .hdf5, .arrow and .fits are" % output_ext)
                return 1
            if not args.quiet:
                print("exporting %d rows and %d columns" % (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))
            progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True

            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".arrow":
                from vaex_arrow.export import export as export_arrow
                export_arrow(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
    if dataset:
        dataset.close_files()
    return 0

import vaex as vx
import sys
# import yappi
# vx.set_log_level_debug()

progressbar = True
fn = sys.argv[1]
print("opening", fn)
# dataset = vx.open_many([fn])
dataset = vx.open(fn)
# dataset.set_active_fraction(0.5)
expressions = tuple(sys.argv[2:])
if sys.argv[2] == "Alpha":
    dataset.add_virtual_columns_celestial("Alpha", "Delta", "l", "b")
    expressions = ("l", "b")
for key, value in dataset.virtual_columns.items():
    print(key, value)
print("subspace", expressions)
subspace = dataset(*expressions)
# print("calculate minmax")
# yappi.start()
limits = subspace.minmax(progressbar=progressbar)
# print("calculate histogram")
subspace.histogram(limits, progressbar=progressbar)
# yappi.get_func_stats().print_all()

def test_open():
    with pytest.raises(IOError):
        vaex.open('doesnotexist')

    csv1 = os.path.join(path, 'data', 'small2.csv')
    csv2 = os.path.join(path, 'data', 'small3.csv')
    h51 = os.path.join(path, 'data', 'small2.csv.hdf5')
    h52 = os.path.join(path, 'data', 'small3.csv.hdf5')
    target = os.path.join(path, 'data', 'small2.csv.hdf5')
    vaex.open(csv1, convert=True)
    assert os.path.exists(target)
    os.remove(target)

    target = os.path.join(path, 'data', 'small2.csv_and_1_more.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=True)
    assert os.path.exists(target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    vaex.open(os.path.join(path, 'data', 'small?.csv.hdf5'))
    os.remove(target)
    os.remove(h51)
    os.remove(h52)

    # convert can also be a path
    target = os.path.join(path, 'data', 'convert.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=target)
    assert os.path.exists(target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    vaex.open(os.path.join(path, 'data', 'small?.csv.hdf5'))
    os.remove(target)
    os.remove(h51)
    os.remove(h52)

    target = os.path.join('custom.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    assert os.path.exists(target)
    os.remove(target)
    os.remove(h51)
    os.remove(h52)