Example No. 1
	def test_export(self):
		path_hdf5 = tempfile.mktemp(".hdf5")
		path_hdf5_ui = tempfile.mktemp(".hdf5")
		path_fits = tempfile.mktemp(".fits")
		path_fits_ui = tempfile.mktemp(".fits")

		for dataset in [self.dataset]:
			self.app.dataset_selector.add(dataset)
			for fraction in [1, 0.5]:
				dataset.set_active_fraction(fraction)
				dataset.select("x > 3")
				length = len(dataset)
				# TODO: gui doesn't export virtual columns, add "z" to this list
				for column_names in [["x", "y"], ["x"], ["y"]]:
					for byteorder in "=<>":
						for shuffle in [False, True]:
							for selection in [False, True]:
								for export in [dataset.export_fits, dataset.export_hdf5] if byteorder == ">" else [dataset.export_hdf5]:
									type = "hdf5" if export == dataset.export_hdf5 else "fits"
									if shuffle and selection:
										continue # TODO: export should fail on this combination
									#print column_names, byteorder, shuffle, selection, type
									if export == dataset.export_hdf5:
										path = path_hdf5
										path_ui = path_hdf5_ui
										export(path, column_names=column_names, byteorder=byteorder, shuffle=shuffle, selection=selection)
									else:
										path = path_fits
										path_ui = path_fits_ui
										export(path, column_names=column_names, shuffle=shuffle, selection=selection)
									compare_direct = vx.open(path)

									dialogs.set_choose(1 if selection else 0).then("=<>".index(byteorder))
									# select columns
									dialogs.set_select_many(True, [name in column_names for name in dataset.get_column_names()])
									counter_confirm = CallCounter(return_value=shuffle)
									counter_info = CallCounter()
									dialogs.dialog_confirm = counter_confirm
									dialogs.dialog_info = counter_info
									dialogs.get_path_save = lambda *args: path_ui
									dialogs.ProgressExecution = dialogs.FakeProgressExecution
									import sys
									sys.stdout.flush()

									self.app.export(type=type)
									compare_ui = vx.open(path_ui)

									column_names = column_names or ["x", "y", "z"]
									self.assertEqual(compare_direct.get_column_names(), compare_ui.get_column_names())
									for column_name in column_names:
										values_ui = compare_ui.evaluate(column_name)
										values = compare_direct.evaluate(column_name)
										self.assertEqual(sorted(values), sorted(values_ui))
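Outside the GUI, the same export round-trip can be checked directly against vaex; a minimal sketch (the DataFrame and temporary path below are placeholders, not part of the test above):

import tempfile

import numpy as np
import vaex

# build a tiny DataFrame, export it to HDF5, and re-open the file for comparison
df = vaex.from_arrays(x=np.arange(10.), y=np.arange(10.) ** 2)
path = tempfile.mktemp(".hdf5")
df.export_hdf5(path)
df_read = vaex.open(path)
assert df_read.get_column_names() == ["x", "y"]
assert df_read.x.tolist() == df.x.tolist()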
Example No. 2
def stat_main(argv):
    parser = make_stat_parser(argv[0])
    args = parser.parse_args(argv[1:])
    import vaex
    dataset = vaex.open(args.dataset)
    if dataset is None:
        print("Cannot open input: %s" % args.dataset)
        sys.exit(1)
    print("dataset:")
    print("  length: %s" % len(dataset))
    print("  full_length: %s" % dataset.full_length())
    print("  name: %s" % dataset.name)
    print("  path: %s" % dataset.path)
    print("  columns: ")
    desc = dataset.description
    if desc:
        print("    description: %s" % desc)
    for name in dataset.get_column_names():
        print("   - %s: " % name)
        desc = dataset.descriptions.get(name)
        if desc:
            print("  \tdescription: %s" % desc)
        unit = dataset.unit(name)
        if unit:
            print("   \tunit: %s" % unit)
        dtype = dataset.dtype(name)
        print("   \ttype: %s" % dtype.name)
Example No. 3
 def fetch(self):
     ds = self.fetch_multi()
     if len(self.filenames) > 1:
         if not os.path.exists(self.filename_single):
             ds.export_hdf5(self.filename_single)
         ds = vx.open(self.filename_single)
     return ds
Example No. 4
def main(argv=sys.argv):
    dataset = vaex.open(argv[1])
    app = QtGui.QApplication(argv)
    table = VariablesTable(None)
    table.set_dataset(dataset)
    table.show()
    table.raise_()
    sys.exit(app.exec_())
Example No. 5
def test_open():
    path = 'tests/data/gassphere_littleendian.dat'
    path = 'tests/data/galaxy_littleendian.dat'
    #ds = vaex.file.other.MemoryMappedGadget(path)
    ds = vaex.open(path)
    assert ds is not None
    assert not np.isnan(ds.x.min())
    print(ds.x.minmax())
Example No. 6
def main(argv):
    global subspace, dataset, limits
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("-N", help="run each batch N times (default: %(default)s)", type=int, default=5)
    parser.add_argument("-f", "--fraction", help="fraction of dataset to use (default: %(default)s)", default=1., type=float)
    parser.add_argument("-b", "--buffer-size", default=vx.execution.buffer_size_default, help="buffer size per thread (default: %(default)s)", type=int)
    parser.add_argument("-r", "--repeat", default=5, help="repeat benchmark X times (default: %(default)s)", type=int)
    parser.add_argument("-c", "--cpu-count", default=vx.multithreading.thread_count_default, help="thread count/cpu count (default: %(default)s)", type=int)
    parser.add_argument("filename", help="filename of dataset to use")
    parser.add_argument("expressions", help="list of expressions to export (or all when empty)", nargs="*")

    args = parser.parse_args(argv[1:])
    vx.execution.buffer_size_default = args.buffer_size
    vx.multithreading.thread_count_default = args.cpu_count

    progressbar = False
    fn = args.filename
    # print(("opening", fn))
    dataset = vx.open(fn)
    dataset.set_active_fraction(args.fraction)
    # dataset = vx.open_many(fn)

    expressions = args.expressions
    # print "subspace", expressions
    subspace = dataset(*expressions)
    itemsize = dataset.columns[expressions[0]].dtype.itemsize
    byte_size = len(dataset) * len(expressions) * itemsize
    byte_size1 = len(dataset) * itemsize
    # sums = subspace.sum()

    limits = subspace.minmax()
    print(limits)

    N = args.N
    # print("benchmarking minmax")
    # expr = "subspace.minmax()"
    # expr = "dataset.count('{}')".format(expressions[0])
    # times = timeit.repeat(expr, setup="from vaex.benchmark import subspace, dataset, np", repeat=args.repeat, number=N)
    # print("minimum time", min(times) / N)
    # bandwidth = [byte_size1 / 1024.**3 / (time / N) for time in times]
    # print("%f GiB/s" % max(bandwidth))

    # speed = [len(dataset) / (time / N) / 1e9 for time in times]
    # print("%f billion rows/s " % max(speed))

    print()
    print("benchmarking histogram")
    expr = "dataset.count(binby=['{}', '{}'], limits=limits)".format(*expressions)
    times = timeit.repeat(expr, setup="from vaex.benchmark import subspace, dataset, np, limits", repeat=args.repeat, number=N)
    print("minimum time", min(times) / N)
    bandwidth = [byte_size / 1024.**3 / (time / N) for time in times]
    print("%f GiB/s" % max(bandwidth))

    speed = [len(dataset) / (time / N) / 1e9 for time in times]
    print("%f billion rows/s " % max(speed))
    print()
Example No. 7
def test_string_operations_from_mmap_file(tmpdir):
    # if we write the file to disk and mmap it read only, we trigger invalid memory writes
    # see https://github.com/vaexio/vaex/pull/459
    x = np.arange(5)
    y = np.array(['This', 'is', 'a', None, 'test'])
    df = vaex.from_arrays(x=x, y=y)
    filename = str(tmpdir / 'test.hdf5')
    df.export_hdf5(filename)
    df_from_file = vaex.open(filename)
    assert df_from_file.y.str.slice(start=0, stop=2).tolist() == ['Th', 'is', 'a', None, 'te']
    assert df_from_file.y.str.upper().tolist() == ['THIS', 'IS', 'A', None, 'TEST']
Example No. 8
def test_dtype_unicode_string(tmpdir):
    # CHANGE: before vaex v4 we worked with unicode, now we lazily cast to arrow
    x = np.arange(8, 12)
    s = np.array(list(map(str, x)), dtype='U')
    df = vaex.from_arrays(x=x, s=s)
    assert df.columns['s'].type == pa.string()
    path = str(tmpdir.join('test.arrow'))
    df.export(path)
    df_read = vaex.open(path, as_numpy=False)
    # the data type of x is different (arrow vs numpy)
    assert df_read.compare(df) == ([], [], ['x'], [])
Example No. 9
def main():
    logger.info(f'Starting...')

    logger.info(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    df = vaex.open(str(CSV_FILE_PATH), convert=str(HDF_FILE_PATH))
    logger.info(f'HDF5 Stored Size: {HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    df.export(str(ARROW_FILE_PATH))
    logger.info(f'ARROW Stored Size: {ARROW_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    logger.info(f'Finished!')
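Example No. 9 relies on module-level Path constants and a configured logger; a self-contained sketch of the same CSV → HDF5 → Arrow flow (the file names below are placeholders) could look like:

from pathlib import Path

import vaex

csv_path = Path('data.csv')      # placeholder input file
hdf5_path = Path('data.hdf5')    # written on first open via convert=
arrow_path = Path('data.arrow')  # final export target

# open the CSV, converting it to a memory-mappable HDF5 file on first use
df = vaex.open(str(csv_path), convert=str(hdf5_path))

# export the HDF5-backed DataFrame to Arrow
df.export(str(arrow_path))
print(f'Arrow file size: {arrow_path.stat().st_size / 1024 ** 3:.3f} GiB')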
Example No. 10
def df_parquet_cache(scope="session"):
    df = create_base_ds()
    df.drop('obj', inplace=True)
    df.drop('timedelta', inplace=True)
    df.drop('z', inplace=True)
    path = HERE / 'data' / 'unittest.parquet'
    pyarrow.parquet.write_table(df.to_arrow_table(), str(path), row_group_size=2)
    df = vaex.open(str(path))
    df.select('(x >= 0) & (x < 10)', name=vaex.dataframe.FILTER_SELECTION_NAME)
    df.add_virtual_column("z", "x+t*y")
    df.set_variable("t", 1.)
    return df
Example No. 11
def split(dir):
    data = vaex.open(dir, convert=True).to_pandas_df()

    cv = KFold(n_splits=5)
    i = 1
    for t, v in cv.split(data):
        print(f"{i}th split processing...")
        train = data.iloc[t]
        train.to_csv(f'./train_{i}.csv')
        validation = data.iloc[v]
        validation.to_csv(f'./validation_{i}.csv')
        i += 1
Example No. 12
def test_concat(df_file, tmpdir):
    path = tmpdir / 'test2.hdf5'
    df_file[['x']].export(path)
    df_concat = vaex.open(path)
    df = vaex.concat([df_file, df_concat])
    assert len(pickle.dumps(df)) < 2000
    df2 = pickle.loads(pickle.dumps(df))
    assert len(df) == len(df_file) * 2
    assert len(df2) == len(df_file) * 2
    # assert df.compare(df2) == ([], [], [], [])
    assert df2.x.count() == len(df_file) * 2, 'x is repeated'
    assert df2.x.sum() == df_file.x.sum() * 2, 'x is repeated'
    assert df2.y.sum() == df_file.y.sum(), 'y is not repeated'
Example No. 13
def test_cloud_dataset_masked(base_url, file_format, cache):
    # For now, caching of arrow & parquet is not supported
    kwargs = {}
    if file_format == 'csv':
        kwargs = dict(dtype={'x': 'Int64', 'y': 'Int64', 's': 'string'})
    df = vaex.open(f'{base_url}/testing/xys-masked.{file_format}?cache={cache}', fs_options=fs_options, **kwargs)
    assert df.x.tolist() == [1, None]
    assert df.y.tolist() == [None, 4]
    assert df.s.tolist() == ['5', None]

    assert df.x.count() == 1
    assert df.s.count() == 1
    assert df.x.sum() == 1
Example No. 14
def prepareData(filename):
    df = vaex.open(filename)
    Class = df.relevance.unique()
    # dictionary mapping each target class to an integer label
    Class_dict = dict(zip(Class, range(len(Class))))
    X = df.copy().drop(['relevance'])
    encoder = vaex.ml.OneHotEncoder(features=['relevance'])
    df = encoder.fit_transform(df)
    y = df[df.get_column_names(regex=r'relevance_.*')]
    train_x, test_x, train_y, test_y = train_test_split(X.to_pandas_df(), y.to_pandas_df(), \
                                                        test_size = 0.3, random_state = 5)
    return train_x, test_x, train_y, test_y, Class_dict
Example No. 15
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x * 2})
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1'] * 2).tolist() == (x * 2).tolist()
    assert (df['class']).tolist() == (x * 2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()
Example No. 16
    def setup(self, N):
        self.df = vaex.open(generate_numerical())[:N]
        self.df.categorize(self.df.i8_10, min_value=5, max_value=15, inplace=True)
        self.df.categorize(self.df.i4_10, min_value=5, max_value=15, inplace=True)
        self.df.categorize(self.df.i2_10, min_value=5, max_value=15, inplace=True)
        self.df.categorize(self.df.i1_10, min_value=5, max_value=15, inplace=True)

        self.df.categorize(self.df.i8_1K, min_value=5, max_value=1_000+5, inplace=True)
        self.df.categorize(self.df.i4_1K, min_value=5, max_value=1_000+5, inplace=True)
        self.df.categorize(self.df.i2_1K, min_value=5, max_value=1_000+5, inplace=True)
        # self.df.categorize(self.df.i1_1K, min_value=5, max_value=1_000+5)

        self.df.categorize(self.df.i8_1M, min_value=5, max_value=1_000_000+5, inplace=True)
        self.df.categorize(self.df.i4_1M, min_value=5, max_value=1_000_000+5, inplace=True)
Example No. 17
def open_main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', default=False, action='store_true', help="give extra output")
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")
    parser.add_argument('--dry-run', '-n', default=False, action='store_true', help="do not actually execute commands (like delete)")
    parser.add_argument('--delete', help="Delete file when reading fails", default=False, action='store_true')
    parser.add_argument("input", help="list of files to try to open", nargs="*")

    args = parser.parse_args(argv[1:])
    import vaex
    import vaex.file
    failed = False
    if args.verbose:
        print(f"Checking files {', '.join(args.input)}")
    for path in args.input:
        try:
            vaex.open(path)
        except BaseException as e:
            failed = True
            if not args.quiet:
                print(e)
            if args.delete:
                if not args.quiet:
                    print(f'rm {path}')
                if not args.dry_run:
                    try:
                        vaex.file.remove(path)
                    except FileNotFoundError:
                        pass
    if args.verbose:
        if failed:
            print("Oops, had issues opening some files")
        else:
            print("All files could be opened")
    return 123 if failed else 0
Example No. 18
def test_open_several_medium_csv_convert():
    csv_glob = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_*.csv'
    for path in glob.glob(csv_glob):
        os.remove(path + '.hdf5')
    os.remove(
        '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_0.csv_and_3_more.hdf5'
    )

    start = datetime.now()
    df = vaex.open(csv_glob, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.
          format(duration, df.length(),
                 int(df.length() / duration.total_seconds())))
    assert df.length() == 3_999_999
Example No. 19
def test_invalid_name_read(tmpdir):
    # earlier version of vaex could write invalid names, check if we can read those
    df = vaex.from_dict({'x': x})
    # df.columns['1'] = df.columns.pop('x')
    # df.column_names = ['1']
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)

    h5 = h5py.File(path, 'r+')
    h5['/table/columns']['1'] = h5['/table/columns']['x']
    del h5['/table/columns']['x']

    df = vaex.open(path)
    assert df['1'].tolist() == x.tolist()
    assert (df.copy()['1'] * 2).tolist() == (x * 2).tolist()
Example No. 20
def test_categorical(tmpdir):
    # based on https://github.com/vaexio/vaex/issues/399
    path = str(tmpdir.join('test.arrow'))
    table = pa.Table.from_pandas(pdf)

    with pa.OSFile(path, 'wb') as sink:
        with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
            writer.write_table(table)
    with pa.OSFile(path, 'rb') as source:
        pdf2 = pa.ipc.open_stream(source).read_pandas()


    df = vaex.open(path)
    assert df.col1.tolist() == ["DEF"]
    assert df.is_category(df.col1)
    assert df.category_labels(df.col1) == ['ABC', 'DEF']
Example No. 21
def load_titanic():
    '''
    Returns the classic Titanic dataset.

    Description of the columns can be found in dataset.description.

    Example:
    ========

    >>> import vaex.ml
    >>> df = vaex.ml.datasets.load_titanic()
    >>> print(df.description)
    >>> df.describe()
    '''
    dirname = os.path.dirname(__file__)
    return vaex.open(os.path.join(dirname, 'titanic.hdf5'))
Example No. 22
def test_partitioning_basics_hive():
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_hive',
                  ignore_errors=True)

    pq.write_to_dataset(table,
                        data_path / 'parquet_dataset_partitioned_hive',
                        partition_cols=['year', 'country'])
    ds = pa.dataset.dataset(data_path / 'parquet_dataset_partitioned_hive',
                            partitioning="hive")  #, format="parquet", )
    # import pdb; pdb.set_trace()
    df = vaex.open(data_path / 'parquet_dataset_partitioned_hive',
                   partitioning="hive")
    # import pdb; pdb.set_trace()
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)
Example No. 23
def doCalculate(filename):
    df = vaex.open(f'{filename}.arrow')
    startTime = int(time.time())
    # df = vaex.open('MSLR-WEB10K/Fold1/train.arrow')
    qidList = df.qid.unique()
    ndcgList = []
    for qid in qidList:
        df.select(df.qid == qid)
        true_relevance = df.evaluate(df.relevance, selection=True)
        if len(true_relevance) < 2:   
            ndcgList.append(1)  
            continue
        scores = np.asarray(range(0, len(true_relevance))[::-1])
        ndcg = ndcg_score(np.asarray([true_relevance]), np.asarray([scores]))
        ndcgList.append(ndcg)
    endTime = int(time.time())
    print(f'time used {endTime-startTime} sec')
    print(np.array(ndcgList).mean())
Example No. 24
def read_dataframe_from_file(path: str) -> Optional[vaex.dataframe.DataFrame]:
    """Only read dataframe present in data/processed.

    Args:
        path: path relative to data/processed.

    Returns:
        vaex dataframe.
    """
    path_to_processed_data = os.path.join(get_base_data_path(), "processed",
                                          path)
    file_path = f"{path_to_processed_data}.arrow"

    if not os.path.exists(file_path):
        raise OSError(f"File not found: {file_path}")

    return vaex.open(file_path)
Example No. 25
def test_open_two_big_csv_convert():
    big_and_biggest_csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1*.csv'
    os.remove(
        '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1_01.csv.hdf5'
    )
    os.remove(
        '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv.hdf5'
    )
    os.remove(
        '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv_and_1_more.hdf5'
    )

    start = datetime.now()
    df = vaex.open(big_and_biggest_csv, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.
          format(duration, df.length(),
                 int(df.length() / duration.total_seconds())))
Example No. 26
def _from_csv_convert_and_read(filename_or_buffer, path_output, chunk_size, fs_options, fs=None, copy_index=False, progress=None, **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so we cast it to truthy
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' % (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' % (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate file %s used to convert %s to single file: %s', df_path, csv_path, path_output)
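The helper above appears to be vaex's internal chunked CSV-conversion routine; the public entry point for the same behaviour is the convert/chunk_size path of vaex.from_csv (a hedged sketch, with 'sample.csv' as a placeholder):

import vaex

# convert a large CSV in chunks; convert=True writes an HDF5 file next to the
# input (typically sample.csv.hdf5) and returns the memory-mapped DataFrame
df = vaex.from_csv('sample.csv', convert=True, chunk_size=5_000_000)
print(df.head())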
Example No. 27
    def __init__(self, path2hdf, path2tdf):
        """
        Args:
            path2hdf (str): Path to the hdf5 stored timsTOF dataset.
            path2tdf (str): Path to the 'analysis.tdf' sqlite3 DB.
        """
        import vaex

        self.path2hdf = pathlib.Path(path2hdf)
        self.path2tdf = pathlib.Path(path2tdf)
        self.df = vaex.open(str(self.path2hdf))
        self.columns = tuple(self.df.columns)
        self.frames = self.table2df('frames')
        self.min_frame = self.frames.Id.min()
        self.max_frame = self.frames.Id.max()
        self.frames_no = self.max_frame-self.min_frame+1
        self._ms1_mask = self.frames.MsMsType.values == 0
        self.ms1_frames = self.frames.Id[self._ms1_mask].values
        self.retention_time = self.frames.Time
Example No. 28
def test_pick_file(tmpdir, file_extension):
    x = np.arange(N_rows, dtype='i8')
    df = vaex.from_arrays(x=x, x2=-x)
    df['y'] = df.x**2
    data = pickle.dumps(df)
    # if the data is in memory, pickle will be large
    assert len(data) > len(x) * x.itemsize
    xsum = df.x.sum()
    ysum = df.y.sum()

    # but on disk, it should just pickle the file path
    # TODO: arrow is not supported yet
    for ext in 'hdf5 parquet'.split():
        path = tmpdir / f'test.{ext}'
        df.export(path)
        df = vaex.open(path)
        data = pickle.dumps(df)
        assert len(data) < 1000
        assert df.x.sum() == xsum
        assert df.y.sum() == ysum
Example No. 29
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert df.get_column_names() == ['X!1', 'class']
    assert df.get_column_names(alias=False) != ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example No. 30
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(df['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(df['class']) != 'class', "keyword cannot be an expression"
    assert df.get_column_names() == ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example No. 31
def main(argv):

    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument("filename", help="filename for dataset", nargs='*')
    parser.add_argument("--address", help="address to bind the server to (default: %(default)s)", default="0.0.0.0")
    parser.add_argument("--port", help="port to listen on (default: %(default)s)", type=int, default=9000)
    parser.add_argument('--verbose', '-v', action='count', default=2)
    parser.add_argument('--cache', help="cache size in bytes for requests, set to zero to disable (default: %(default)s)", type=int, default=500000000)
    parser.add_argument('--compress', help="compress larger replies (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-compress', dest="compress", action='store_false')
    parser.add_argument('--development', default=False, action='store_true', help="enable development features (auto reloading)")
    parser.add_argument('--threads-per-job', default=4, type=int, help="threads per job (default: %(default)s)")
    # config = layeredconfig.LayeredConfig(defaults, env, layeredconfig.Commandline(parser=parser, commandline=argv[1:]))
    config = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[config.verbose])
    # import vaex
    # vaex.set_log_level_debug()
    from vaex.settings import webserver as settings

    # filenames = config.filenames
    filenames = []
    filenames = config.filename
    datasets = []
    for filename in filenames:
        ds = vx.open(filename)
        if ds is None:
            print("error opening file: %r" % filename)
        else:
            datasets.append(ds)
    datasets = datasets or [vx.example()]
    # datasets = [ds for ds in datasets if ds is not None]
    logger.info("datasets:")
    for dataset in datasets:
        logger.info("\thttp://%s:%d/%s or ws://%s:%d/%s", config.address, config.port, dataset.name, config.address, config.port, dataset.name)
    server = WebServer(datasets=datasets, address=config.address, port=config.port, cache_byte_size=config.cache,
                       compress=config.compress, development=config.development,
                       threads_per_job=config.threads_per_job)
    server.serve()
Example No. 32
def main(argv):

	parser = argparse.ArgumentParser(argv[0])
	parser.add_argument("filename", help="filename for dataset", nargs='*')
	parser.add_argument("--address", help="address to bind the server to (default: %(default)s)", default="0.0.0.0")
	parser.add_argument("--port", help="port to listen on (default: %(default)s)", type=int, default=9000)
	parser.add_argument('--verbose', '-v', action='count', default=2)
	parser.add_argument('--cache', help="cache size in bytes for requests, set to zero to disable (default: %(default)s)", type=int, default=500000000)
	parser.add_argument('--compress', help="compress larger replies (default: %(default)s)", default=True, action='store_true')
	parser.add_argument('--no-compress', dest="compress", action='store_false')
	parser.add_argument('--development', default=False, action='store_true', help="enable development features (auto reloading)")
	parser.add_argument('--threads-per-job', default=4, type=int, help="threads per job (default: %(default)s)")
	#config = layeredconfig.LayeredConfig(defaults, env, layeredconfig.Commandline(parser=parser, commandline=argv[1:]))
	config = parser.parse_args(argv[1:])

	verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
	logging.getLogger("vaex").setLevel(verbosity[config.verbose])
	#import vaex
	#vaex.set_log_level_debug()
	from vaex.settings import webserver as settings

	#filenames = config.filenames
	filenames = []
	filenames = config.filename
	datasets = []
	for filename in filenames:
		ds = vx.open(filename)
		if ds is None:
			print("error opening file: %r" % filename)
		else:
			datasets.append(ds)
	datasets = datasets or [vx.example()]
	#datasets = [ds for ds in datasets if ds is not None]
	logger.info("datasets:")
	for dataset in datasets:
		logger.info("\thttp://%s:%d/%s or ws://%s:%d/%s", config.address, config.port, dataset.name, config.address, config.port, dataset.name)
	server = WebServer(datasets=datasets, address=config.address, port=config.port, cache_byte_size=config.cache,
					   compress=config.compress, development=config.development,
					   threads_per_job=config.threads_per_job)
	server.serve()
Example No. 33
def load_iris():
    '''Load and return the iris dataset (classification).

    The iris dataset is a classic and very easy multi-class classification dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Example:
    ========

    >>> import vaex.ml
    >>> df = vaex.ml.datasets.load_iris()
    >>> df.describe()
    '''
    dirname = os.path.dirname(__file__)
    return vaex.open(os.path.join(dirname, 'iris.hdf5'))
Example No. 34
def binary_to_df(file: str, filePattern: str):
    """Convert any binary formats into vaex dataframe

    Args:
        file (str): Path to input file.
        filePattern (str): extension of file to convert.

    Returns:
        Vaex dataframe.

    """
    binary_patterns = [".*.fits", ".*.arrow", ".*.parquet", ".*.hdf5"]

    logger.info("binary_to_df: Scanning directory for binary file pattern... ")
    if filePattern in binary_patterns:
        # convert hdf5 to vaex df
        df = vaex.open(file)
        return df
    else:
        raise FileNotFoundError(
            "No supported binary file extensions were found in the directory. Please check file directory again."
        )
Example No. 35
def create_scats_ml_model():
    print("starting scats ml modeling")

    # load existing csv into vaex dataframe
    if not os.path.exists(finalScatsPath + ".hdf5"):
        vaex.from_csv(finalScatsPath, convert=True, copy_index=False, chunk_size=1_000_000)

    df = vaex.open(finalScatsPath + ".hdf5", shuffle=True)

    # transform the features into more machine learning friendly vars
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    print("dataWrangling done, ready to create model, time: {}s".format(duration()))

    # create a randomForestRegression model
    vaex_model = Predictor(
        features=df.get_column_names(regex="pca[\d]") + df.get_column_names(regex=".*_[xy]"),
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )

    # here we fit and train the model
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
        print("\n\nmodel created, time: {}s".format(duration()))

        dump(value=vaex_model, filename=model_out, compress=3)

    print("model written to output, time: {}s".format(duration()))
    return
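A hedged usage sketch for the saved model, continuing from the names in the function above (model_out, df) and assuming vaex.ml's Predictor exposes transform() and joblib's load():

from joblib import load

# reload the fitted Predictor and materialize its predictions; transform()
# adds the "p_avg_vol" column declared at training time, and expects the
# pca*/hour_*/dow_* feature columns created by the transformers above
vaex_model = load(model_out)
df = vaex_model.transform(df)
print(df[['avg_vol', 'p_avg_vol']].head())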
Example No. 36
def main():
    print(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )
    TMP_PATH.mkdir(parents=True, exist_ok=True)
    for i, chunk in enumerate(stream):
        print(f'Processing {i + 1}-th chunk containing "{len(chunk)}" rows of data...')
        df_chunk = vaex.from_pandas(chunk, copy_index=False)
        export_path = TMP_PATH / f'part_{i}.hdf5'
        df_chunk.export_hdf5(str(export_path))

    df = vaex.open(str(TMP_PATH / 'part*'))

    df.export_hdf5(str(COLUMNAR_HDF_FILE_PATH))
    print(f'HDF5 Stored Size: {COLUMNAR_HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    rmtree(TMP_PATH)
Example No. 37
def vaex_read(fn):
    import vaex

    df = vaex.open(fn)
    print(df.schema())
    print(df)
Example No. 38
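The benchmark excerpt below begins mid-script; a plausible preamble it relies on (the argument names nmax, n, partitions and the variable default_filename are inferred from usage and should be treated as assumptions, not the original code) might look like:

import argparse
import os
import sys
import time

import numpy as np
import vaex

argv = sys.argv
default_filename = 'string_benchmark.hdf5'  # placeholder file name

parser = argparse.ArgumentParser(argv[0])
parser.add_argument('--nmax', dest="nmax", type=float, default=8, help="log10 of the number of rows to generate")
parser.add_argument('--n', dest="n", type=float, default=7, help="log10 of the number of rows to benchmark with vaex")
parser.add_argument('--partitions', dest="partitions", type=int, default=8, help="number of executor partitions")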
parser.add_argument('--npandas', dest="npandas", type=float, default=7, help="log10 of the number of rows to use for pandas")
parser.add_argument('--filter', dest="filter", default=None, help="filter for benchmark")
parser.add_argument('--filename', default=default_filename, help='filename to use for benchmark export/reading')
args = parser.parse_args(argv[1:])

use_dask = False


if not os.path.exists(args.filename):
    x = np.arange(0, int(10**args.nmax))
    xs = x.astype(str)
    s = xs#vaex.string_column(xs)
    df_vaex = vaex.from_arrays(x=s, s=s)
    df_vaex.export(args.filename, progress=True, shuffle=True)

df = vaex.open(args.filename)
df_vaex = df[0:int(10**args.n)]
df_vaex.executor.buffer_size = len(df_vaex)//args.partitions
df_pandas = df[:int(10**args.npandas)].to_pandas_df()

if use_dask:
    df_dask = dd.from_pandas(df_pandas, npartitions=4)
timings = {}
def mytimeit(expr, N, scope):
    times = []
    for i in range(N):
        t0 = time.time()
        eval(expr, scope)
        times.append(time.time() - t0)
    return times
def test(name, expr):
Example No. 39
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--list', '-l', default=False, action='store_true', help="list columns of input")

    subparsers = parser.add_subparsers(help='type of subtask', dest="task")

    parser_export = subparsers.add_parser('export', help='read meta info')
    parser_export.add_argument("input", help="input dataset")
    parser_export.add_argument('output', help='output file (.yaml or .json)')
    parser_export.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_export.add_argument('--all', dest="all", action='store_true', default=False, help="Also export missing values (useful for having a template)")

    parser_import = subparsers.add_parser('import', help='read meta info')
    parser_import.add_argument('input', help='input meta file (.yaml or .json)')
    parser_import.add_argument("output", help="output dataset")
    parser_import.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_import.add_argument('--overwrite', help="overwrite existing entries", default=False, action='store_true')
    parser_import.add_argument('--description', help="overwrite description", default=None)

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])

    if args.task == "export":
        ds = vaex.open(args.input)
        column_names = ds.get_column_names(strings=True, virtual=True)
        if args.all:
            output_data = dict(description=ds.description,
                               descriptions={name: ds.descriptions.get(name, "") for name in column_names},
                               ucds={name: ds.ucds.get(name, "") for name in column_names},
                               units={name: str(ds.units.get(name, "")) for name in column_names},  # {name:str(unit) for name, unit in ds.units.items()},
                               )
        else:
            output_data = dict(description=ds.description,
                               descriptions=ds.descriptions,
                               ucds=ds.ucds,
                               units={name: str(unit) for name, unit in ds.units.items()},
                               )
        if args.output == "-":
            yaml.safe_dump(output_data, sys.stdout, default_flow_style=False)  # , encoding='utf-8',  allow_unicode=True)
        else:
            vaex.utils.write_json_or_yaml(args.output, output_data)
            print("wrote %s" % args.output)
    if args.task == "import":
        if args.input == "-":
            data = yaml.safe_load(sys.stdin)
        else:
            data = vaex.utils.read_json_or_yaml(args.input)

        ds = vaex.open(args.output)

        units = data["units"]
        ucds = data["ucds"]
        descriptions = data["descriptions"]
        if args.description:
            ds.description = args.description
        else:
            if ds.description is None or args.overwrite:
                ds.description = data["description"]
        for column_name in ds.get_column_names(strings=True):
            if column_name not in descriptions:
                print(column_name, 'missing description')
            else:
                print('>>>', column_name, descriptions[column_name])
            if (args.overwrite or column_name not in ds.units) and column_name in units:
                ds.units[column_name] = astropy.units.Unit(units[column_name])
            if (args.overwrite or column_name not in ds.ucds) and column_name in ucds:
                ds.ucds[column_name] = ucds[column_name]
            if (args.overwrite or column_name not in ds.descriptions) and column_name in descriptions:
                ds.descriptions[column_name] = descriptions[column_name]
        ds.write_meta()
        print("updated meta data in %s" % args.output)
Example No. 40
 def fetch(self, force_download=False):
     self.download(force=force_download)
     return vx.open(self.filename)
Example No. 41
 def open(self):
     return vx.open_many(self.filenames_vaex) if len(self.filenames_vaex) != 1 else vx.open(self.filenames_vaex[0])
Example No. 42
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true', help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual', dest="virtual", action='store_true', default=False, help="Also export virtual columns")
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0, help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None, help="filter to apply before exporting")

    subparsers = parser.add_subparsers(help='type of input source', dest="task")

    parser_soneira = subparsers.add_parser('soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_soneira.add_argument('--dimension', '-d', type=int, help='dimensions', default=4)
    # parser_soneira.add_argument('--eta','-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level', '-m', type=int, help='dimensions', default=28)
    parser_soneira.add_argument('--lambdas', '-l', type=int, help='lambda values for fractal', default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser('tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_file = subparsers.add_parser('file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_file = subparsers.add_parser('csv', help='use a csv file as source')
    parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level, vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2, args.max_level, args.lambdas)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        # dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            inputs = open(args.input[1:]).readlines()
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)
    if args.list:
        if not args.quiet:
            print("columns names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:
                for line in lines:
                    row_count += 1
                    # print line
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                # print line
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.file.other.Hdf5MemoryMapped.create(args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value
                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    # print line
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
                # print names
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = None
            if columns is None:
                columns = dataset.get_column_names(strings=True, virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True, virtual=True):
                    if not args.quiet:
                        print("column %r does not exist, run with --list or -l to list all columns" % column)
                    return 1

            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print("extension %s not supported, only .hdf5, .arrow and .fits are" % output_ext)
                return 1

            if not args.quiet:
                print("exporting %d rows and %d columns" % (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))
            progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True
            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".arrow":
                from vaex_arrow.export import export as export_arrow
                export_arrow(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            dataset.close_files()
    return 0
Example No. 43
import vaex as vx
import sys

#import yappi

#vx.set_log_level_debug()
progressbar = True
fn = sys.argv[1]
print "opening", fn
#dataset = vx.open_many([fn])
dataset = vx.open(fn)
#dataset.set_active_fraction(0.5)

expressions = tuple(sys.argv[2:])
if sys.argv[2] == "Alpha":
	dataset.add_virtual_columns_celestial("Alpha", "Delta", "l", "b")
	expressions = ("l", "b")
	for key, value in dataset.virtual_columns.items():
		print(key, value)
#dsa
print "subspace", expressions
subspace = dataset(*expressions)
#print "calculate minmax"
#yappi.start()
limits = subspace.minmax(progressbar=progressbar)
#print "calculate histogram"
subspace.histogram(limits, progressbar=progressbar)
#yappi.get_func_stats().print_all()

Example No. 44
def test_open():
    with pytest.raises(IOError):
        vaex.open('doesnotexist')


    csv1 = os.path.join(path, 'data', 'small2.csv')
    csv2 = os.path.join(path, 'data', 'small3.csv')
    h51 = os.path.join(path, 'data', 'small2.csv.hdf5')
    h52 = os.path.join(path, 'data', 'small3.csv.hdf5')
    target = os.path.join(path, 'data', 'small2.csv.hdf5')

    vaex.open(csv1, convert=True)
    assert os.path.exists(target)
    os.remove(target)

    target = os.path.join(path, 'data', 'small2.csv_and_1_more.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=True)
    assert os.path.exists(target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    vaex.open(os.path.join(path, 'data', 'small?.csv.hdf5'))
    os.remove(target)
    os.remove(h51)
    os.remove(h52)

    # convert can also be a path
    target = os.path.join(path, 'data', 'convert.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=target)
    assert os.path.exists(target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    vaex.open(os.path.join(path, 'data', 'small?.csv.hdf5'))
    os.remove(target)
    os.remove(h51)
    os.remove(h52)

    target = os.path.join('custom.hdf5')
    vaex.open(os.path.join(path, 'data', 'small*.csv'), convert=target)
    assert os.path.exists(h51)
    assert os.path.exists(h52)
    assert os.path.exists(target)
    os.remove(target)
    os.remove(h51)
    os.remove(h52)