Example #1
def test_string():
    df = vaex.from_dict({"A": ["a", None, "cdef", "", "g"]})
    col = df.__dataframe__().get_column_by_name("A")

    assert col._col.tolist() == df.A.tolist()
    assert col.size == 5
    assert col.null_count == 1
    assert col.dtype[0] == _DtypeKind.STRING
    assert col.describe_null == (3, 0)

    df2 = _from_dataframe_to_vaex(df.__dataframe__())
    assert df2.A.tolist() == df.A.tolist()
    assert df2.__dataframe__().get_column_by_name("A").null_count == 1
    assert df2.__dataframe__().get_column_by_name("A").describe_null == (3, 0)
    assert df2.__dataframe__().get_column_by_name(
        "A").dtype[0] == _DtypeKind.STRING

    df_sliced = df[1:]
    col = df_sliced.__dataframe__().get_column_by_name("A")
    assert col.size == 4
    assert col.null_count == 1
    assert col.dtype[0] == _DtypeKind.STRING
    assert col.describe_null == (3, 0)

    df2 = _from_dataframe_to_vaex(df_sliced.__dataframe__())
    assert df2.A.tolist() == df_sliced.A.tolist()
    assert df2.__dataframe__().get_column_by_name("A").null_count == 1
    assert df2.__dataframe__().get_column_by_name("A").describe_null == (3, 0)
    assert df2.__dataframe__().get_column_by_name(
        "A").dtype[0] == _DtypeKind.STRING
Example #2
File: common.py Project: stjordanis/vaex
def create(**arrays):
    def try_convert(ar):
        try:
            return array_factory_arrow_chunked(ar)
        except Exception:
            return ar
    return vaex.from_dict({k: try_convert(v) for k, v in arrays.items()})
Example #3
File: common.py Project: stjordanis/vaex
def create(**arrays):
    def try_convert(ar):
        try:
            return pa.array(ar)
        except Exception:
            return ar
    return vaex.from_dict({k: try_convert(v) for k, v in arrays.items()})
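Examples #2 and #3 share the same fallback pattern: try to coerce each keyword argument to an Arrow array, and keep the raw value if conversion fails. A minimal usage sketch (the column names here are made up):

    df = create(x=[1, 2, 3], s=['a', None, 'c'])
    assert df.x.tolist() == [1, 2, 3]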
Example #4
def test_apply_with_invalid_identifier():
    df = vaex.from_dict({"#": [1], "with space": [2]})

    def add(a, b):
        return a + b

    assert df.apply(add, arguments=[df["#"], df["with space"]]).tolist() == [3]
Example #5
def from_records(records: List[Dict],
                 array_type="arrow",
                 defaults={}) -> vaex.dataframe.DataFrame:
    '''Create a dataframe from a list of dicts.

    .. warning:: This is for convenience only; for performance, pass arrays to e.g. :func:`from_arrays` instead.

    :param str array_type: {array_type}
    :param dict defaults: default values if a record has a missing entry
    '''
    arrays = dict()
    for i, record in enumerate(records):
        for name, value in record.items():
            if name not in arrays:
                # backfill earlier records that lacked this key with the default
                arrays[name] = [defaults.get(name)] * i
            arrays[name].append(value)
        for name in arrays:
            if name not in record:
                # fill an entry missing from this record with its default
                arrays[name].append(defaults.get(name))
    arrays = {
        k: vaex.array_types.convert(v, array_type)
        for k, v in arrays.items()
    }
    return vaex.from_dict(arrays)
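As a sketch of the defaults behavior (hypothetical records): a record that omits a key picks up the value from defaults, so

    records = [{'a': 1, 'b': 2}, {'a': 3}]
    df = from_records(records, defaults={'b': 0})
    assert df['b'].tolist() == [2, 0]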
Example #6
def test_combined_grouper_over64bit():
    bits = [15, 16, 17] * 2
    assert sum(bits) > 64
    N = 2**max(bits)

    def unique_ints(offset, bit):
        # create 2**bits unique ints
        ar = np.full(N, offset, dtype='int32')
        n = 2**bit
        ar[:n] = np.arange(offset, offset + n)
        return ar

    arrays = {f'x_{i}': unique_ints(i, bit) for i, bit in enumerate(bits)}
    names = list(arrays)
    df = vaex.from_dict(arrays)
    grouper = df.groupby(names)
    dfg = grouper.agg('count')
    for i, bit in enumerate(bits):
        xi = dfg[f'x_{i}'].to_numpy()
        assert len(xi) == N
        xiu = np.unique(xi)
        Ni = 2**bits[i]
        assert len(xiu) == Ni
    assert dfg['count'].sum() == N
    with pytest.raises(vaex.RowLimitException, match='.* >= 2 .*'):
        df.groupby(names, row_limit=2)
    with pytest.raises(vaex.RowLimitException):
        df.groupby([names[0]], row_limit=2**bits[0] - 1)
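A quick sanity check on why this test exercises the over-64-bit path (a sketch, not part of the test): a dense combined key would need one bit per grouper bit, and

    assert sum([15, 16, 17] * 2) == 96    # combined key would need 96 bits
    assert 2**96 > 2**63 - 1              # which no int64 can index

forcing the sparse/combined grouper shown in examples #7, #18 and #19.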
Example #7
File: groupby.py Project: t-triobox/vaex
    def process(_ignore):
        logger.info(f"extracting indices of parent groupers ({self.N:,} unique rows)")
        df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
        df['index_0'] = df['bin_value'] // multipliers[0]
        df['leftover_0'] = df['bin_value'] % multipliers[0]
        for i in range(1, len(multipliers)):
            df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
            df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
        columns = [f'index_{i}' for i in range(len(multipliers))]
        indices_parents = df.evaluate(columns, progress=progressbar)
        def compress(ar):
            # shrink integer indices to the smallest dtype that can hold them
            if vaex.dtype_of(ar).kind == 'i':
                ar = vaex.array_types.to_numpy(ar)
                max_value = ar.max()
                ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
            return ar
        indices_parents = [compress(ar) for ar in indices_parents]
        bin_values = {}
        logger.info("extracting labels of parent groupers...")
        # NOTE: we can also use dict encoding instead of take
        for indices, parent in zip(indices_parents, parents):
            if sort:
                assert parent.pre_sort, "cannot sort while parent not presorted"
                assert parent.sort_indices is None
            dtype = vaex.dtype_of(parent.bin_values)
            if dtype.is_struct:
                # collapse parent struct into our flat struct
                for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                    bin_values[field.name] = ar.take(indices)
                    # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
            else:
                bin_values[parent.label] = parent.bin_values.take(indices)
                # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
        logger.info("extracting labels of parent groupers done")
        return pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
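The // and % chain above is a mixed-radix decomposition: each combined bin_value encodes one index per parent grouper, with multipliers[-1] == 1. A hand-worked sketch with made-up multipliers:

    # two parents, the first spanning 3 'slots' per index: multipliers = [3, 1]
    multipliers = [3, 1]
    bin_value = 7
    index_0 = bin_value // multipliers[0]    # 7 // 3 == 2
    leftover_0 = bin_value % multipliers[0]  # 7 % 3 == 1
    index_1 = leftover_0 // multipliers[1]   # 1 // 1 == 1
    # combined value 7 decodes to parent indices (2, 1)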
Example #8
    def agg(self, actions):
        # TODO: this basically forms a cartesian product, we can do better, use a
        # 'multistage' hashmap
        arrays = super(GroupBy, self)._agg(actions)
        # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
        counts = self.counts
        if counts is None:  # nobody wanted to know count*, but we need it
            count_agg = vaex.agg.count(edges=True)
            counts = self.df._agg(count_agg, self.grid, delay=_USE_DELAY)
        self.df.execute()
        if _USE_DELAY:
            arrays = {key: value.get() for key, value in arrays.items()}
            counts = counts.get()
        # take out the edges
        arrays = {
            key: vaex.utils.extract_central_part(value)
            for key, value in arrays.items()
        }
        counts = vaex.utils.extract_central_part(counts)
        mask = counts > 0
        coords = [
            coord[mask] for coord in np.meshgrid(*self.coords1d, indexing='ij')
        ]
        labels = {
            str(by.expression): coord
            for by, coord in zip(self.by, coords)
        }
        df_grouped = vaex.from_dict(labels)
        for key, value in arrays.items():
            df_grouped[key] = value[mask]
        return df_grouped
Example #9
def update_flow_figures(days, hours, zone):
    logger.info(
        'Figure: update sankey and sunburst for days=%r hours=%r zone=%r',
        days, hours, zone)
    flow_data = compute_flow_data(days, hours, zone)
    df_outflow_top = vaex.from_dict(flow_data['outflow_top'])
    df_outflow_rest = vaex.from_dict(flow_data['outflow_rest'])
    df_outflow_borough = vaex.from_dict(flow_data['outflow_borough'])

    pickup_zone = zone
    fig_sankey = create_figure_sankey(df_outflow_top, df_outflow_rest,
                                      df_outflow_borough, pickup_zone)
    fig_sunburst = create_figure_sunburst(df_outflow_top, df_outflow_rest,
                                          df_outflow_borough, pickup_zone)
    table_records, table_style = create_table_data(df_outflow_top)

    return fig_sankey, fig_sunburst, table_records, table_style, 'trigger loader'
Example #10
    def _append_listed_dict_to_df(self, data, check_unique=True):
        """Append pre-processed dict to self._df.

        Args:
            data (dict): data to add
            check_unique (bool): if True, it will be checked that the data is unique in the db

        """
        if self._df is None:
            self._df = vaex.from_dict(data)
        else:
            if check_unique:
                # TODO: support unique-check for multiple items
                df_uuid = data['uuid_in_df'][0]
                if len(self.df[self.df.uuid_in_df.str.equals(df_uuid)]) > 0:
                    logging.warning('Given data already exist in dataframe: {}'.format(df_uuid))
                    return
            self.df = self.df.concat(vaex.from_dict(data))
Example #11
class MlTest(TestCase):
    model_df = vaex.from_dict({
        "route_id": array(["60-155-d12-1"], dtype=object),
        "start_date": array([20210302]),
        "start_time": array(["19:20:00"], dtype=object),
        "stop_sequence": array([24]),
        "arrival": array([5.0]),
        "timestamp": array(["2021-03-02 19:51:26"], dtype=object),
        "stop_id": array(["8220DB000264"], dtype=object),
        "arrival_time": array(["19:38:38"], dtype=object),
        "shape_dist_traveled": array([7818.16]),
        "direction": array(["0"], dtype=object),
        "lat": array([53.3535353]),
        "lon": array([-6.26225863]),
        "direction_angle": array([139.31470635]),
        "shape_dist_between": array([518.6]),
        "arr_dow": array([1]),
        "arr_hour": array([19]),
        "arrival_mean": array([6.0]),
        "p_mean_vol": array([68.53864425]),
    })

    def test_files(self):
        for p in [gtfsr_historical_means_path, stop_time_data_path, gtfsr_model_path]:
            self.assertTrue(os.path.exists(p))

    def test_model(self):
        self.model_df.state_load(gtfsr_model_path)

        pred_val = self.model_df[["p_arrival_lgbm"]][0][0]
        self.assertTrue(pred_val)
Example #12
def test_ipython_autocompletion(ds_local):
    df = vaex.from_dict({
        'First name': ['Reggie', 'Tamika'],
        'Last name': ['Miller', 'Catchings'],
        '$amount': [10, 20]
    })

    completions = df._ipython_key_completions_()
    assert 'First name' in completions
    assert 'Last name' in completions
    assert '$amount' in completions
    assert 'Team' not in completions
Example #13
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x * 2})
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1'] * 2).tolist() == (x * 2).tolist()
    assert (df['class']).tolist() == (x * 2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1'] * 2).tolist() == (x * 2).tolist()
Example #14
def _test_df_to_vaex():
    """Convert pandas dataframe to vaex."""
    def _serialize(element):
        if isinstance(element, list):
            return ':'.join(element)
        return element

    with open("test/small_dflist.pkl", "rb") as f:
        df_dict = pickle.load(f)

    # load Pandas DataFrame and Serialize
    content_ = df_dict["content_df"].applymap(_serialize).to_dict('list')
    file_ = df_dict["file_df"].applymap(_serialize).to_dict('list')
    record_id_ = df_dict["record_id_df"].applymap(_serialize).to_dict('list')

    # Create Vaex DataFrame
    content_df = vaex.from_dict(content_)
    file_df = vaex.from_dict(file_)
    record_id_df = vaex.from_dict(record_id_)

    # Export as .arrow
    content_df.export('test/content_df.arrow')
    file_df.export('test/file_df.arrow')
    record_id_df.export('test/record_id_df.arrow')
Example #15
def test_invalid_name_read(tmpdir):
    # earlier version of vaex could write invalid names, check if we can read those
    df = vaex.from_dict({'x': x})
    # df.columns['1'] = df.columns.pop('x')
    # df.column_names = ['1']
    path = str(tmpdir.join('test.hdf5'))
    df.export(path)

    h5 = h5py.File(path, 'r+')
    h5['/table/columns']['1'] = h5['/table/columns']['x']
    del h5['/table/columns']['x']
    h5.close()

    df = vaex.open(path)
    assert df['1'].tolist() == x.tolist()
    assert (df.copy()['1'] * 2).tolist() == (x * 2).tolist()
Example #16
def test_groupby_datetime():
    data = {'z': [2, 4, 8, 10],
            't': [np.datetime64('2020-01-01'),
                  np.datetime64('2020-01-01'),
                  np.datetime64('2020-02-01'),
                  np.datetime64('2020-02-01')]
            }

    df = vaex.from_dict(data)
    dfg = df.groupby(by='t', sort=True).agg({'z': 'mean'})

    assert dfg.column_count() == 2
    assert dfg.z.tolist() == [3, 9]
    assert dfg.t.dtype.is_datetime
    assert set(dfg.t.tolist()) == {datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)}
Example #17
def test_non_identifiers():
    df = vaex.from_dict({'x': [1], 'y': [2], '#':[1]})
    df['z'] = df['#'] + 1
    assert df['z'].variables() == {'#'}
    assert df._virtual_expressions['z'].variables() == {'#'}

    df['1'] = df.x * df.y
    df['2'] = df['1'] + df.x
    assert df['1'].variables(ourself=True) == {'x', 'y', '1'}
    assert df['1'].variables() == {'x', 'y'}
    assert df['2'].variables(ourself=True) == {'x', 'y', '2', '1'}
    assert df['2'].variables(include_virtual=False) == {'x', 'y'}

    df['valid'] = df['2']
    assert df['valid'].variables(ourself=True) == {'x', 'y', '2', '1', 'valid'}
    assert df['valid'].variables(include_virtual=False) == {'x', 'y'}
Example #18
    def __init__(self,
                 expression,
                 df,
                 multipliers,
                 parents,
                 sort,
                 row_limit=None):
        '''Will group by one expression, which is built up from multiple expressions.

        Used in the sparse/combined group by.
        '''
        super().__init__(expression, df, sort=sort, row_limit=row_limit)
        assert len(multipliers) == len(parents)

        assert multipliers[-1] == 1
        self.df = df
        self.label = 'SHOULD_NOT_BE_USED'
        self.expression = expression
        # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
        # self.bin_values
        df = vaex.from_dict({
            'row': vaex.vrange(0, self.N, dtype='i8'),
            'bin_value': self.bin_values
        })
        df['index_0'] = df['bin_value'] // multipliers[0]
        df['leftover_0'] = df['bin_value'] % multipliers[0]
        for i in range(1, len(multipliers)):
            df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
            df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
        columns = [f'index_{i}' for i in range(len(multipliers))]
        indices_parents = df.evaluate(columns)
        bin_values = {}
        for indices, parent in zip(indices_parents, parents):
            dtype = vaex.dtype_of(parent.bin_values)
            if dtype.is_struct:
                # collapse parent struct into our flat struct
                for field, ar in zip(parent.bin_values.type,
                                     parent.bin_values.flatten()):
                    bin_values[field.name] = ar.take(indices)
            else:
                bin_values[parent.label] = parent.bin_values.take(indices)
        self.bin_values = pa.StructArray.from_arrays(bin_values.values(),
                                                     bin_values.keys())
Example #19
File: groupby.py Project: sthagen/vaex
    def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
        '''Will group by one expression, which is built up from multiple expressions.

        Used in the sparse/combined group by.
        '''
        super().__init__(expression, df, sort=sort, row_limit=row_limit)
        assert len(multipliers) == len(parents)

        assert multipliers[-1] == 1
        self.df = df
        self.label = 'SHOULD_NOT_BE_USED'
        self.expression = expression
        # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
        # self.bin_values
        df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
        df['index_0'] = df['bin_value'] // multipliers[0]
        df['leftover_0'] = df['bin_value'] % multipliers[0]
        for i in range(1, len(multipliers)):
            df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
            df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
        columns = [f'index_{i}' for i in range(len(multipliers))]
        indices_parents = df.evaluate(columns)
        def compress(ar):
            # shrink integer indices to the smallest dtype that can hold them
            if vaex.dtype_of(ar).kind == 'i':
                ar = vaex.array_types.to_numpy(ar)
                max_value = ar.max()
                ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
            return ar
        indices_parents = [compress(ar) for ar in indices_parents]
        bin_values = {}
        # NOTE: we can also use dict encoding instead of take
        for indices, parent in zip(indices_parents, parents):
            dtype = vaex.dtype_of(parent.bin_values)
            if dtype.is_struct:
                # collapse parent struct into our flat struct
                for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                    bin_values[field.name] = ar.take(indices)
                    # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
            else:
                bin_values[parent.label] = parent.bin_values.take(indices)
                # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
        self.bin_values = pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
Example #20
def _from_dataframe_to_vaex(df: DataFrameObject) -> vaex.dataframe.DataFrame:
    """
    Note: we need to implement/test support for bit/byte masks, chunk handling, etc.
    """
    # Iterate through the chunks
    dataframe = []
    _buffers = []
    for chunk in df.get_chunks():

        # We need a dict of columns here, with each column being an expression.
        columns = dict()
        _k = _DtypeKind
        _buffers_chunks = []  # hold on to buffers, keeps memory alive
        for name in chunk.column_names():
            if not isinstance(name, str):
                raise ValueError(f"Column {name} is not a string")
            if name in columns:
                raise ValueError(f"Column {name} is not unique")

            col = chunk.get_column_by_name(name)
            if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
                # Simple numerical or bool dtype; turn into a numpy array
                columns[name], _buf = convert_column_to_ndarray(col)
            elif col.dtype[0] == _k.CATEGORICAL:
                columns[name], _buf = convert_categorical_column(col)
            elif col.dtype[0] == _k.STRING:
                columns[name], _buf = convert_string_column(col)
            else:
                raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")

            _buffers_chunks.append(_buf)

        dataframe.append(vaex.from_dict(columns))
        # chunk buffers are added to list of all buffers
        _buffers.append(_buffers_chunks)

    if df.num_chunks() == 1:
        _buffers = _buffers[0]

    df_new = vaex.concat(dataframe)
    df_new._buffers = _buffers
    return df_new
Example #21
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert str(df['X!1']) != 'X!1', "invalid identifier cannot be an expression"
    assert str(df['class']) != 'class', "keyword cannot be an expression"
    assert df.get_column_names() == ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example #22
def test_add_invalid_name(tmpdir):
    # support invalid names and keywords
    df = vaex.from_dict({'X!1': x, 'class': x*2})
    assert df.get_column_names() == ['X!1', 'class']
    assert df.get_column_names(alias=False) != ['X!1', 'class']
    assert df['X!1'].tolist() == x.tolist()
    assert (df['X!1']*2).tolist() == (x*2).tolist()
    assert (df['class']).tolist() == (x*2).tolist()
    assert 'X!1' in df._column_aliases
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()

    path = str(tmpdir.join('test.hdf5'))
    df.export(path)
    df = vaex.open(path)
    assert df['X!1'].tolist() == x.tolist()
    assert (df.copy()['X!1']*2).tolist() == (x*2).tolist()
    assert (df[['X!1']]['X!1']*2).tolist() == (x*2).tolist()

    df_concat = vaex.concat([df, df])
    assert (df_concat[['X!1']]['X!1']*2).tolist() == ((x*2).tolist() + (x*2).tolist())
Example #23
    def predict(self, instances, **kwargs):

        if isinstance(instances[0], list):
            data = np.asarray(instances).T
            df = vaex.from_arrays(Arrival_Time=data[0],
                                  Creation_Time=data[1],
                                  x=data[2],
                                  y=data[3],
                                  z=data[4])

        elif isinstance(instances[0], dict):
            dfs = []
            for instance in instances:
                df = vaex.from_dict(instance)
                dfs.append(df)
            df = vaex.concat(dfs)

        else:
            return ['invalid input format']

        df.state_set(self.state, set_filter=False)
        return df.pred_name.tolist()
Example #24
def predict(data: Data):
    instances = data.instances

    if isinstance(instances[0], list):
        data = np.asarray(instances).T
        df = vaex.from_arrays(Arrival_Time=data[0],
                              Creation_Time=data[1],
                              x=data[2],
                              y=data[3],
                              z=data[4])

    elif isinstance(instances[0], dict):
        dfs = []
        for instance in instances:
            df = vaex.from_dict(instance)
            dfs.append(df)
        df = vaex.concat(dfs)

    else:
        return {'predictions': 'invalid input format'}

    df.state_set(global_items['state'], set_filter=False)
    return {'predictions': df.pred_name.tolist()}
Example #25
    def agg(self, actions):
        # TODO: this basically forms a cartesian product, we can do better, use a
        # 'multistage' hashmap
        arrays = super(GroupBy, self)._agg(actions)
        # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
        count_agg = vaex.agg.count()
        counts = self.df._agg(count_agg, self.grid, delay=_USE_DELAY)
        self.df.execute()
        if _USE_DELAY:
            arrays = {key: value.get() for key, value in arrays.items()}
            counts = counts.get()
        # take out the edges
        arrays = {key: vaex.utils.extract_central_part(value) for key, value in arrays.items()}
        counts = vaex.utils.extract_central_part(counts)
        mask = counts > 0
        coords = [coord[mask] for coord in np.meshgrid(*self.coords1d, indexing='ij')]
        labels = {str(by.expression): coord for by, coord in zip(self.by, coords)}
        df_grouped = vaex.from_dict(labels)
        for key, value in arrays.items():
            df_grouped[key] = value[mask]
        return df_grouped
Example #26
def test_from_dict():
    data = {'A': [1, 2, 3], 'B': ['a', 'b', 'c']}
    ds = vaex.from_dict(data)
    assert 'A' in ds.get_column_names()
    assert ds['A'].values[0] == 1
    assert ds['B'].values[2] == 'c'
Example #27
def vaex_vertices_from_plyfile(filename):
    """Load vertices from plyfile and return as vaex DataFrame."""
    xyz = vertex_dict_from_plyfile(filename)
    return vx.from_dict(xyz)
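vertex_dict_from_plyfile is not shown above; a plausible sketch (an assumption, not the project's actual helper) using the plyfile package:

    from plyfile import PlyData

    def vertex_dict_from_plyfile(filename):
        # read the 'vertex' element and expose its coordinates as a dict of 1-D arrays
        vertex = PlyData.read(filename)['vertex']
        return {name: vertex[name] for name in ('x', 'y', 'z')}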
Example #28
def make_prediction(data):
    st_df = MlConfig.st_df  # stop_time_data
    hm_df = MlConfig.hm_df  # historical means dataset
    model = MlConfig.state_model  # GTFSR vaex model state

    empty = ("", "")

    if not "start_time" in data or not "start_date" in data:
        return empty

    formatted_data = {
        "route_id": [str(data["route_id"])],
        "direction": [int(data["direction"])],
        "stop_sequence": [int(data["stop_sequence"])],
        "stop_id": [str(data["stop_id"])],
        "start_time": [str(data["start_time"])],
        "start_date": [int(data["start_date"])],
        "timestamp": [str(data["timestamp"])],
        "arrival": [int(data["arrival"] / 60)],
    }

    live_df = vaex.from_dict(formatted_data)

    live_df["arr_dow"] = live_df.start_date.apply(
        lambda d: get_dt(d, "%Y%m%d").weekday())
    live_df.materialize("arr_dow", inplace=True)

    # print(live_df.dtypes, "\n", st_df.dtypes, "\n", hm_df.dtypes, "\n")

    temp_df = st_df[
        (st_df["route_id"] == live_df[["route_id"]][0][0])
        & (st_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])
        & (st_df["stop_id"] == live_df[["stop_id"]][0][0])
        & (st_df["start_time"] == live_df[["start_time"]][0][0])
        & (st_df["direction"] == live_df[["direction"]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    # join stop time data, filtering improves speed by only copying relevant rows
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    live_df = vaex_mjoin(live_df,
                         temp_df,
                         cols,
                         cols,
                         how="inner",
                         allow_duplication=True)

    live_df["keep_trip"] = live_df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").
        split(",")[dow],
        ["service_days", "arr_dow"],
    )
    live_df = live_df[live_df.keep_trip == "True"]
    live_df.drop(["service_days", "keep_trip"], inplace=True)

    if len(live_df) < 1:
        return empty

    live_df["arr_hour"] = live_df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").hour)
    live_df.materialize("arr_hour", inplace=True)

    # join the historical means to our dataset
    temp_df = hm_df[(hm_df["route_id"] == data["route_id"])
                    & (hm_df["stop_id"] == data["stop_id"])
                    & (hm_df["arr_dow"] == live_df[["arr_dow"]][0][0])
                    & (hm_df["arr_hour"] == live_df[["arr_hour"]][0][0])
                    & (hm_df["direction"] == int(data["direction"]))
                    & (hm_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    cols = [
        "route_id", "stop_id", "arr_dow", "arr_hour", "direction",
        "stop_sequence"
    ]
    live_df = vaex_mjoin(
        live_df,
        temp_df,
        cols,
        cols,
        how="inner",
    )

    if len(live_df) < 1:
        return empty

    # assert same type
    live_df["direction"] = live_df["direction"].astype("int64")
    live_df["shape_dist_traveled"] = live_df["shape_dist_traveled"].astype(
        "float64")
    live_df["lat"] = live_df["lat"].astype("float64")
    live_df["lon"] = live_df["lon"].astype("float64")
    live_df["direction_angle"] = live_df["direction_angle"].astype("float64")
    live_df["shape_dist_between"] = live_df["shape_dist_between"].astype(
        "float64")

    # materialize virtual columns to match the model state
    for col in live_df.get_column_names():
        if col not in live_df.get_column_names(virtual=False):
            live_df.materialize(col, inplace=True)
    try:
        live_df.state_set(model)

        if len(live_df) > 0:
            return (round(live_df[["p_arrival_lgbm"]][0][0]) *
                    60), live_df[["p_arrival_lgbm"]][0][0]
    except Exception:
        return empty
    return empty
Example #29
def test_unicode_names():
    x = np.arange(10)
    df = vaex.from_dict({'远': x})
    assert df.远.tolist() == x.tolist()
Example #30
def test_not_hide_invalid_name():
    x = np.arange(10)
    df = vaex.from_dict({'./bla': x})
    assert len(df.get_column_names()) == 1
    assert df['./bla'].tolist() == x.tolist()
Example #31
def test_hdf5_with_alias(tmpdir):
    df = vaex.from_dict({'X-1': [1], '#': [2]})
    path = DATA_PATH / 'with_alias.hdf5'
    df = vaex.open(str(path))
    assert df['X-1'].tolist() == [1]
    assert df['#'].tolist() == [2]
Example #32
def test_random_projections(n_components, matrix_type):
    df = vaex.from_dict(data=data_maker(n_rows=100_000, n_cols=31))