示例#1
0
 def out_dshape(self, input_dshape):
     """Return a record datashape with one field per category.

     The categories are obtained from ``self.categorizer`` for
     ``input_dshape``; every field shares the output datashape that
     ``self.reduction`` reports for the same input.
     """
     categories = self.categorizer.categories(input_dshape)
     reduced = self.reduction.out_dshape(input_dshape)
     fields = [(category, reduced) for category in categories]
     return dshape(Record(fields))
示例#2
0
 def schema(self):
     """Return the child's schema restricted to ``self.fields``.

     Field types are looked up by name in the ``dict`` of the first
     element of the child's schema, in the order given by ``self.fields``.
     """
     child_types = self._child.schema[0].dict
     selected = [(field, child_types[field]) for field in self.fields]
     return DataShape(Record(selected))
示例#3
0
 def schema(self):
     """Return the child's record schema with fields renamed.

     ``self.labels`` is turned into an old-name -> new-name mapping; fields
     without an entry keep their current name.
     """
     renames = dict(self.labels)
     child_fields = self._child.dshape.measure.parameters[0]
     relabelled = [[renames.get(old, old), dtype]
                   for old, dtype in child_fields]
     return DataShape(Record(relabelled))
示例#4
0
 def out_dshape(self, in_dshape):
     """Return a record datashape pairing each key with its value's output.

     ``self.keys`` and ``self.values`` are walked in lockstep; each value
     contributes ``value.out_dshape(in_dshape)`` as the field type.
     """
     fields = []
     for key, value in zip(self.keys, self.values):
         fields.append((key, value.out_dshape(in_dshape)))
     return dshape(Record(fields))
示例#5
0
 def test_empty_struct(self):
     """Parsing ``'{}'`` must equal a DataShape over an empty Record."""
     parsed = dshape('{}')
     built = DataShape(Record([]))
     self.assertEqual(parsed, built)
示例#6
0
 def _schema(self):
     """Return the child's schema restricted to ``self.fields``.

     If the child's measure exposes a ``value`` attribute, the field types
     are read from that object instead of from the measure itself.
     """
     child_measure = self._child.schema.measure
     record = getattr(child_measure, 'value', child_measure)
     types_by_name = record.dict
     return DataShape(
         Record((field, types_by_name[field]) for field in self.fields))
示例#7
0
 def out_dshape(self, input_dshape):
     """Return a record datashape with one ``int32`` field per category.

     Categories are read from the ``self.column`` entry of the input
     measure.
     """
     categories = input_dshape.measure[self.column].categories
     fields = [(category, ct.int32) for category in categories]
     return dshape(Record(fields))
示例#8
0
def discover_h5py_group_file(g):
    """Return a record datashape describing every item of ``g``.

    Each ``(key, value)`` pair from ``g.items()`` becomes a field named
    ``key`` whose type is ``discover(value)``.
    """
    members = [[key, discover(member)] for key, member in g.items()]
    return DataShape(Record(members))
示例#9
0
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    """Wrap ``data`` in an ``InteractiveSymbol`` with a resolved datashape.

    Parameters
    ----------
    data
        Object to wrap.  A string is handed to ``resource`` (together with
        ``schema``, ``dshape`` and ``kwargs``); if it contains ``'::'`` the
        part after the separator is treated as a ``'/'``-separated path
        that is used to index into the resulting symbol.  An iterator that
        is not an instance of one of the ``not_an_iterator`` types is
        materialised into a tuple.
    dshape
        Datashape of ``data``, as an object or as a string.  Discovered
        from ``data`` when omitted.
    name
        Passed through to ``InteractiveSymbol``.
    fields
        Field names applied only when the datashape is discovered:

        * Tuple measure: names the tuple's element types.
        * scalar measure: names ``int(dshape[-2])`` copies of the measure,
          replacing the last dimension with a record.
        * record measure: must equal the discovered field names.
    columns
        Deprecated; any truthy value raises ``ValueError``.
    schema
        Per-element schema; used as ``var * schema`` when ``dshape`` is not
        given.  Mutually exclusive with ``dshape``.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Returns
    -------
    The ``InteractiveSymbol`` for ``data``, indexed by each non-empty part
    of the sub-URI when one was given.

    Raises
    ------
    ValueError
        If ``columns`` is given, if both ``schema`` and ``dshape`` are
        given, or if ``fields`` disagrees with the field names discovered
        for record-shaped data.
    """
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        # Materialise so the data can be inspected more than once.
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
        if isinstance(dshape.measure, Tuple) and fields:
            # Read the element types from the measure itself rather than
            # from a fixed position, so any number of leading dimensions
            # is handled.
            types = dshape.measure.dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # The last dimension becomes the record: one field per element.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            names = dshape.measure.names
            # Compare as lists: ``fields`` may be a tuple (or another
            # iterable), and sequences of different types never compare
            # equal even when their elements agree.
            field_names = list(fields)
            if list(names) != field_names:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse Data(data).relabel(%s) to rename '
                    'fields' %
                    (names, fields, ', '.join(
                        '%s=%r' % (k, v)
                        for k, v in zip(names, field_names))))
            types = dshape.measure.types
            schema = Record(list(zip(field_names, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        # Walk the sub-URI path, skipping empty parts such as those
        # produced by leading, trailing or doubled slashes.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
示例#10
0
def discover_row_proxy(rp):
    """Return a ``Record`` pairing each key of ``rp`` with its value's shape.

    The field types are obtained by applying ``discover`` to ``rp.values()``.
    """
    names = rp.keys()
    shapes = map(discover, rp.values())
    return Record(list(zip(names, shapes)))
示例#11
0
    def _schema(self):
        """Return the record datashape produced by this join.

        The result lists the joined-on fields first (under their left-hand
        names, with the left and right types promoted), then the remaining
        left fields, then the remaining right fields.

        Examples
        --------
        >>> from blaze import symbol
        >>> t = symbol('t', 'var * {name: string, amount: int}')
        >>> s = symbol('t', 'var * {name: string, id: int}')

        >>> join(t, s).schema
        dshape("{name: string, amount: int32, id: int32}")

        >>> join(t, s, how='left').schema
        dshape("{name: string, amount: int32, id: ?int32}")

        Overlapping but non-joined fields append _left, _right

        >>> a = symbol('a', 'var * {x: int, y: int}')
        >>> b = symbol('b', 'var * {x: int, y: int}')
        >>> join(a, b, 'x').fields
        ['x', 'y_left', 'y_right']
        """

        def as_option(dt):
            # Wrap ``dt`` in ``Option`` unless it already is one.
            return dt if isinstance(dt, Option) else Option(dt)

        # Normalise the join keys: anything that is not a list is treated
        # as a single key.
        on_left = self.on_left
        if not isinstance(on_left, list):
            on_left = on_left,

        on_right = self.on_right
        if not isinstance(on_right, list):
            on_right = on_right,

        # Re-key the right-hand field types by the left-hand name of the
        # column each one is joined against, so they can be looked up by
        # the left-hand field name below.
        right_types = keymap(
            dict(zip(on_right, on_left)).get,
            self.rhs.dshape.measure.dict,
        )
        joined = ((name, promote(dt, right_types[name], promote_option=False))
                  for name, dt in self.lhs.dshape.measure.fields
                  if name in on_left)

        left = [(name, dt) for name, dt in zip(
            self.lhs.fields, types_of_fields(self.lhs.fields, self.lhs))
                if name not in on_left]

        right = [(name, dt) for name, dt in zip(
            self.rhs.fields, types_of_fields(self.rhs.fields, self.rhs))
                 if name not in on_right]

        # Non-join fields whose name occurs on both sides receive the
        # configured suffixes.  ``left`` and ``right`` already exclude the
        # join keys, so their names can be intersected directly.
        overlap = ({name for name, _ in left}
                   & {name for name, _ in right})

        left_suffix, right_suffix = self.suffixes
        left = ((name + left_suffix if name in overlap else name, dt)
                for name, dt in left)
        right = ((name + right_suffix if name in overlap else name, dt)
                 for name, dt in right)

        # Fields from a side become optional when the join type is the
        # opposite side's ('right' for left fields, 'left' for right
        # fields) or 'outer'.
        if self.how in ('right', 'outer'):
            left = ((name, as_option(dt)) for name, dt in left)
        if self.how in ('left', 'outer'):
            right = ((name, as_option(dt)) for name, dt in right)

        return dshape(Record(chain(joined, left, right)))
示例#12
0
def discover_sqlalchemy_column(c):
    """Return a one-field ``Record`` describing column ``c``.

    The field is named ``c.name`` and typed ``discover(c.type)``, wrapped
    in ``Option`` when ``c.nullable`` is truthy.  A column without a
    ``nullable`` attribute is treated as nullable.
    """
    if getattr(c, 'nullable', True):
        wrap = Option
    else:
        wrap = identity
    field = (c.name, wrap(discover(c.type)))
    return Record([field])
示例#13
0
def _bound_symbol(cls, data_source, dshape, name, fields, schema, **kwargs):
    """Build a ``cls`` instance binding ``data_source`` to a datashape.

    Parameters
    ----------
    cls
        Callable invoked as ``cls(data_source, ds, name)`` to build the
        result.
    data_source
        Object to bind.  A ``BoundSymbol`` is unwrapped to its ``data``; a
        string is handed to ``resource`` (with ``schema``, ``dshape`` and
        ``kwargs``); an iterator that is not an instance of one of the
        ``not_an_iterator`` types is materialised into a tuple.
    dshape
        Datashape of the data, as an object or as a string.  Discovered
        from ``data_source`` when falsy.
    name
        Name for the result.  When it is the ``generate`` sentinel, a name
        is drawn from ``_names`` for tabular datashapes and ``None`` is
        used otherwise.
    fields
        Field names applied only when the datashape is discovered:

        * Tuple measure: names the tuple's element types.
        * scalar measure: names ``int(dshape[-2])`` copies of the measure,
          replacing the last dimension with a record.
        * record measure: must equal the discovered field names.
    schema
        Per-element schema; used as ``var * schema`` when ``dshape`` is not
        given.  Mutually exclusive with ``dshape``.
    **kwargs
        Forwarded to ``resource`` when ``data_source`` is a string.

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are given, or if ``fields``
        disagrees with the field names discovered for record-shaped data.
    """
    if schema and dshape:
        raise ValueError(
            'Please specify one of schema= or dshape= keyword  arguments', )

    if isinstance(data_source, BoundSymbol):
        # Re-bind the underlying data rather than nesting bound symbols.
        return _bound_symbol(cls, data_source.data, dshape, name, fields,
                             schema, **kwargs)

    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if isinstance(data_source, _strtypes):
        data_source = resource(data_source,
                               schema=schema,
                               dshape=dshape,
                               **kwargs)

    if (isinstance(data_source, Iterator)
            and not isinstance(data_source, tuple(not_an_iterator))):
        # Materialise so the data can be inspected more than once.
        data_source = tuple(data_source)

    if not dshape:
        dshape = discover(data_source)
        if isinstance(dshape.measure, Tuple) and fields:
            # Read the element types from the measure itself rather than
            # from a fixed position, so any number of leading dimensions
            # is handled.
            types = dshape.measure.dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # The last dimension becomes the record: one field per element.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            names = dshape.measure.names
            # Compare as lists: ``fields`` may be a tuple (or another
            # iterable), and sequences of different types never compare
            # equal even when their elements agree.
            field_names = list(fields)
            if list(names) != field_names:
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse data(data_source).relabel(%s) to rename '
                    'fields' % (
                        names,
                        fields,
                        ', '.join('%s=%r' % (k, v)
                                  for k, v in zip(names, field_names)),
                    ), )
            types = dshape.measure.types
            schema = Record(list(zip(field_names, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)

    if name is generate:
        if istabular(dshape):
            name = next(_names)
        else:
            name = None

    return cls(data_source, ds, name)
示例#14
0
    def test_id_take_last_in_group_macro(self):
        """
        Run ``_test_id`` on a frame without a ``sid`` column, using the
        fields of ``self.macro_dshape`` plus an ``other`` field typed like
        ``value``.

        output (expected):

                                   other  value
        2014-01-01 Equity(65 [A])    NaN      1
                   Equity(66 [B])    NaN      1
                   Equity(67 [C])    NaN      1
        2014-01-02 Equity(65 [A])      1      2
                   Equity(66 [B])      1      2
                   Equity(67 [C])      1      2
        2014-01-03 Equity(65 [A])      2      2
                   Equity(66 [B])      2      2
                   Equity(67 [C])      2      2
         """
        ts = pd.Timestamp
        nan = np.nan

        def row(day, hour, other, value):
            # asof_date is ``day``; timestamp is ``day`` at ``hour``.
            return [ts(day), ts('%s %s' % (day, hour)), other, value]

        df = pd.DataFrame(
            [
                row('2014-01-01', '00', nan, 1),
                row('2014-01-01', '01', nan, nan),
                row('2014-01-02', '00', 1, nan),
                row('2014-01-02', '01', nan, 2),
                row('2014-01-03', '00', 2, nan),
                row('2014-01-03', '01', 3, 3),
            ],
            columns=['asof_date', 'timestamp', 'other', 'value'],
        )

        measure_fields = OrderedDict(self.macro_dshape.measure.fields)
        measure_fields['other'] = measure_fields['value']

        expected_rows = [
            [nan, 1],  # 2014-01-01 Equity(65 [A])
            [nan, 1],  # Equity(66 [B])
            [nan, 1],  # Equity(67 [C])
            [1, 2],  # 2014-01-02 Equity(65 [A])
            [1, 2],  # Equity(66 [B])
            [1, 2],  # Equity(67 [C])
            [2, 2],  # 2014-01-03 Equity(65 [A])
            [2, 2],  # Equity(66 [B])
            [2, 2],  # Equity(67 [C])
        ]

        with tmp_asset_finder() as finder:
            expected_index = pd.MultiIndex.from_product(
                (self.dates, finder.retrieve_all(self.sids)))
            expected = pd.DataFrame(
                expected_rows,
                index=expected_index,
                columns=['other', 'value'],
            )
            self._test_id(
                df,
                var * Record(measure_fields),
                expected,
                finder,
                ('value', 'other'),
            )
示例#15
0
    def test_id_take_last_in_group(self):
        """
        Run ``_test_id`` on per-sid rows with two timestamps per
        (date, sid) pair, using the fields of ``self.dshape`` plus an
        ``other`` field typed like ``value``.
        """
        ts = pd.Timestamp
        nan = np.nan

        def row(day, hour, sid, other, value):
            # asof_date is ``day``; timestamp is ``day`` at ``hour``.
            return [ts(day), ts('%s %s' % (day, hour)), sid, other, value]

        df = pd.DataFrame(
            [
                row('2014-01-01', '00', 65, 0, 0),
                row('2014-01-01', '01', 65, 1, nan),
                row('2014-01-01', '00', 66, nan, nan),
                row('2014-01-01', '01', 66, nan, 1),
                row('2014-01-01', '00', 67, 2, nan),
                row('2014-01-01', '01', 67, nan, nan),
                row('2014-01-02', '00', 65, nan, nan),
                row('2014-01-02', '01', 65, nan, 1),
                row('2014-01-02', '00', 66, nan, nan),
                row('2014-01-02', '01', 66, 2, nan),
                row('2014-01-02', '00', 67, 3, 3),
                row('2014-01-02', '01', 67, 3, 3),
                row('2014-01-03', '00', 65, 2, nan),
                row('2014-01-03', '01', 65, 2, nan),
                row('2014-01-03', '00', 66, 3, 3),
                row('2014-01-03', '01', 66, nan, nan),
                row('2014-01-03', '00', 67, nan, nan),
                row('2014-01-03', '01', 67, nan, 4),
            ],
            columns=['asof_date', 'timestamp', 'sid', 'other', 'value'],
        )

        measure_fields = OrderedDict(self.dshape.measure.fields)
        measure_fields['other'] = measure_fields['value']

        expected_rows = [
            [1, 0],  # 2014-01-01 Equity(65 [A])
            [nan, 1],  # Equity(66 [B])
            [2, nan],  # Equity(67 [C])
            [1, 1],  # 2014-01-02 Equity(65 [A])
            [2, 1],  # Equity(66 [B])
            [3, 3],  # Equity(67 [C])
            [2, 1],  # 2014-01-03 Equity(65 [A])
            [3, 3],  # Equity(66 [B])
            [3, 3],  # Equity(67 [C])
        ]

        with tmp_asset_finder() as finder:
            expected_index = pd.MultiIndex.from_product(
                (self.dates, finder.retrieve_all(self.sids)))
            expected = pd.DataFrame(
                expected_rows,
                index=expected_index,
                columns=['other', 'value'],
            )
            self._test_id(
                df,
                var * Record(measure_fields),
                expected,
                finder,
                ('value', 'other'),
            )
示例#16
0
 def out_dshape(self, input_dshape):
     """Return a record datashape with one field per category.

     Categories are read from the ``self.cat_column`` entry of the input
     measure; every field shares the output datashape that
     ``self.reduction`` reports for ``input_dshape``.
     """
     categories = input_dshape.measure[self.cat_column].categories
     reduced = self.reduction.out_dshape(input_dshape)
     fields = [(category, reduced) for category in categories]
     return dshape(Record(fields))