Exemplo n.º 1
0
def test_categorical_component():
    """Check that a CategoricalComponent keeps its data when passed to ``clone``.

    The component is built from five labels over the explicit categories
    ``['a', 'b', 'c']``. Both the original and the object returned by
    ``clone`` must expose the same codes, labels and categories.
    """
    c = CategoricalComponent(['a', 'b', 'c', 'a', 'b'], categories=['a', 'b', 'c'])
    c2 = clone(c)
    assert isinstance(c2, CategoricalComponent)

    # Verify the clone as well as the original: checking only ``c`` would let
    # a clone that has the right type but wrong contents slip through.
    for component in (c, c2):
        np.testing.assert_array_equal(component.codes, [0, 1, 2, 0, 1])
        np.testing.assert_array_equal(component.labels, ['a', 'b', 'c', 'a', 'b'])
        np.testing.assert_array_equal(component.categories, ['a', 'b', 'c'])
Exemplo n.º 2
0
def panda_process(indf):
    """
    Build a data set from a table using pandas. This attempts to respect
    categorical data input by letting pandas.read_csv infer the type

    Each column of ``indf`` becomes one component of the returned ``Data``
    object:

    * object/bool columns are first coerced to numbers; if the coercion
      changes the dtype and fewer than 40% of the coerced values are null,
      the column is stored as a numerical ``Component``, otherwise as a
      ``CategoricalComponent`` (with missing values replaced by ``''``);
    * every other column is stored as a ``Component`` of its raw values.

    Column names are converted to strings, stripped of surrounding
    whitespace, and stripped of a single leading ``#``.

    Parameters
    ----------
    indf : pandas.DataFrame
        The table to convert.

    Returns
    -------
    Data
        A data object with one component per column of ``indf``.
    """
    # Local import: this block relies on ``pd.to_numeric`` and should not
    # depend on a module-level alias being present.
    import pandas as pd

    result = Data()
    # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` is the
    # equivalent that works on both old and new versions.
    for name, column in indf.items():
        # ``np.object`` / ``np.bool`` were removed in NumPy 1.24; the builtins
        # compare equal to the same dtypes.
        if column.dtype == object or column.dtype == bool:
            # try to salvage numerical data (``convert_objects`` no longer
            # exists; ``to_numeric`` with errors='coerce' likewise turns
            # unparseable entries into NaN)
            coerced = pd.to_numeric(column, errors='coerce')
            if (coerced.dtype !=
                    column.dtype) and coerced.isnull().mean() < 0.4:
                c = Component(coerced.values)
            else:
                # pandas has a 'special' nan implementation and this doesn't
                # play well with np.unique
                c = CategoricalComponent(column.fillna(''))
        else:
            c = Component(column.values)

        # convert header to string - in some cases if the first row contains
        # numbers, these are cast to numerical types, so we want to change that
        # here.
        if not isinstance(name, six.string_types):
            name = str(name)

        # strip off leading #
        name = name.strip()
        if name.startswith('#'):
            name = name[1:].strip()

        result.add_component(c, name)

    return result
Exemplo n.º 3
0
def mosviz_tabular_data(*args, **kwargs):
    """
    Build a data set from a table. We restrict ourselves to tables
    with 1D columns.

    All arguments are passed to
        astropy.table.Table.read(...).

    The table's ``meta`` is copied onto the result. Each column becomes one
    component, carrying the column's ``unit`` (or ``units``) attribute:
    string columns become a ``CategoricalComponent``, everything else goes
    through ``Component.autotyped``.

    Returns
    -------
    Data
        A data object with one component per table column.
    """

    result = Data()

    table = astropy_table_read(*args, **kwargs)

    result.meta = table.meta

    # Loop through columns and make component list
    for column_name in table.columns:
        c = table[column_name]
        # Columns expose the unit as either ``unit`` or ``units``.
        u = c.unit if hasattr(c, 'unit') else c.units

        if table.masked:
            # fill array for now
            try:
                c = c.filled(fill_value=np.nan)
            except (ValueError, TypeError):  # assigning nan to integer dtype
                c = c.filled(fill_value=-1)

        dtype = c.dtype.type
        # ``np.string_`` was an alias of ``np.bytes_`` and was removed in
        # NumPy 2.0; ``np.bytes_`` is the same type on every NumPy version.
        if dtype is np.bytes_ or dtype is np.str_:
            nc = CategoricalComponent(c, units=u)
        else:
            nc = Component.autotyped(c, units=u)
        result.add_component(nc, column_name)

    return result
Exemplo n.º 4
0
def _load_categorical_component(rec, context):
    """Rebuild a categorical component from its saved record ``rec``.

    If the record has a ``'log'`` entry, the component is fetched from the
    object that ``context`` resolves for that entry, using ``rec['log_item']``.
    Otherwise a new ``CategoricalComponent`` is built from the data,
    categories and jitter method that ``context`` resolves, plus the raw
    ``rec['units']`` value.
    """
    if 'log' in rec:
        logged = context.object(rec['log'])
        return logged.component(rec['log_item'])

    # Resolve the stored references in the same order as the record fields.
    values = context.object(rec['categorical_data'])
    cats = context.object(rec['categories'])
    jitter_method = context.object(rec['jitter_method'])
    return CategoricalComponent(
        categorical_data=values,
        categories=cats,
        jitter=jitter_method,
        units=rec['units'],
    )
Exemplo n.º 5
0
 def setup_method(self, method):
     """Build a fresh histogram fixture before each test.

     Creates a data set with a numerical ``y`` component and a categorical
     ``x`` component, a subset of it, a collection holding it, and a
     ``HistogramClient`` drawing on the shared ``FIGURE``. The figure's
     canvas ``draw`` is replaced by a ``MagicMock``.
     """
     dataset = Data(y=[-1, -1, -1, -2, -2, -2, -3, -5, -7])
     self.data = dataset
     x_labels = ['a', 'a', 'a', 'b', 'c', 'd', 'd', 'e', 'f']
     dataset.add_component(CategoricalComponent(x_labels), 'x')

     self.subset = self.data.new_subset()
     self.collect = DataCollection(self.data)
     self.client = HistogramClient(self.collect, FIGURE)
     self.axes = self.client.axes

     # Swap in a mock so draw calls can be counted, starting from zero.
     draw_mock = MagicMock()
     FIGURE.canvas.draw = draw_mock
     assert FIGURE.canvas.draw.call_count == 0
Exemplo n.º 6
0
    def test_high_cardinatility_timing(self):
        """Time one ``_set_xydata`` call on a categorical axis with many labels.

        The data set has 50000 distinct string labels, each repeated five
        times, alongside a numerical ``y`` component of matching length.
        A single call on the ``'x'`` axis must finish in under 3 seconds.
        """
        card = 50000
        data = Data()
        labels = list(map(str, range(card)))

        y_values = Component(np.arange(card * 5))
        data.add_component(y_values, 'y')
        x_values = CategoricalComponent(np.repeat([labels], 5))
        data.add_component(x_values, 'xcat')
        self.add_data(data)

        comp = data.find_component_id('xcat')
        set_x = partial(self.client._set_xydata, 'x', comp)
        elapsed = timeit(set_x, number=1)

        assert elapsed < 3  # this is set for Travis speed
Exemplo n.º 7
0
    def test_ticks_go_back_after_changing(self):
        """Switching x from a categorical to a numeric attribute resets ticks.

        The x axis is first set to the categorical ``xcat`` component and
        then to the numerical ``xcont`` component. ``check_ticks`` is run
        on both axes after each change.
        """
        data = Data()
        data.add_component(Component(np.arange(100)), 'y')
        half_and_half = ['a'] * 50 + ['b'] * 50
        data.add_component(CategoricalComponent(half_and_half), 'xcat')
        data.add_component(Component(2 * np.arange(100)), 'xcont')

        self.add_data(data=data)
        self.client.yatt = data.find_component_id('y')

        # Categorical x: the last check_ticks flag differs between x and y.
        self.client.xatt = data.find_component_id('xcat')
        for axis_name, flags in (('xaxis', (False, True)),
                                 ('yaxis', (False, False))):
            self.check_ticks(getattr(self.client.axes, axis_name), *flags)

        # Back to a numeric x: both axes are checked with the same flags.
        self.client.xatt = data.find_component_id('xcont')
        for axis_name in ('yaxis', 'xaxis'):
            self.check_ticks(getattr(self.client.axes, axis_name), False, False)
Exemplo n.º 8
0
def pandas_to_glue(df, label='data', name_map=default_name_map):
    """Convert a dataframe into a ``Data`` object labelled ``label``.

    Every column is added as a component under the name returned by
    ``map_column_names``. If adding a column raises ``TypeError``, a
    stand-in ``CategoricalComponent`` of zero-padded row numbers is added
    instead, and the original column values are kept as an array on the
    ``_unhashable`` attribute of the new ``ComponentID``.

    NOTE(review): ``name_map`` is accepted but never forwarded to
    ``map_column_names`` here -- confirm whether that helper is meant to
    receive it.
    """
    data = Data(label=label)

    for column in df.columns:
        glue_name = map_column_names(column)
        try:
            data.add_component(df[column], glue_name)
            continue
        except TypeError:
            # pd.factorize error with int list input to CategoricalComponent
            row_count = len(df[column])

        # Fallback: one unique, hashable placeholder label per row.
        placeholders = ['%09d' % row for row in range(row_count)]
        stand_in = CategoricalComponent(placeholders)
        component_id = ComponentID(glue_name)
        component_id._unhashable = np.array(df[column])
        data.add_component(stand_in, component_id)

    return data