def _load_comments(self, label):
    """
    Fill the "comments" and "flag" columns of one data set.

    Both columns start from defaults (empty comment, flag "0") and are
    then overwritten with any values stored in the data's metadata
    under the ``MOSViz_comments`` and ``MOSViz_flags`` keys.

    Parameters
    ----------
    label : str
        The label of the data in session.data_collection.

    Returns
    -------
    bool
        False when ``_data_collection_index`` reports -1 for `label`;
        True once both components have been added to the data.
    """
    # Resolve the data by label so we work on the right data
    # (subset data is masked).
    position = self._data_collection_index(label)
    if position == -1:
        return False
    dataset = self.session.data_collection[position]

    # Default column contents.
    n_rows = dataset.shape[0]
    comment_column = np.array([""] * n_rows, dtype=object)
    flag_column = np.array(["0"] * n_rows, dtype=object)

    saved_meta = dataset.meta
    source_id_name = self.catalog.meta["special_columns"]["source_id"]
    source_ids = dataset.get_component(source_id_name).labels

    def restore(meta_key, column, failure_message):
        # Copy saved entries into `column`; a failure is reported and
        # the remaining entries for that key are skipped (best effort).
        if meta_key not in saved_meta:
            return
        try:
            stored = saved_meta[meta_key]
            for object_id in stored.keys():
                row = self._id_to_index_hash(object_id, source_ids)
                if row is not None:
                    column[row] = stored[object_id]
        except Exception as e:
            print(failure_message, e)

    restore("MOSViz_comments", comment_column,
            "MOSViz Comment Load Failed: ")
    restore("MOSViz_flags", flag_column,
            "MOSViz Flag Load Failed: ")

    # Send to the data collection entry.
    dataset.add_component(CategoricalComponent(flag_column, "flag"), "flag")
    dataset.add_component(
        CategoricalComponent(comment_column, "comments"), "comments")
    return True
def test_categorical_data():
    """
    Build two small data sets, each holding one categorical and one
    numeric component.

    Returns
    -------
    tuple
        ``(data, data2)``: the first carries components 'x1' and 'y1',
        the second carries 'x2' and 'y2'.
    """
    first = Data(label="Test Cat Data 1")
    second = Data(label="Teset Cat Data 2")

    first.add_component(
        CategoricalComponent(np.array(['a', 'a', 'b'])), 'x1')
    first.add_component(Component(np.array([1, 2, 3])), 'y1')

    second.add_component(
        CategoricalComponent(np.array(['c', 'a', 'b'])), 'x2')
    second.add_component(Component(np.array([1, 3, 5])), 'y2')

    return first, second
def test_categorical_component():
    """
    Check that a CategoricalComponent survives ``clone``.

    The copy returned by ``clone`` must still be a CategoricalComponent
    and must expose the same codes, labels and categories as the
    component it was made from.
    """
    c = CategoricalComponent(['a', 'b', 'c', 'a', 'b'],
                             categories=['a', 'b', 'c'])
    c2 = clone(c)
    assert isinstance(c2, CategoricalComponent)

    # Verify the source component and the clone. Asserting on the source
    # alone would let a copy with the wrong contents pass unnoticed.
    for component in (c, c2):
        np.testing.assert_array_equal(component.codes, [0, 1, 2, 0, 1])
        np.testing.assert_array_equal(component.labels,
                                      ['a', 'b', 'c', 'a', 'b'])
        np.testing.assert_array_equal(component.categories,
                                      ['a', 'b', 'c'])
def panda_process(indf):
    """
    Build a data set from a table using pandas.

    This attempts to respect categorical data input by letting
    pandas.read_csv infer the type.

    Parameters
    ----------
    indf : pandas.DataFrame
        The table to convert; each column becomes one component.

    Returns
    -------
    Data
        A new data object with one component per column. Object and
        boolean columns become a numeric Component when numeric coercion
        changes the dtype and fewer than 40% of the coerced values are
        null; otherwise they become a CategoricalComponent. All other
        columns become a plain Component.
    """
    # Function-scope import so the block does not depend on a
    # module-level ``pd`` alias being present.
    import pandas as pd

    result = Data()
    # ``DataFrame.items`` replaces ``iteritems`` (removed in pandas 2.0).
    for name, column in indf.items():
        # Builtin ``object``/``bool`` replace the ``np.object``/``np.bool``
        # aliases (removed in NumPy 1.24).
        if column.dtype == object or column.dtype == bool:
            # try to salvage numerical data; unconvertible entries become
            # NaN. ``pd.to_numeric`` replaces ``Series.convert_objects``
            # (removed in pandas 1.0).
            coerced = pd.to_numeric(column, errors='coerce')
            if (coerced.dtype != column.dtype
                    and coerced.isnull().mean() < 0.4):
                c = Component(coerced.values)
            else:
                # pandas has a 'special' nan implementation and this
                # doesn't play well with np.unique
                c = CategoricalComponent(column.fillna(''))
        else:
            c = Component(column.values)

        # convert header to string - in some cases if the first row
        # contains numbers, these are cast to numerical types, so we want
        # to change that here.
        if not isinstance(name, six.string_types):
            name = str(name)

        # strip off leading #
        name = name.strip()
        if name.startswith('#'):
            name = name[1:].strip()

        result.add_component(c, name)

    return result
def mosviz_tabular_data(*args, **kwargs):
    """
    Build a data set from a table. We restrict ourselves to tables
    with 1D columns.

    All arguments are passed to
    astropy.table.Table.read(...).

    Returns
    -------
    Data
        A new data object whose ``meta`` is the table's ``meta`` and
        which holds one component per table column. String columns
        become a CategoricalComponent; everything else goes through
        ``Component.autotyped``.
    """
    result = Data()

    table = astropy_table_read(*args, **kwargs)

    result.meta = table.meta

    # Loop through columns and make component list
    for column_name in table.columns:
        c = table[column_name]
        # Column objects expose either ``unit`` or ``units``.
        u = c.unit if hasattr(c, 'unit') else c.units

        if table.masked:
            # fill array for now
            try:
                c = c.filled(fill_value=np.nan)
            except (ValueError, TypeError):  # assigning nan to integer dtype
                c = c.filled(fill_value=-1)

        dtype = c.dtype.type
        # ``np.bytes_`` is the same scalar type the removed ``np.string_``
        # alias pointed at (the alias is gone as of NumPy 2.0).
        if dtype is np.bytes_ or dtype is np.str_:
            nc = CategoricalComponent(c, units=u)
        else:
            nc = Component.autotyped(c, units=u)

        result.add_component(nc, column_name)

    return result
def _load_categorical_component(rec, context):
    """
    Rebuild a CategoricalComponent from a saved record.

    When the record has a 'log' entry, the component is fetched from the
    object that entry resolves to, using the record's 'log_item'.
    Otherwise a new CategoricalComponent is constructed from the record's
    'categorical_data', 'categories', 'jitter_method' and 'units' entries.
    """
    if 'log' not in rec:
        labels = context.object(rec['categorical_data'])
        categories = context.object(rec['categories'])
        jitter = context.object(rec['jitter_method'])
        return CategoricalComponent(categorical_data=labels,
                                    categories=categories,
                                    jitter=jitter,
                                    units=rec['units'])

    logged = context.object(rec['log'])
    return logged.component(rec['log_item'])
def setup_method(self, method):
    """
    Prepare the fixtures shared by the tests: a data set with a numeric
    'y' and a categorical 'x' component, a subset of it, a collection
    holding it, and a HistogramClient drawing on FIGURE.

    The figure's canvas ``draw`` is replaced by a MagicMock after the
    client has been created, so the draw count starts at zero.
    """
    y_values = [-1, -1, -1, -2, -2, -2, -3, -5, -7]
    x_labels = ['a', 'a', 'a', 'b', 'c', 'd', 'd', 'e', 'f']

    dataset = Data(y=y_values)
    dataset.add_component(CategoricalComponent(x_labels), 'x')
    self.data = dataset

    self.subset = dataset.new_subset()
    self.collect = DataCollection(dataset)

    client = HistogramClient(self.collect, FIGURE)
    self.client = client
    self.axes = client.axes

    FIGURE.canvas.draw = MagicMock()
    assert FIGURE.canvas.draw.call_count == 0
def test_high_cardinatility_timing(self):
    """
    Time a single ``_set_xydata`` call on a categorical component with
    50000 distinct labels (each repeated five times) and require it to
    finish in under three seconds.
    """
    n_categories = 50000
    labels = list(map(str, range(n_categories)))

    dataset = Data()
    dataset.add_component(Component(np.arange(n_categories * 5)), 'y')
    dataset.add_component(
        CategoricalComponent(np.repeat([labels], 5)), 'xcat')
    self.add_data(dataset)

    xcat_id = dataset.find_component_id('xcat')
    set_x = partial(self.client._set_xydata, 'x', xcat_id)
    elapsed = timeit(set_x, number=1)

    assert elapsed < 3  # this is set for Travis speed
def test_ticks_go_back_after_changing(self):
    """
    Switching the x attribute to a categorical component and then back
    to a numeric one should leave the axis ticks in the numeric state.
    """
    dataset = Data()
    columns = (
        ('y', Component(np.arange(100))),
        ('xcat', CategoricalComponent(['a'] * 50 + ['b'] * 50)),
        ('xcont', Component(2 * np.arange(100))),
    )
    for column_name, component in columns:
        dataset.add_component(component, column_name)

    self.add_data(data=dataset)

    # Categorical x against numeric y.
    self.client.yatt = dataset.find_component_id('y')
    self.client.xatt = dataset.find_component_id('xcat')
    self.check_ticks(self.client.axes.xaxis, False, True)
    self.check_ticks(self.client.axes.yaxis, False, False)

    # Back to a numeric x: both axes are checked with the same flags.
    self.client.xatt = dataset.find_component_id('xcont')
    self.check_ticks(self.client.axes.yaxis, False, False)
    self.check_ticks(self.client.axes.xaxis, False, False)
def pandas_to_glue(df, label='data', name_map=default_name_map):
    """Convert dataframe to glue.core.data.Data.

    Each column is added under the name returned by
    ``map_column_names``. Glue categorical variables require hashing, so
    when adding a column raises TypeError the column is replaced by a
    CategoricalComponent of zero-padded row numbers and the original
    values are stored as an array in ``ComponentID._unhashable``.

    NOTE(review): the docstring previously said column names in
    ``name_map`` are overridden with dictionary values, but ``name_map``
    is never read in this function -- ``map_column_names`` is called with
    the column name only. Confirm whether it should be forwarded.
    """
    glue_data = Data(label=label)

    for column_key in df.columns:
        glue_name = map_column_names(column_key)
        try:
            glue_data.add_component(df[column_key], glue_name)
        except TypeError:
            # pd.factorize error with int list input to
            # CategoricalComponent
            n_rows = len(df[column_key])
            placeholders = ['{:09d}'.format(row) for row in range(n_rows)]
            stand_in = CategoricalComponent(placeholders)

            component_id = ComponentID(glue_name)
            component_id._unhashable = np.array(df[column_key])
            glue_data.add_component(stand_in, component_id)

    return glue_data