def create_indexed_columns(self, transform_data: str = '', apply_transforms: bool = True) -> List[Column]:
    """Build the 'a', 'b' and 'c' columns, with 'a' and 'b' indexed by the c column index.

    :param transform_data: forwarded unchanged to ``create_variables_and_c_colindex``
    :param apply_transforms: forwarded unchanged to ``create_variables_and_c_colindex``
    :return: list of three columns, in the order a, b, c
    """
    variables, c_index = self.create_variables_and_c_colindex(
        transform_data=transform_data,
        apply_transforms=apply_transforms,
    )
    var_a, var_b, var_c = variables
    # Each indexed column receives its own single-element list holding the same index object;
    # the 'c' column is created without indices.
    return [
        Column(var_a, 'a', indices=[c_index]),
        Column(var_b, 'b', indices=[c_index]),
        Column(var_c, 'c'),
    ]
def create_columns(self, transform_data: str = '', apply_transforms: bool = True) -> List[Column]:
    """Build un-indexed columns for the three variables, loaded from keys 'a', 'b' and 'c'.

    :param transform_data: forwarded unchanged to ``create_variables``
    :param apply_transforms: forwarded unchanged to ``create_variables``
    :return: list of three columns, in the order a, b, c
    """
    # Strict three-way unpack: anything other than exactly three variables is an error.
    var_a, var_b, var_c = self.create_variables(
        transform_data=transform_data,
        apply_transforms=apply_transforms,
    )
    keyed_variables = ((var_a, 'a'), (var_b, 'b'), (var_c, 'c'))
    return [Column(variable, load_key) for variable, load_key in keyed_variables]
def test_load_with_categorical(self):
    """Loading with the 'c' column declared as a categorical string type gives the expected frame."""
    self.create_csv()
    columns = self.create_columns()
    # Only the third variable is needed; the first two are discarded.
    _, _, c_var = self.create_variables()
    categorical_dtype = StringType(categorical=True)
    columns[2] = Column(c_var, 'c', dtype=categorical_dtype)
    source = self.create_source(df=None, columns=columns)
    assert_frame_equal(source.df, self.expect_loaded_df_categorical)
def test_load_with_columns_transforms_and_pre_applied_transforms(self):
    """Loading with 'cell' transforms where column 'a' is marked as already having 'add_one_cell' applied."""
    self.create_csv()
    columns = self.create_columns(transform_data='cell')
    # Only the first variable is needed; the other two are discarded.
    a_var, _, _ = self.create_variables(transform_data='cell')
    pre_applied_keys = ['add_one_cell']
    columns[0] = Column(a_var, 'a', applied_transform_keys=pre_applied_keys)
    source = self.create_source(df=None, columns=columns)
    assert_frame_equal(
        source.df,
        self.expect_loaded_df_with_transform_and_a_pre_transformed,
    )
def try_to_calculate_variables(self, df: pd.DataFrame):
    """Add calculated variables to the source's data where their inputs are available.

    If the source has no columns, ``df`` is returned untouched. Otherwise ``df`` is
    assigned to ``self.source.df`` and every load variable whose key is not already in
    ``self.source.col_var_keys`` is considered for calculation. A variable is calculated
    only when, for each variable its calculation requires, the matching column already
    lists every transform key of that required variable (duplicates counted separately).
    Variables that do not meet this condition are skipped without error.

    :param df: data to attach to the source and to extend with calculated series
    :return: ``df`` itself when the source has no columns, otherwise ``self.source.df``
    :raises ValueError: if a variable is not in the columns and has no calculation
    """
    logger.debug(
        f'Trying to calculate variables for source {self.source.name} in loader {self}'
    )
    source = self.source
    if not source.columns:
        return df

    # Attach the data to the source so the data and all columns are reachable through one object
    source.df = df

    for variable in source.load_variables:
        if variable.key in source.col_var_keys:
            # Already present in the data, either originally or from an earlier calculation
            continue

        calculation = variable.calculation
        if calculation is None:
            raise ValueError(
                f'passed variable {variable} but not calculated and not '
                f'in columns {self.source.columns}')

        input_cols = []
        can_calculate = True
        for req_var in calculation.variables:
            req_col = source.col_for(req_var)
            input_cols.append(req_col)
            # Work on a copy so that matching keys can be consumed one at a time
            unmatched_keys = deepcopy(req_col.applied_transform_keys)
            for transform in req_var.applied_transforms:
                # The column must already carry every transform of the required variable
                if transform.key not in unmatched_keys:
                    can_calculate = False
                    break
                unmatched_keys.remove(transform.key)
            if not can_calculate:
                # Stop at the first required variable that is not ready
                break

        if not can_calculate:
            continue

        # All inputs are ready: run the calculation and register its result
        new_series = calculation.func(input_cols)
        new_series.name = variable.name
        # TODO [#34]: determine how to set index for columns from calculated variables
        new_col = Column(variable, dtype=str(new_series.dtype), series=new_series)
        source.df[variable.name] = new_series
        source.columns.append(new_col)

    return source.df
def test_load_with_datetime(self):
    """Loading a CSV with an added 'd' column through a 'datetime' variable named 'Date'."""
    timestamp = pd.to_datetime('1/1/2000')

    # Input data: the base test frame plus a constant 'd' column
    input_df = self.test_df.copy()
    input_df['d'] = timestamp
    self.create_csv(df=input_df)

    # Expected output: the rename-only frame plus the same value under 'Date'
    expected = self.expect_loaded_df_rename_only.copy()
    expected['Date'] = timestamp

    date_col = Column(Variable('Date', dtype='datetime'), 'd')
    columns = self.create_columns()
    columns.append(date_col)

    source = self.create_source(df=None, columns=columns)
    assert_frame_equal(source.df, expected)
def _apply_transforms_to_var(var: 'Variable', column: Column, source: 'DataSource') -> 'DataSource':
    """Apply to ``source`` the transforms of ``var`` that ``column`` does not already record.

    :param var: variable whose ``applied_transforms`` are walked in order
    :param column: column for the variable; its ``applied_transform_keys`` is appended to
        and its ``variable`` is overwritten when a transform is applied
    :param source: source passed to, and replaced by the result of, each applied transform
    :return: the source after all outstanding transforms have been applied
    """
    # Copy of the keys recorded on the column. Keys are consumed one by one because the
    # same transform may be wanted several times: if it is wanted twice but recorded on
    # the column only once, it still has to be applied once.
    already_applied = deepcopy(column.applied_transform_keys)
    for transform in var.applied_transforms:
        key = transform.key
        if key in already_applied:
            # Already applied in the saved data: consume the key and skip the transform
            already_applied.remove(key)
        else:
            source = transform._apply_transform_for_column_and_variable_to_source(
                source, column, var)
            column.applied_transform_keys.append(key)
            # NOTE(review): placement reconstructed from a flattened source line; the
            # assignment is kept directly after the append, inside the loop - confirm.
            column.variable = var  # overwrite untransformed variable with transformed variable
    return source
def _create_series_in_df_for_calculation(self, df: pd.DataFrame, col: Column):
    """Duplicate the data for ``col``'s variable under a fresh key in ``df`` and point ``col`` at it.

    Both ``df`` and ``col`` are modified in place; nothing is returned.

    :param df: frame that receives the copied series under a new uuid4-based key
    :param col: column whose ``load_key`` is replaced by the new key
    """
    # Column that already has data for this variable
    source_col = self.col_for(col.variable)
    # Temporary key for this variable
    temp_key = str(uuid.uuid4())
    df[temp_key] = deepcopy(df[source_col.load_key])
    col.load_key = temp_key