예제 #1
0
 def create_indexed_columns(self,
                            transform_data: str = '',
                            apply_transforms: bool = True) -> List[Column]:
     """
     Build the 'a', 'b' and 'c' columns, attaching the c column index to 'a' and 'b'.

     :param transform_data: forwarded unchanged to create_variables_and_c_colindex
     :param apply_transforms: forwarded unchanged to create_variables_and_c_colindex
     :return: the three columns, ordered a, b, c
     """
     variables, c_index = self.create_variables_and_c_colindex(
         transform_data=transform_data, apply_transforms=apply_transforms)
     var_a, var_b, var_c = variables
     return [
         # Each indexed column receives its own list wrapping the same index object
         Column(var_a, 'a', indices=[c_index]),
         Column(var_b, 'b', indices=[c_index]),
         # 'c' is created without any indices
         Column(var_c, 'c'),
     ]
예제 #2
0
 def create_columns(self,
                    transform_data: str = '',
                    apply_transforms: bool = True) -> List[Column]:
     """
     Build plain 'a', 'b' and 'c' columns from the variables given by create_variables.

     :param transform_data: forwarded unchanged to create_variables
     :param apply_transforms: forwarded unchanged to create_variables
     :return: the three columns, ordered a, b, c
     """
     # Explicit three-way unpack: anything other than exactly three variables is an error
     var_a, var_b, var_c = self.create_variables(
         transform_data=transform_data, apply_transforms=apply_transforms)
     keyed_variables = ((var_a, 'a'), (var_b, 'b'), (var_c, 'c'))
     return [Column(variable, key) for variable, key in keyed_variables]
예제 #3
0
 def test_load_with_categorical(self):
     """
     Load the CSV with column 'c' typed as a categorical string and compare
     the result against expect_loaded_df_categorical.
     """
     self.create_csv()
     columns = self.create_columns()
     _, _, c_var = self.create_variables()
     # Swap the third column for one typed as a categorical string
     categorical_dtype = StringType(categorical=True)
     columns[2] = Column(c_var, 'c', dtype=categorical_dtype)
     source = self.create_source(df=None, columns=columns)
     assert_frame_equal(source.df, self.expect_loaded_df_categorical)
예제 #4
0
 def test_load_with_columns_transforms_and_pre_applied_transforms(self):
     """
     Load with 'cell' transform data while column 'a' is declared as already
     having 'add_one_cell' applied, and compare the result against
     expect_loaded_df_with_transform_and_a_pre_transformed.
     """
     self.create_csv()
     columns = self.create_columns(transform_data='cell')
     a_var, _, _ = self.create_variables(transform_data='cell')
     # Swap the first column for one that records the transform as pre-applied
     columns[0] = Column(a_var, 'a', applied_transform_keys=['add_one_cell'])
     source = self.create_source(df=None, columns=columns)
     expected = self.expect_loaded_df_with_transform_and_a_pre_transformed
     assert_frame_equal(source.df, expected)
예제 #5
0
    def try_to_calculate_variables(self, df: pd.DataFrame):
        """
        Add calculated variables to the data wherever their inputs are ready.

        Each load variable of the source that is not yet backed by a column is
        computed from its calculation, provided every transform applied to
        each required variable is also recorded on the column holding that
        variable. Variables whose inputs are not ready are left out.

        :param df: data to attach to the source and extend with calculated series
        :return: df untouched when the source has no columns, otherwise the
            source's df after calculated series were added
        :raises ValueError: if a load variable has neither a column nor a calculation
        """
        logger.debug(
            f'Trying to calculate variables for source {self.source.name} in loader {self}'
        )
        source = self.source
        if not source.columns:
            return df

        # Attach the data to the source so the data and all columns are reachable through one object
        source.df = df

        for variable in source.load_variables:
            if variable.key in source.col_var_keys:
                # Already present, either loaded from the source or calculated on an earlier pass
                continue

            calculation = variable.calculation
            if calculation is None:
                raise ValueError(
                    f'passed variable {variable} but not calculated and not '
                    f'in columns {self.source.columns}')

            input_cols = []
            inputs_ready = True
            for needed_var in calculation.variables:
                needed_col = source.col_for(needed_var)
                input_cols.append(needed_col)
                # Work on a copy: keys are consumed one by one so that a transform
                # required twice must also have been applied to the column twice
                unmatched_keys = deepcopy(needed_col.applied_transform_keys)
                for transform in needed_var.applied_transforms:
                    if transform.key not in unmatched_keys:
                        inputs_ready = False
                        break
                    unmatched_keys.remove(transform.key)
                if not inputs_ready:
                    # No need to inspect the remaining required variables
                    break

            if not inputs_ready:
                continue

            # Inputs are in the required state, run the calculation
            result = calculation.func(input_cols)
            result.name = variable.name
            # TODO [#34]: determine how to set index for columns from calculated variables
            calculated_col = Column(variable,
                                    dtype=str(result.dtype),
                                    series=result)
            source.df[variable.name] = result
            source.columns.append(calculated_col)

        return source.df
예제 #6
0
    def test_load_with_datetime(self):
        """
        Load a CSV that has an extra datetime column 'd' and check that it
        comes back as 'Date' alongside the renamed base columns.
        """
        timestamp = pd.to_datetime('1/1/2000')

        # Write a CSV holding the base data plus a constant datetime column
        csv_df = self.test_df.copy()
        csv_df['d'] = timestamp
        self.create_csv(df=csv_df)

        # Expected result: the rename-only frame plus the same value under 'Date'
        expected = self.expect_loaded_df_rename_only.copy()
        expected['Date'] = timestamp

        date_column = Column(Variable('Date', dtype='datetime'), 'd')
        columns = self.create_columns()
        columns.append(date_column)

        source = self.create_source(df=None, columns=columns)
        assert_frame_equal(source.df, expected)
예제 #7
0
def _apply_transforms_to_var(var: 'Variable', column: Column,
                             source: 'DataSource') -> 'DataSource':
    """
    Apply to source the transforms of var that column does not already record.

    :param var: variable whose applied_transforms are walked in order
    :param column: column whose applied_transform_keys say what was already
        applied; it is updated in place for each transform applied here
    :param source: data source handed to each transform
    :return: the source as returned by the last transform applied, or the
        source passed in when nothing needed applying
    """
    # Copy so that consuming keys below does not alter the column's own record
    already_applied = deepcopy(column.applied_transform_keys)
    for transform in var.applied_transforms:
        key = transform.key
        if key not in already_applied:
            source = transform._apply_transform_for_column_and_variable_to_source(
                source, column, var)
            column.applied_transform_keys.append(key)
            # The column now carries the transformed variable instead of the untransformed one
            column.variable = var
        else:
            # The column already records this transform, so skip it. Consume a single
            # occurrence only: the same transform may be wanted several times, and
            # any occurrence beyond what the column records must still be applied.
            already_applied.remove(key)
    return source
예제 #8
0
 def _create_series_in_df_for_calculation(self, df: pd.DataFrame, col: Column):
     """
     Copy the data for col's variable into df under a fresh temporary key and
     point col at that copy.

     :param df: frame that receives the copied series, modified in place
     :param col: column whose load_key is replaced by the temporary key
     """
     # Look up the column that already holds data for this variable
     source_col = self.col_for(col.variable)
     # Random key, used as the temporary name for this variable's copy
     temp_key = str(uuid.uuid4())
     df[temp_key] = deepcopy(df[source_col.load_key])
     col.load_key = temp_key