Example #1
 def reset(self, forward: bool = False):
     logger.debug(f'Resetting source {self.name}')
     del self._df
     self._df = None
     if forward:
         for item in self.forward_links:
             item.reset(forward=True)
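
A minimal sketch of the forward cascade, using a hypothetical `Node` stand-in for the real datacode classes:

class Node:
    def __init__(self, name):
        self.name = name
        self._df = None
        self.forward_links = []

    def reset(self, forward=False):
        self._df = None
        if forward:
            for item in self.forward_links:
                item.reset(forward=True)

a, b, c = Node('a'), Node('b'), Node('c')
a.forward_links = [b]
b.forward_links = [c]
a.reset(forward=True)  # clears the cached df on a, b, and c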
Example #2
 def _set_df_from_first_operation(self):
     logger.debug(f'Setting pipeline {self.name} df from first operation')
     # Need to check, as this may be a generation pipeline which would not have a df to start from
     if (hasattr(self.operations[0], 'data_sources')
             and self.operations[0].data_sources
             and self.operations[0].num_required_sources > 0):
         self.df = self.operations[0].data_sources[0].df
Example #3
    def rename_columns(self, df: pd.DataFrame):
        from datacode.models.source import NoColumnForVariableException
        if not self.source.columns:
            return

        logger.debug(
            f'Renaming columns in {self.source.name} in loader {self}')
        rename_dict = {}
        for variable in self.source._orig_load_variables:
            if variable.key not in self.source.col_var_keys:
                if variable.calculation is None:
                    raise ValueError(
                        f'passed variable {variable} but it has no calculation and is not '
                        f'in columns {self.source.columns}')
                continue
            col = self.source.col_for(variable)
            rename_dict[col.load_key] = variable.name
            col.variable = variable
        for variable in self.source._vars_for_calculate:
            try:
                col = self.source.col_for(variable, for_calculate_only=True)
                rename_dict[col.load_key] = variable.name
                col.variable = variable
            except NoColumnForVariableException:
                # Must be using a pre-existing column rather than a newly generated column, need to rename that instead
                col = self.source.col_for(variable, orig_only=True)
                rename_dict[col.load_key] = variable.name
                col.variable = variable
        df.rename(columns=rename_dict, inplace=True)
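
The underlying pattern is plain pandas: build a mapping from stored column keys to display names, then rename in place. A standalone sketch with hypothetical keys:

import pandas as pd

df = pd.DataFrame({'var_a': [1, 2], 'var_b': [3, 4]})
rename_dict = {'var_a': 'Variable A', 'var_b': 'Variable B'}
df.rename(columns=rename_dict, inplace=True)  # keys missing from df are ignored by default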
Example #4
    def duplicate_columns_for_calculations_assign_series(
            self, df: pd.DataFrame) -> pd.DataFrame:
        # TODO [#39]: more efficient implementation of loading variables for calculations
        #
        # The `DataLoader` checks what variables are needed for calculations that are not
        # included in `load_variables`, and if it requires multiple transformations of
        # a variable, then it copies that series for as many transformations are needed.
        # It would be better to have an implementation that doesn't require carrying copies
        # through everything.
        logger.debug(
            f'Duplicating columns for calculation in source {self.source.name} in loader {self}'
        )
        for col in self.source._columns_for_calculate:
            # Extra column is already in source, but need to add to df
            self.source._create_series_in_df_for_calculation(df, col)

        if not self.source.load_variables:
            return df

        # If multiple transformations of a variable are needed, also copy that column
        # so it is not consumed by just one of the transformations (or the original variable)
        unique_var_keys: Dict[str, Variable] = {}
        for var in self.source.load_variables:
            # If the variable is needed multiple times, but not because of calculations (multiple transforms)
            if var.key in unique_var_keys and var not in self.source._vars_for_calculate:
                if var.calculation is not None:
                    # This calculated variable needs to be duplicated, but it has not been calculated yet.
                    # Therefore add to a list of variables which need to be duplicated after calculation.
                    if var.key not in self._calculated_variables_that_need_duplication:
                        self._calculated_variables_that_need_duplication[
                            var.key] = (unique_var_keys[var.key], var)
                    continue
                # Got a variable multiple times, duplicate the column
                # Use the original variable for duplication as the column will already exist for that variable
                self.source._duplicate_column_for_calculation(
                    df,
                    orig_var=unique_var_keys[var.key],
                    new_var=var,
                )
            else:
                # Not a repeated variable, just add to tracking dict
                unique_var_keys[var.key] = var

        # Reorder df to be the same order as passed load variables
        col_order: Dict[str, int] = {}
        for col in self.source.columns:
            if col.variable.key not in self.source.load_var_keys:
                # Must be an unloaded column, skip it
                continue
            if col.variable.calculation is not None:
                # Calculated variables won't be in the data yet, skip them
                continue
            order = self.source.load_var_keys.index(col.variable.key)
            col_order[col.load_key] = order
        # Sort column keys by their position in the passed load variables
        col_keys = sorted(col_order, key=col_order.get)
        df = df[col_keys]

        return df
Example #5
 def pipeline_obj_last_modified(
         self
 ) -> Tuple['LinkedLastModifiedItem', Optional[datetime.datetime]]:
     logger.debug(f'Determining pipeline object last modified for {self}')
     lm = _nested_most_recent_obj_last_modified(None, self, self)
     logger.debug(
         f'Finished determining pipeline object last modified for {self}')
     return lm
Example #6
 def _load(self):
     logger.debug(f'Started loading source {self}')
     hooks.on_begin_load_source(self)
     if not hasattr(self, 'data_loader'):
         self._set_data_loader(self.loader_class, pipeline=self.pipeline, **self.read_file_kwargs)
     df = self.data_loader()
     df = hooks.on_end_load_source(self, df)
     logger.debug(f'Finished loading source {self}')
     return df
Example #7
 def output(self):
     logger.debug(f'Outputting source {self.source.name} from outputter {self}')
     if self.preserve_original:
         df = deepcopy(self.source.df)
     else:
         df = self.source.df
     self.rename_columns(df)
     self.keep_necessary_cols(df)
     self.output_to_location(df)
     logger.debug(f'Finished outputting source {self.source.name} from outputter {self}')
Example #8
    def drop_variables(self, df: pd.DataFrame):
        if not self.source._vars_for_calculate:
            # Only need to drop if extra variables were loaded for calculations
            return

        drop_names = [var.name for var in self.source._vars_for_calculate]
        logger.debug(
            f'Dropping variables {drop_names} in df for {self.source.name} in loader {self}'
        )
        df.drop(drop_names, axis=1, inplace=True)
Example #9
 def refresh_columns_series(self):
     logger.debug(f'Refreshing columns series in source {self.name}')
     if self._df is None or self.columns is None:
         return
     for col in self.columns:
         if col.variable not in self.load_variables:
             continue
         if col.variable.name not in list(self._df.columns) + list(self._df.index.names):
             col.series = None
             continue
         series = self.get_series_for(var=col.variable)
         col.series = series
Example #10
 def _wipe_columns_series(self):
     logger.debug(f'Wiping columns series in source {self.name}')
     cols_attrs = [
         'columns',
         '_orig_columns',
         '_columns_for_calculate',
     ]
     for col_attr in cols_attrs:
         cols = getattr(self, col_attr)
         if cols is not None:
             for col in cols:
                 col.series = None
Example #11
 def duplicate_calculated_columns_if_necessary(self, df: pd.DataFrame):
     for var_key, (
             orig_var, new_var
     ) in self._calculated_variables_that_need_duplication.items():
         logger.debug(
             f'Duplicating column for original var {orig_var} new var '
             f'{new_var} for {self.source.name} in loader {self}')
         self.source._duplicate_column_for_calculation(
             df,
             orig_var=orig_var,
             new_var=new_var,
             pre_rename=False,
         )
Example #12
    def try_to_calculate_variables(self, df: pd.DataFrame):
        logger.debug(
            f'Trying to calculate variables for source {self.source.name} in loader {self}'
        )
        if not self.source.columns:
            return df

        # Assign df to the source so calculations can access the df and all columns through one object
        self.source.df = df

        for variable in self.source.load_variables:
            if variable.key in self.source.col_var_keys:
                # Variable already exists in the data, either from original source or previously calculated
                continue

            if variable.calculation is None:
                raise ValueError(
                    f'passed variable {variable} but it has no calculation and is not '
                    f'in columns {self.source.columns}')
            required_variables = variable.calculation.variables
            has_all_required_variables = True
            calc_with_cols = []
            for req_var in required_variables:
                if not has_all_required_variables:
                    break
                col = self.source.col_for(req_var)
                calc_with_cols.append(col)
                col_pre_applied_transform_keys = deepcopy(
                    col.applied_transform_keys)
                for transform in req_var.applied_transforms:
                    # Need to make sure all the same transforms have been applied to
                    # the column before the calculation
                    if transform.key in col_pre_applied_transform_keys:
                        col_pre_applied_transform_keys.remove(transform.key)
                    else:
                        has_all_required_variables = False
                        break

            if has_all_required_variables:
                # Actually do calculation
                new_series = variable.calculation.func(calc_with_cols)
                new_series.name = variable.name
                # TODO [#34]: determine how to set index for columns from calculated variables
                new_col = Column(variable,
                                 dtype=str(new_series.dtype),
                                 series=new_series)
                self.source.df[variable.name] = new_series
                self.source.columns.append(new_col)

        return self.source.df
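
The inner transform check is a multiset-containment test: every transform applied to the required variable must be consumable from the column's already-applied transforms. The same idea in isolation, with hypothetical transform keys:

from copy import deepcopy

def has_all_transforms(required_keys, applied_keys):
    remaining = deepcopy(applied_keys)
    for key in required_keys:
        if key in remaining:
            remaining.remove(key)  # consume one occurrence per requirement
        else:
            return False
    return True

print(has_all_transforms(['lag', 'lag'], ['lag']))        # False: needs two lags
print(has_all_transforms(['lag'], ['lag', 'winsorize']))  # True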
Example #13
    def _set_data_loader(self, data_loader_class: Type[DataLoader], pipeline: SourceCreatingPipeline = None,
                         **read_file_kwargs):
        logger.debug(f'Setting data loader for source {self.name}')
        run_pipeline = False
        if pipeline is not None:
            reason = LoadFromPipelineReason.PIPELINE_NEWER
            if self.data_exists_at_location:
                # Run the pipeline if a source in the pipeline that creates this
                # data source was modified more recently than this data source
                source_lm = self.last_modified
                pipeline_lm = self.pipeline_last_modified
                lm = most_recent_last_modified(source_lm, pipeline_lm)
                if lm is None:
                    run_pipeline = True
                elif lm == pipeline_lm and lm != source_lm:
                    run_pipeline = True

                if pipeline_lm is None:
                    reason = LoadFromPipelineReason.NO_LAST_MODIFIED_IN_PIPELINE
                elif source_lm is None:
                    reason = LoadFromPipelineReason.NO_DATA_AT_LOCATION
            else:
                # No location or no data at location, must run pipeline regardless of last modified
                run_pipeline = True

            if run_pipeline:
                # A prior source used to construct this data source has changed; need to re-run the pipeline
                report_load_from_pipeline_reason(self, pipeline, reason)

            # otherwise, don't need to worry about pipeline, continue handling

        loader = data_loader_class(self, read_file_kwargs, self.optimize_size)

        # If necessary, run pipeline before loading
        # Still necessary to use the loader, as it may be transforming the data
        if run_pipeline:
            def run_pipeline_then_load(pipeline: SourceCreatingPipeline):
                logger.info(f'Running pipeline then loading source {self.name}')
                pipeline.execute()  # outputs to file
                result = loader.load_from_existing_source(
                    pipeline.result,
                    preserve_original=not pipeline.allow_modifying_result
                )
                return result
            self.data_loader = partial(run_pipeline_then_load, self.pipeline)
        else:
            self.data_loader = loader.load_from_location
        logger.debug(f'Finished setting data loader {self.data_loader} for source {self.name}')
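
The staleness decision reduces to comparing two optional timestamps. A sketch of that rule, assuming `most_recent_last_modified` returns the later non-None timestamp (None when both are None):

import datetime
from typing import Optional

def most_recent(a: Optional[datetime.datetime],
                b: Optional[datetime.datetime]) -> Optional[datetime.datetime]:
    if a is None:
        return b
    if b is None:
        return a
    return max(a, b)

def should_run_pipeline(source_lm, pipeline_lm) -> bool:
    lm = most_recent(source_lm, pipeline_lm)
    if lm is None:
        return True  # no timestamps on either side, rerun to be safe
    # Rerun only when the pipeline is strictly newer than the stored data
    return lm == pipeline_lm and lm != source_lm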
Example #14
 def get_operation(self,
                   pipeline: 'DataPipeline',
                   *args,
                   include_pipeline_in_result: bool = False,
                   **kwargs) -> DataOperation:
     logger.debug(f'Getting operation from options {self}')
     all_kwargs = {}
     if self.result_kwargs is not None:
         all_kwargs.update(self.result_kwargs)
     all_kwargs.update(kwargs)
     return self.op_class(
         pipeline,
         *args,
         include_pipeline_in_result=include_pipeline_in_result,
         **all_kwargs)
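
Precedence here is standard dict layering: stored `result_kwargs` go in first and call-site `kwargs` last, so explicit arguments win. For example, with hypothetical keys:

all_kwargs = {}
all_kwargs.update({'out_path': 'stored.csv', 'sep': ','})  # result_kwargs
all_kwargs.update({'out_path': 'override.csv'})            # call-site kwargs
assert all_kwargs == {'out_path': 'override.csv', 'sep': ','}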
Example #15
 def assign_series_to_columns(self, df: pd.DataFrame):
     logger.debug(
         f'Assigning series to columns for source {self.source.name} in loader {self}'
     )
     if not self.source.columns:
         return
     for var in self.source.load_variables:
         if var.key not in self.source.col_var_keys:
             if var.calculation is None:
                 raise ValueError(
                     f'passed variable {var} but it has no calculation and is not '
                     f'in columns {self.source.columns}')
             continue
         col = self.source.col_for(var)
         series = self.source.get_series_for(var=var, df=df)
         col.series = series
Example #16
 def reset(self):
     """
     Undo any changes made through the options interface
     :return:
     """
     logger.debug(f"Resetting datacode options")
     for (klass, attr), orig_value in self._orig_class_attrs.items():
         if orig_value.attr_existed:
             setattr(
                 klass,
                 attr,
                 orig_value.value,
             )
         else:
             delattr(klass, attr)
     self._orig_class_attrs = {}
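
This is the save-and-restore pattern for patched class attributes, where the record must distinguish "attribute existed with an old value" from "attribute did not exist at all". A minimal sketch with a hypothetical record type:

from typing import Any, NamedTuple

class OrigValue(NamedTuple):
    attr_existed: bool
    value: Any

def patch_class_attr(store, klass, attr, new_value):
    if (klass, attr) not in store:  # record the original state only once
        store[(klass, attr)] = OrigValue(hasattr(klass, attr),
                                         getattr(klass, attr, None))
    setattr(klass, attr, new_value)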
Example #17
 def apply_calculations_transforms_and_drops(self, df: pd.DataFrame):
     logger.debug(
         f'Applying calculations, transforms, and drops for source {self.source.name} in loader {self}'
     )
     self.assign_series_to_columns(df)
     df = self.pre_calculate(df)
     df = self.try_to_calculate_variables(df)
     self.duplicate_calculated_columns_if_necessary(df)
     df = self.pre_transform(df)
     df = self.apply_transforms(df)
     df = self.post_transform(df)
     df = self.try_to_calculate_variables(df)
     self.assign_series_to_columns(df)
     self.drop_variables(df)
     df = self.post_load(df)
     logger.debug(f'Finished applying calculations, transforms, and drops '
                  f'for source {self.source.name} in loader {self}')
     return df
Example #18
    def execute(self, output: bool = True):
        self._pre_execute_hash_dict = self.hash_dict()
        logger.debug(f'Executing pipeline {self}')
        hooks.on_begin_execute_pipeline(self)
        while True:
            try:
                self.next_operation()
            except LastOperationFinishedException:
                break

        self.result = self.operations[-1].result

        if output:
            self.output()

        hooks.on_end_execute_pipeline(self)
        logger.debug(f'Finished executing pipeline {self}')
        return self.result
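
The operation loop is driven by a sentinel exception, the same shape as the iterator protocol's StopIteration: call the next step until the terminal exception, then break. Stripped to its skeleton:

class LastOperationFinishedException(Exception):
    pass

def run_all(next_operation):
    while True:
        try:
            next_operation()
        except LastOperationFinishedException:
            break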
Example #19
    def set_class_attr(self, class_name: str, attr: str,
                       value: Any) -> "DatacodeOptions":
        """
        Sets an attribute on a datacode class

        :param class_name: Name of a class in the main datacode namespace
        :param attr: Attribute to be updated on the class
        :param value: Value to set the attribute to
        :return: same options instance
        """
        import datacode as dc

        logger.debug(
            f"Setting datacode options for class attr {class_name}.{attr} to {value}"
        )

        klass = getattr(dc, class_name)
        self._set_class_attr(klass, attr, value)
        return self
Example #20
def model_str(structural_dict: Dict[Variable, Sequence[Variable]],
              measurement_dict: Dict[Variable, Sequence[Variable]],
              var_corr_groups: Sequence[Sequence[Variable]]) -> str:
    m_str = '# structural part\n'
    for y, x_vars in structural_dict.items():
        all_vars = [y, *x_vars]
        m_str += _vars_to_structural_str(all_vars)
        m_str += '\n'
    m_str += '\n# measurement part\n'
    for y, x_vars in measurement_dict.items():
        all_vars = [y, *x_vars]
        m_str += _vars_to_measurement_str(all_vars)
        m_str += '\n'
    m_str += '\n# correlations\n'
    for corr_group in var_corr_groups:
        m_str += _vars_to_correlated_str(corr_group)
        m_str += '\n'
    logger.debug(f'Created semopy model {m_str}')
    return m_str
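
Assuming the `_vars_to_*` helpers emit semopy's lavaan-style operators (`~` for structural, `=~` for measurement, `~~` for covariance), the generated model string would look roughly like this illustrative output:

# structural part
y ~ x1 + x2

# measurement part
eta =~ m1 + m2

# correlations
x1 ~~ x2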
Example #21
    def apply_transforms(self, df: pd.DataFrame) -> pd.DataFrame:
        if not self.source.columns:
            return df

        logger.debug(
            f'Applying transforms in {self.source.name} in loader {self}')
        # Assign df so transforms can access all columns and data through one object
        self.source.df = df

        for var in self.source.load_variables:
            if not var.applied_transforms:
                continue
            if var.key not in self.source.col_var_keys:
                if var.calculation is None:
                    raise ValueError(
                        f'passed variable {var} but it has no calculation and is not '
                        f'in columns {self.source.columns}')
                continue
            column = self.source.col_for(var)
            self.source = _apply_transforms_to_var(var, column, self.source)
        return self.source.df
Example #22
    def load_from_location(self) -> pd.DataFrame:
        """
        Used when df does not already exist in the source, loads it from location

        :return:
        """
        logger.debug(
            f'Loading source {self.source.name} from location {self.source.location} with {self}'
        )
        self.pre_read()
        df = self.read_file_into_df()
        df = self.post_read(df)
        logger.debug(
            f'Setting columns and index for source {self.source.name}')
        df = self.duplicate_columns_for_calculations_assign_series(df)
        self.rename_columns(df)
        df = self.post_rename(df)
        if self.optimize_size:
            df = self.optimize_df_size(df)
        self.set_df_index(df)
        logger.debug(
            f'Finished setting columns and index for source {self.source.name}'
        )
        df = self.apply_calculations_transforms_and_drops(df)

        return df
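
`load_from_location` is a template method: a fixed step order with overridable pre/post hooks. The skeleton, reduced to hypothetical minimal hooks:

import pandas as pd

class BaseLoader:
    def load(self) -> pd.DataFrame:
        self.pre_read()              # hook: prepare for reading
        df = self.read_file_into_df()
        df = self.post_read(df)      # hook: adjust the raw df
        return df

    def pre_read(self):
        pass

    def read_file_into_df(self) -> pd.DataFrame:
        raise NotImplementedError

    def post_read(self, df: pd.DataFrame) -> pd.DataFrame:
        return df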
Example #23
    def output(self):
        if self.result is None:
            return
        if isinstance(self.operations[-1], LoadOperation):
            # No reason to output if operation was load, there would be no change
            return
        if isinstance(self.result, AnalysisResult):
            if not self.operation_options[-1].can_output:
                return
            logger.debug(
                f'Outputting analysis result {self.result} from pipeline {self.name}'
            )
            self.operation_options[-1].analysis_output_func(
                self.result, self.operation_options[-1].out_path)
            return
        if not isinstance(self.result, DataSource):
            raise NotImplementedError(
                f'have not implemented pipeline output for type {type(self.result)}'
            )
        self.result.location = self.location
        if not self.location:
            return

        logger.debug(
            f'Outputting data source result {self.result} from pipeline {self.name}'
        )
        # By default, save calculated variables, unless user explicitly passes to not save them
        # Essentially setting the opposite default versus working directly with the DataSource since
        # usually DataSource calculations are done on loading and it is assumed if the pipeline result
        # is being saved at all then it is likely an expensive calculation which the user doesn't
        # want to repeat on every load
        if 'save_calculated' not in self.result.data_outputter_kwargs:
            extra_kwargs = dict(save_calculated=True)
        else:
            extra_kwargs = {}

        self.result.output(**extra_kwargs)
Example #24
    def _duplicate_column_for_calculation(self, df: pd.DataFrame, orig_var: Variable, new_var: Variable,
                                          pre_rename: bool = True):
        logger.debug(f'Duplicating column for calculation in source {self.name} for '
                     f'orig variable {orig_var}, new variable {new_var}')
        # should get column which already has data for this variable
        existing_col = self.col_for(orig_var)

        if pre_rename:
            existing_var_name = existing_col.load_key
        else:
            existing_var_name = orig_var.name

        col = deepcopy(existing_col)
        col.variable = new_var

        if pre_rename:
            new_key = str(uuid.uuid4())  # temporary key for this variable
            df[new_key] = deepcopy(df[existing_var_name])
            col.load_key = new_key
        else:
            df[new_var.name] = deepcopy(df[existing_var_name])

        self.columns.append(col)
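
Duplicating a column under a throwaway uuid key keeps the copy from colliding with real column names until the rename step runs. The core move in isolation:

import uuid
from copy import deepcopy

import pandas as pd

df = pd.DataFrame({'price': [1.0, 2.0]})
new_key = str(uuid.uuid4())           # temporary, collision-free column name
df[new_key] = deepcopy(df['price'])   # independent copy of the original series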
Example #25
    def execute(self):
        logger.debug(f'Checking whether {self} should be executed')
        if self._has_been_executed and not self.options.always_rerun:
            return

        logger.debug(f'Starting to execute {self}')
        hooks.on_begin_execute_operation(self)
        self._execute()
        self._has_been_executed = True
        hooks.on_end_execute_operation(self)
        logger.debug(f'Finished executing {self}')
Example #26
 def pipeline_last_modified(self) -> Optional[datetime.datetime]:
     logger.debug(f'Determining pipeline last modified for {self}')
     lm = _nested_most_recent_last_modified(None, self)
     logger.debug(f'Finished determining pipeline last modified for {self}')
     return lm
Example #27
 def touch(self):
     """
     Mark last_modified as now
     """
     logger.debug(f'Touching source {self.name}')
     self.last_modified = datetime.datetime.now()
Example #28
    def _create_operations(self, data_sources: DataSourcesOrPipelines,
                           options_list: List[OperationOptions]):
        logger.debug(f'Creating operations for pipeline {self.name}')
        force_rerun = any(op.always_rerun for op in options_list)

        if not force_rerun and self.result_is_cached:
            # Already have result with the same exact config from a prior run. Just load it
            if options_list[-1].op_class.num_required_sources == 0:
                orig_op = options_list[-1].get_operation(
                    self, options_list[-1], include_pipeline_in_result=True)
            elif options_list[-1].op_class.num_required_sources == 1:
                orig_op = options_list[-1].get_operation(
                    self, [data_sources[0]],
                    options_list[-1],
                    include_pipeline_in_result=True)
            elif options_list[-1].op_class.num_required_sources == 2:
                orig_op = options_list[-1].get_operation(
                    self,
                    data_sources,
                    options_list[-1],
                    include_pipeline_in_result=True)
            else:
                raise ValueError(
                    'DataPipeline cannot handle operations with more than two sources'
                )
            if isinstance(orig_op.result, DataSource):
                load_options = LoadOptions(
                    out_path=self.location,
                    allow_modifying_result=self.allow_modifying_result,
                    result_kwargs=options_list[-1].result_kwargs)
                load_operation = load_options.get_operation(
                    self, load_options, output_name=orig_op.output_name)
                return [load_operation]
            warnings.warn(
                f'No loading from file implemented for result type {type(orig_op.result)}, will always run pipeline'
            )

        if len(options_list) == 1:
            result_opts = {'include_pipeline_in_result': True}
        else:
            result_opts = {}

        if options_list[0].op_class.num_required_sources == 0:
            operations = [
                options_list[0].get_operation(self, options_list[0],
                                              **result_opts)
            ]
        elif options_list[0].op_class.num_required_sources == 1:
            operations = _get_operations_for_single(data_sources[0],
                                                    options_list[0], self,
                                                    **result_opts)
        elif options_list[0].op_class.num_required_sources == 2:
            operations = _get_operations_for_pair(data_sources[0],
                                                  data_sources[1],
                                                  options_list[0], self,
                                                  **result_opts)
        else:
            raise ValueError(
                'DataPipeline cannot handle operations with more than two sources'
            )

        if len(options_list) == 1:
            logger.debug(
                f'Created single operation for pipeline {self.name}: {operations[0]}'
            )
            return operations

        for i, options in enumerate(options_list[1:]):
            if i + 2 == len(options_list):
                # Include pipeline for last operation
                result_opts = {'include_pipeline_in_result': True}
            else:
                result_opts = {}

            if options.op_class.num_required_sources == 0:
                operations.append(
                    options.get_operation(self, options, **result_opts))
            elif options.op_class.num_required_sources == 1:
                operations += _get_operations_for_single(
                    operations[-1].result, options, self, **result_opts)
            elif options.op_class.num_required_sources == 2:
                operations += _get_operations_for_pair(operations[-1].result,
                                                       data_sources[i + 2],
                                                       options, self,
                                                       **result_opts)
            else:
                raise ValueError(
                    'DataPipeline cannot handle operations with more than two sources'
                )

        logger.debug(
            f'Created operations for pipeline {self.name}: {operations}')
        return operations
Example #29
 def output(self, **data_outputter_kwargs):
     logger.debug(f'Outputting source {self.name}')
     config_dict = deepcopy(self.data_outputter_kwargs)
     config_dict.update(**data_outputter_kwargs)
     outputter = self.outputter_class(self, **config_dict)
     outputter.output()
Example #30
 def last_modified(self) -> Optional[datetime.datetime]:
     logger.debug(f'Determining last_modified in pipeline {self.name}')
     lm = None
     for obj in self.operations:
         lm = most_recent_last_modified(lm, obj.last_modified)
     return lm