def reset(self, forward: bool = False):
    logger.debug(f'Resetting source {self.name}')
    del self._df
    self._df = None
    if forward:
        for item in self.forward_links:
            item.reset(forward=True)

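# A minimal standalone sketch (not datacode itself) of the cascading reset
# above: a hypothetical _Linked node caches a df and propagates invalidation
# through forward_links exactly as reset(forward=True) does.
class _Linked:
    def __init__(self) -> None:
        self._df = None
        self.forward_links: list['_Linked'] = []

    def reset(self, forward: bool = False) -> None:
        self._df = None  # drop the cached DataFrame
        if forward:
            for item in self.forward_links:
                item.reset(forward=True)

raw, merged = _Linked(), _Linked()
raw.forward_links.append(merged)
raw._df, merged._df = 'raw df', 'merged df'  # stand-in cached values
raw.reset(forward=True)
assert raw._df is None and merged._df is None  # both were invalidated
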
def _set_df_from_first_operation(self):
    logger.debug(f'Setting pipeline {self.name} df from first operation')
    # Need to check as this may be a generation pipeline, which would not have a df to start from
    if (hasattr(self.operations[0], 'data_sources')
            and self.operations[0].data_sources
            and self.operations[0].num_required_sources > 0):
        self.df = self.operations[0].data_sources[0].df

def rename_columns(self, df: pd.DataFrame):
    from datacode.models.source import NoColumnForVariableException

    if not self.source.columns:
        return

    logger.debug(f'Renaming columns in {self.source.name} in loader {self}')
    rename_dict = {}
    for variable in self.source._orig_load_variables:
        if variable.key not in self.source.col_var_keys:
            if variable.calculation is None:
                raise ValueError(
                    f'passed variable {variable} but not calculated and not '
                    f'in columns {self.source.columns}'
                )
            continue
        col = self.source.col_for(variable)
        rename_dict[col.load_key] = variable.name
        col.variable = variable
    for variable in self.source._vars_for_calculate:
        try:
            col = self.source.col_for(variable, for_calculate_only=True)
            rename_dict[col.load_key] = variable.name
            col.variable = variable
        except NoColumnForVariableException:
            # Must be using a pre-existing column rather than a newly generated column; rename that instead
            col = self.source.col_for(variable, orig_only=True)
            rename_dict[col.load_key] = variable.name
            col.variable = variable
    df.rename(columns=rename_dict, inplace=True)

def duplicate_columns_for_calculations_assign_series(self, df: pd.DataFrame) -> pd.DataFrame:
    # TODO [#39]: more efficient implementation of loading variables for calculations
    #
    # The `DataLoader` checks what variables are needed for calculations that are not
    # included in `load_variables`, and if it requires multiple transformations of
    # a variable, then it copies that series for as many transformations as are needed.
    # It would be better to have an implementation that doesn't require carrying copies
    # through everything.
    logger.debug(
        f'Duplicating columns for calculation in source {self.source.name} in loader {self}'
    )
    for col in self.source._columns_for_calculate:
        # Extra column is already in the source, but it still needs to be added to df
        self.source._create_series_in_df_for_calculation(df, col)
    if not self.source.load_variables:
        return df

    # Now check whether multiple transformations of a variable are needed; if so, also copy that
    # column so that it won't get used up by just one of the transformations/original variable
    unique_var_keys: Dict[str, Variable] = {}
    for var in self.source.load_variables:
        # The variable is needed multiple times, but not because of calculations (multiple transforms)
        if var.key in unique_var_keys and var not in self.source._vars_for_calculate:
            if var.calculation is not None:
                # This calculated variable needs to be duplicated, but it has not been calculated yet.
                # Therefore add it to a list of variables which need to be duplicated after calculation.
                if var.key not in self._calculated_variables_that_need_duplication:
                    self._calculated_variables_that_need_duplication[var.key] = (unique_var_keys[var.key], var)
                continue
            # Got a variable multiple times; duplicate the column.
            # Use the original variable for duplication as the column will already exist for that variable
            self.source._duplicate_column_for_calculation(
                df,
                orig_var=unique_var_keys[var.key],
                new_var=var,
            )
        else:
            # Not a repeated variable, just add it to the tracking dict
            unique_var_keys[var.key] = var

    # Reorder df to match the order of the passed load variables
    col_order: Dict[str, int] = {}
    for col in self.source.columns:
        if col.variable.key not in self.source.load_var_keys:
            # Must be an unloaded column, skip it
            continue
        if col.variable.calculation is not None:
            # Calculated variables won't be in the data yet, skip them
            continue
        order = self.source.load_var_keys.index(col.variable.key)
        col_order[col.load_key] = order
    order_tups = list(col_order.items())
    order_tups.sort(key=lambda x: x[1])
    col_keys = [key for key, order in order_tups]
    df = df[col_keys]

    return df

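# Standalone sketch of the reordering step above, using plain pandas and
# hypothetical load keys; the real method derives the order from
# source.load_var_keys rather than a hard-coded list.
import pandas as pd

df = pd.DataFrame({'c': [3], 'a': [1], 'b': [2]})
load_var_keys = ['a', 'b', 'c']  # requested variable order

col_order = {key: load_var_keys.index(key) for key in df.columns if key in load_var_keys}
ordered_keys = [key for key, _ in sorted(col_order.items(), key=lambda x: x[1])]
df = df[ordered_keys]
assert list(df.columns) == ['a', 'b', 'c']  # columns now match the requested order
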
def pipeline_obj_last_modified(
    self
) -> Tuple['LinkedLastModifiedItem', Optional[datetime.datetime]]:
    logger.debug(f'Determining pipeline object last modified for {self}')
    lm = _nested_most_recent_obj_last_modified(None, self, self)
    logger.debug(f'Finished determining pipeline object last modified for {self}')
    return lm

def _load(self):
    logger.debug(f'Started loading source {self}')
    hooks.on_begin_load_source(self)
    if not hasattr(self, 'data_loader'):
        self._set_data_loader(self.loader_class, pipeline=self.pipeline, **self.read_file_kwargs)
    df = self.data_loader()
    df = hooks.on_end_load_source(self, df)
    logger.debug(f'Finished loading source {self}')
    return df

def output(self):
    logger.debug(f'Outputting source {self.source.name} from outputter {self}')
    if self.preserve_original:
        df = deepcopy(self.source.df)
    else:
        df = self.source.df
    self.rename_columns(df)
    self.keep_necessary_cols(df)
    self.output_to_location(df)
    logger.debug(f'Finished outputting source {self.source.name} from outputter {self}')

def drop_variables(self, df: pd.DataFrame):
    if not self.source._vars_for_calculate:
        # Only need to drop if extra variables were loaded for calculations
        return

    drop_names = [var.name for var in self.source._vars_for_calculate]
    logger.debug(f'Dropping variables {drop_names} in df for {self.source.name} in loader {self}')
    df.drop(drop_names, axis=1, inplace=True)

def refresh_columns_series(self):
    logger.debug(f'Refreshing columns series in source {self.name}')
    if self._df is None or self.columns is None:
        return
    for col in self.columns:
        if col.variable not in self.load_variables:
            continue
        if col.variable.name not in list(self._df.columns) + list(self._df.index.names):
            col.series = None
            continue
        series = self.get_series_for(var=col.variable)
        col.series = series

def _wipe_columns_series(self):
    logger.debug(f'Wiping columns series in source {self.name}')
    cols_attrs = [
        'columns',
        '_orig_columns',
        '_columns_for_calculate',
    ]
    for col_attr in cols_attrs:
        cols = getattr(self, col_attr)
        if cols is not None:
            for col in cols:
                col.series = None

def duplicate_calculated_columns_if_necessary(self, df: pd.DataFrame):
    for var_key, (orig_var, new_var) in self._calculated_variables_that_need_duplication.items():
        logger.debug(
            f'Duplicating column for original var {orig_var} new var '
            f'{new_var} for {self.source.name} in loader {self}'
        )
        self.source._duplicate_column_for_calculation(
            df,
            orig_var=orig_var,
            new_var=new_var,
            pre_rename=False,
        )

def try_to_calculate_variables(self, df: pd.DataFrame):
    logger.debug(f'Trying to calculate variables for source {self.source.name} in loader {self}')
    if not self.source.columns:
        return df

    # Create a temporary source so that transforms can access the df and all columns through one object
    self.source.df = df
    for variable in self.source.load_variables:
        if variable.key in self.source.col_var_keys:
            # Variable already exists in the data, either from the original source or previously calculated
            continue
        if variable.calculation is None:
            raise ValueError(
                f'passed variable {variable} but not calculated and not '
                f'in columns {self.source.columns}'
            )
        required_variables = variable.calculation.variables
        has_all_required_variables = True
        calc_with_cols = []
        for req_var in required_variables:
            if not has_all_required_variables:
                break
            col = self.source.col_for(req_var)
            calc_with_cols.append(col)
            col_pre_applied_transform_keys = deepcopy(col.applied_transform_keys)
            for transform in req_var.applied_transforms:
                # Need to make sure all the same transforms have been applied to
                # the column before the calculation
                if transform.key in col_pre_applied_transform_keys:
                    col_pre_applied_transform_keys.remove(transform.key)
                else:
                    has_all_required_variables = False
                    break
        if has_all_required_variables:
            # Actually do the calculation
            new_series = variable.calculation.func(calc_with_cols)
            new_series.name = variable.name
            # TODO [#34]: determine how to set index for columns from calculated variables
            new_col = Column(variable, dtype=str(new_series.dtype), series=new_series)
            self.source.df[variable.name] = new_series
            self.source.columns.append(new_col)
    return self.source.df

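# Standalone sketch of the calculation step above (hypothetical variable and
# data): the real variable.calculation.func receives the list of Column
# objects for the required variables; here a stand-in takes their underlying
# series directly.
import pandas as pd

df = pd.DataFrame({'Price': [10.0, 12.0], 'Shares': [100, 50]})

def market_cap(series_list):  # stand-in for variable.calculation.func
    return series_list[0] * series_list[1]

new_series = market_cap([df['Price'], df['Shares']])
new_series.name = 'Market Cap'  # the calculated variable's name
df[new_series.name] = new_series  # mirrors self.source.df[variable.name] = new_series
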
def _set_data_loader(self, data_loader_class: Type[DataLoader],
                     pipeline: SourceCreatingPipeline = None,
                     **read_file_kwargs):
    logger.debug(f'Setting data loader for source {self.name}')
    run_pipeline = False
    if pipeline is not None:
        reason = LoadFromPipelineReason.PIPELINE_NEWER
        if self.data_exists_at_location:
            # Check whether a source in the pipeline that creates this data source was modified
            # more recently than this data source
            source_lm = self.last_modified
            pipeline_lm = self.pipeline_last_modified
            lm = most_recent_last_modified(source_lm, pipeline_lm)
            if lm is None:
                run_pipeline = True
            elif lm == pipeline_lm and lm != source_lm:
                run_pipeline = True
            if pipeline_lm is None:
                reason = LoadFromPipelineReason.NO_LAST_MODIFIED_IN_PIPELINE
            elif source_lm is None:
                reason = LoadFromPipelineReason.NO_DATA_AT_LOCATION
        else:
            # No location or no data at the location; must run the pipeline regardless of last modified
            run_pipeline = True
        if run_pipeline:
            # A prior source used to construct this data source has changed; need to re-run the pipeline
            report_load_from_pipeline_reason(self, pipeline, reason)
        # Otherwise, don't need to worry about the pipeline; continue handling

    loader = data_loader_class(self, read_file_kwargs, self.optimize_size)

    # If necessary, run the pipeline before loading.
    # The loader is still needed as it may be transforming the data
    if run_pipeline:
        def run_pipeline_then_load(pipeline: SourceCreatingPipeline):
            logger.info(f'Running pipeline then loading source {self.name}')
            pipeline.execute()  # outputs to file
            result = loader.load_from_existing_source(
                pipeline.result,
                preserve_original=not pipeline.allow_modifying_result
            )
            return result

        self.data_loader = partial(run_pipeline_then_load, self.pipeline)
    else:
        self.data_loader = loader.load_from_location

    logger.debug(f'Finished setting data loader {self.data_loader} for source {self.name}')

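# Standalone sketch of the freshness check above with plain datetimes
# (hypothetical values); max-of-non-None is a plausible stand-in for
# most_recent_last_modified, which is not shown here.
import datetime

source_lm = datetime.datetime(2024, 1, 1)    # when the stored data was written
pipeline_lm = datetime.datetime(2024, 6, 1)  # when a pipeline input last changed

lm = max(filter(None, [source_lm, pipeline_lm]), default=None)
run_pipeline = lm is None or (lm == pipeline_lm and lm != source_lm)
assert run_pipeline  # the pipeline inputs are newer, so it must re-run
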
def get_operation(self, pipeline: 'DataPipeline', *args,
                  include_pipeline_in_result: bool = False,
                  **kwargs) -> DataOperation:
    logger.debug(f'Getting operation from options {self}')
    all_kwargs = {}
    if self.result_kwargs is not None:
        all_kwargs.update(self.result_kwargs)
    all_kwargs.update(kwargs)
    return self.op_class(
        pipeline,
        *args,
        include_pipeline_in_result=include_pipeline_in_result,
        **all_kwargs
    )

def assign_series_to_columns(self, df: pd.DataFrame):
    logger.debug(f'Assigning series to columns for source {self.source.name} in loader {self}')
    if not self.source.columns:
        return
    for var in self.source.load_variables:
        if var.key not in self.source.col_var_keys:
            if var.calculation is None:
                raise ValueError(
                    f'passed variable {var} but not calculated and not '
                    f'in columns {self.source.columns}'
                )
            continue
        col = self.source.col_for(var)
        series = self.source.get_series_for(var=var, df=df)
        col.series = series

def reset(self):
    """
    Undo any changes made through the options interface

    :return:
    """
    logger.debug('Resetting datacode options')
    for (klass, attr), orig_value in self._orig_class_attrs.items():
        if orig_value.attr_existed:
            setattr(klass, attr, orig_value.value)
        else:
            delattr(klass, attr)
    self._orig_class_attrs = {}

def apply_calculations_transforms_and_drops(self, df: pd.DataFrame):
    logger.debug(
        f'Applying calculations, transforms, and drops for source {self.source.name} in loader {self}'
    )
    self.assign_series_to_columns(df)
    df = self.pre_calculate(df)
    df = self.try_to_calculate_variables(df)
    self.duplicate_calculated_columns_if_necessary(df)
    df = self.pre_transform(df)
    df = self.apply_transforms(df)
    df = self.post_transform(df)
    df = self.try_to_calculate_variables(df)
    self.assign_series_to_columns(df)
    self.drop_variables(df)
    df = self.post_load(df)
    logger.debug(
        f'Finished applying calculations, transforms, and drops '
        f'for source {self.source.name} in loader {self}'
    )
    return df

def execute(self, output: bool = True):
    self._pre_execute_hash_dict = self.hash_dict()
    logger.debug(f'Executing pipeline {self}')
    hooks.on_begin_execute_pipeline(self)
    while True:
        try:
            self.next_operation()
        except LastOperationFinishedException:
            break
    self.result = self.operations[-1].result
    if output:
        self.output()
    hooks.on_end_execute_pipeline(self)
    logger.debug(f'Finished executing pipeline {self}')
    return self.result

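# Usage sketch for execute above (pipeline construction elided; any
# DataPipeline instance works the same way):
#
#     result = pipeline.execute()              # runs all operations, then outputs
#     result = pipeline.execute(output=False)  # runs without writing the result out
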
def set_class_attr(self, class_name: str, attr: str, value: Any) -> "DatacodeOptions":
    """
    Sets an attribute on a datacode class

    :param class_name: Name of a class in the main datacode namespace
    :param attr: Attribute to be updated on the class
    :param value: Value to set the attribute to
    :return: same options instance
    """
    import datacode as dc

    logger.debug(f"Setting datacode options for class attr {class_name}.{attr} to {value}")
    klass = getattr(dc, class_name)
    self._set_class_attr(klass, attr, value)
    return self

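# Usage sketch combining set_class_attr and reset above; the class name,
# attribute, and value here are hypothetical stand-ins, and `options` is
# assumed to be a DatacodeOptions instance.
#
#     options.set_class_attr('DataSource', 'copy_keys', ['location'])
#     ...  # run code with the patched class attribute
#     options.reset()  # restores or removes the attribute as appropriate
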
def model_str(structural_dict: Dict[Variable, Sequence[Variable]],
              measurement_dict: Dict[Variable, Sequence[Variable]],
              var_corr_groups: Sequence[Sequence[Variable]]) -> str:
    m_str = '# structural part\n'
    for y, x_vars in structural_dict.items():
        all_vars = [y, *x_vars]
        m_str += _vars_to_structural_str(all_vars)
        m_str += '\n'
    m_str += '\n# measurement part\n'
    for y, x_vars in measurement_dict.items():
        all_vars = [y, *x_vars]
        m_str += _vars_to_measurement_str(all_vars)
        m_str += '\n'
    m_str += '\n# correlations\n'
    for corr_group in var_corr_groups:
        m_str += _vars_to_correlated_str(corr_group)
        m_str += '\n'
    logger.debug(f'Created semopy model {m_str}')
    return m_str

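# Illustrative expected output, assuming the _vars_to_* helpers (not shown
# here) emit the usual semopy operators: ~ for structural equations, =~ for
# measurement of latents, ~~ for correlated pairs. Variable names are
# hypothetical.
#
#     # structural part
#     y ~ x1 + x2
#
#     # measurement part
#     latent =~ ind1 + ind2
#
#     # correlations
#     x1 ~~ x2
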
def apply_transforms(self, df: pd.DataFrame) -> pd.DataFrame:
    if not self.source.columns:
        return df

    logger.debug(f'Applying transforms in {self.source.name} in loader {self}')
    # Assign df so transforms can access all columns and data through one object
    self.source.df = df
    for var in self.source.load_variables:
        if not var.applied_transforms:
            continue
        if var.key not in self.source.col_var_keys:
            if var.calculation is None:
                raise ValueError(
                    f'passed variable {var} but not calculated and not '
                    f'in columns {self.source.columns}'
                )
            continue
        column = self.source.col_for(var)
        self.source = _apply_transforms_to_var(var, column, self.source)
    return self.source.df

def load_from_location(self) -> pd.DataFrame:
    """
    Used when the df does not already exist in the source; loads it from the location

    :return:
    """
    logger.debug(
        f'Loading source {self.source.name} from location {self.source.location} with {self}'
    )
    self.pre_read()
    df = self.read_file_into_df()
    df = self.post_read(df)
    logger.debug(f'Setting columns and index for source {self.source.name}')
    df = self.duplicate_columns_for_calculations_assign_series(df)
    self.rename_columns(df)
    df = self.post_rename(df)
    if self.optimize_size:
        df = self.optimize_df_size(df)
    self.set_df_index(df)
    logger.debug(f'Finished setting columns and index for source {self.source.name}')
    df = self.apply_calculations_transforms_and_drops(df)
    return df

def output(self):
    if self.result is None:
        return
    if isinstance(self.operations[-1], LoadOperation):
        # No reason to output if the operation was a load; there would be no change
        return
    if isinstance(self.result, AnalysisResult):
        if not self.operation_options[-1].can_output:
            return
        logger.debug(f'Outputting analysis result {self.result} from pipeline {self.name}')
        self.operation_options[-1].analysis_output_func(
            self.result, self.operation_options[-1].out_path
        )
        return
    if not isinstance(self.result, DataSource):
        raise NotImplementedError(
            f'have not implemented pipeline output for type {type(self.result)}'
        )

    self.result.location = self.location
    if not self.location:
        return

    logger.debug(f'Outputting data source result {self.result} from pipeline {self.name}')
    # By default, save calculated variables, unless the user explicitly passes to not save them.
    # This is essentially the opposite default versus working directly with the DataSource:
    # usually DataSource calculations are done on loading, and it is assumed that if the pipeline
    # result is being saved at all, it is likely an expensive calculation which the user doesn't
    # want to repeat on every load
    if 'save_calculated' not in self.result.data_outputter_kwargs:
        extra_kwargs = dict(save_calculated=True)
    else:
        extra_kwargs = {}
    self.result.output(**extra_kwargs)

def _duplicate_column_for_calculation(self, df: pd.DataFrame, orig_var: Variable,
                                      new_var: Variable, pre_rename: bool = True):
    logger.debug(
        f'Duplicating column for calculation in source {self.name} for '
        f'orig variable {orig_var}, new variable {new_var}'
    )
    # Should get the column which already has data for this variable
    existing_col = self.col_for(orig_var)
    if pre_rename:
        existing_var_name = existing_col.load_key
    else:
        existing_var_name = orig_var.name

    col = deepcopy(existing_col)
    col.variable = new_var
    if pre_rename:
        new_key = str(uuid.uuid4())  # temporary key for this variable
        df[new_key] = deepcopy(df[existing_var_name])
        col.load_key = new_key
    else:
        df[new_var.name] = deepcopy(df[existing_var_name])
    self.columns.append(col)

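# Standalone pandas sketch of the pre-rename duplication path above
# (hypothetical column name): the copy is stored under a throwaway uuid key
# so that renaming one copy to the variable's final name cannot consume the
# other copy.
import uuid
from copy import deepcopy

import pandas as pd

df = pd.DataFrame({'price_raw': [1.0, 2.0]})
new_key = str(uuid.uuid4())               # temporary key, replaced on rename
df[new_key] = deepcopy(df['price_raw'])   # independent copy of the series
assert len(df.columns) == 2
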
def execute(self):
    logger.debug(f'Checking whether {self} should be executed')
    if self._has_been_executed and not self.options.always_rerun:
        return

    logger.debug(f'Starting to execute {self}')
    hooks.on_begin_execute_operation(self)
    self._execute()
    self._has_been_executed = True
    hooks.on_end_execute_operation(self)
    logger.debug(f'Finished executing {self}')

def pipeline_last_modified(self) -> Optional[datetime.datetime]:
    logger.debug(f'Determining pipeline last modified for {self}')
    lm = _nested_most_recent_last_modified(None, self)
    logger.debug(f'Finished determining pipeline last modified for {self}')
    return lm

def touch(self):
    """
    Mark last_modified as now
    """
    logger.debug(f'Touching source {self.name}')
    self.last_modified = datetime.datetime.now()

def _create_operations(self, data_sources: DataSourcesOrPipelines,
                       options_list: List[OperationOptions]):
    logger.debug(f'Creating operations for pipeline {self.name}')
    force_rerun = any([op.always_rerun for op in options_list])
    if not force_rerun and self.result_is_cached:
        # Already have a result with the same exact config from a prior run. Just load it
        if options_list[-1].op_class.num_required_sources == 0:
            orig_op = options_list[-1].get_operation(
                self, options_list[-1], include_pipeline_in_result=True
            )
        elif options_list[-1].op_class.num_required_sources == 1:
            orig_op = options_list[-1].get_operation(
                self, [data_sources[0]], options_list[-1], include_pipeline_in_result=True
            )
        elif options_list[-1].op_class.num_required_sources == 2:
            orig_op = options_list[-1].get_operation(
                self, data_sources, options_list[-1], include_pipeline_in_result=True
            )
        else:
            raise ValueError('DataPipeline cannot handle operations with more than two sources')
        if isinstance(orig_op.result, DataSource):
            load_options = LoadOptions(
                out_path=self.location,
                allow_modifying_result=self.allow_modifying_result,
                result_kwargs=options_list[-1].result_kwargs
            )
            load_operation = load_options.get_operation(
                self, load_options, output_name=orig_op.output_name
            )
            return [load_operation]
        warnings.warn(
            f'No loading from file implemented for result type {type(orig_op.result)}, will always run pipeline'
        )

    if len(options_list) == 1:
        result_opts = {'include_pipeline_in_result': True}
    else:
        result_opts = {}

    if options_list[0].op_class.num_required_sources == 0:
        operations = [options_list[0].get_operation(self, options_list[0], **result_opts)]
    elif options_list[0].op_class.num_required_sources == 1:
        operations = _get_operations_for_single(data_sources[0], options_list[0], self, **result_opts)
    elif options_list[0].op_class.num_required_sources == 2:
        operations = _get_operations_for_pair(
            data_sources[0], data_sources[1], options_list[0], self, **result_opts
        )
    else:
        raise ValueError('DataPipeline cannot handle operations with more than two sources')

    if len(options_list) == 1:
        logger.debug(f'Created single operation for pipeline {self.name}: {operations[0]}')
        return operations

    for i, options in enumerate(options_list[1:]):
        if i + 2 == len(options_list):
            # Include the pipeline for the last operation
            result_opts = {'include_pipeline_in_result': True}
        else:
            result_opts = {}
        if options.op_class.num_required_sources == 0:
            operations.append(options.get_operation(self, options, **result_opts))
        elif options.op_class.num_required_sources == 1:
            operations += _get_operations_for_single(
                operations[-1].result, options, self, **result_opts
            )
        elif options.op_class.num_required_sources == 2:
            operations += _get_operations_for_pair(
                operations[-1].result, data_sources[i + 2], options, self, **result_opts
            )
        else:
            raise ValueError('DataPipeline cannot handle operations with more than two sources')

    logger.debug(f'Created operations for pipeline {self.name}: {operations}')
    return operations

def output(self, **data_outputter_kwargs):
    logger.debug(f'Outputting source {self.name}')
    config_dict = deepcopy(self.data_outputter_kwargs)
    config_dict.update(**data_outputter_kwargs)
    outputter = self.outputter_class(self, **config_dict)
    outputter.output()

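# Usage sketch for output above (hypothetical source): per-call kwargs are
# merged over a copy of the stored data_outputter_kwargs, so a single call
# can override the defaults without mutating them.
#
#     source.output(save_calculated=True)  # same kwarg the pipeline output sets by default
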
def last_modified(self) -> Optional[datetime.datetime]:
    logger.debug(f'Determining last_modified in pipeline {self.name}')
    lm = None
    for obj in self.operations:
        lm = most_recent_last_modified(lm, obj.last_modified)
    return lm