def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
    """
    Pick the columns of ``inputs`` that should be fitted.

    Args:
        inputs: Container DataFrame
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        tuple: ``(selected_inputs, column_indices)``. When
        ``hyperparams['use_semantic_types']`` is falsy, ``inputs`` is returned
        untouched together with the index of every column. Otherwise the
        columns chosen by ``base_utils.get_columns_to_use`` are sliced out of
        ``inputs`` with ``iloc``.
    """
    if hyperparams['use_semantic_types']:
        metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            # Per-column decision is delegated to the class-level predicate.
            return cls._can_produce_column(metadata, column_index, hyperparams)

        # The second element (columns that were rejected) is not needed here.
        selected, _ = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )
        return inputs.iloc[:, selected], selected

    # Semantic types are ignored: every column is used as-is.
    return inputs, list(range(len(inputs.columns)))
def _get_columns(self, inputs_metadata: metadata_base.DataMetadata, type_to_cast: type) -> typing.Sequence[int]:
    """
    Return the indices of the columns that should be cast to ``type_to_cast``.

    Candidate columns come from ``base_utils.get_columns_to_use``, driven by
    the ``use_columns`` / ``exclude_columns`` hyper-parameters and by
    ``self._can_use_column``.

    Raises:
        ValueError: if no column at all can be used.
    """
    # Adapted from:
    # https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/cast_to_type.py

    def can_use_column(column_index: int) -> bool:
        return self._can_use_column(inputs_metadata, column_index, type_to_cast)

    usable, skipped = base_utils.get_columns_to_use(
        inputs_metadata,
        self.hyperparams['use_columns'],
        self.hyperparams['exclude_columns'],
        can_use_column,
    )

    if not usable:
        raise ValueError("No columns to be cast to type '{type}'.".format(type=type_to_cast))

    # Ideally every column could be cast, not only the requested ones, so a
    # warning is emitted whenever some columns had to be left out.
    if skipped:
        self.logger.warning(
            "Not all columns can be cast to type '%(type)s'. Skipping columns: %(columns)s",
            {'type': type_to_cast, 'columns': skipped},
        )

    return usable
def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
    """
    Return the indices of the columns this primitive should read.

    Selection is delegated to ``base_utils.get_columns_to_use`` using the
    ``use_columns`` / ``exclude_columns`` hyper-parameters and
    ``self._can_use_column`` as the per-column predicate.
    """
    selected, rejected = base_utils.get_columns_to_use(
        inputs_metadata,
        self.hyperparams["use_columns"],
        self.hyperparams["exclude_columns"],
        lambda column_index: self._can_use_column(inputs_metadata, column_index),
    )

    # An empty selection is acceptable at this point;
    # "base_utils.combine_columns" will throw an error if it cannot work with this.
    explicitly_requested = self.hyperparams["use_columns"]
    if explicitly_requested and rejected:
        self.logger.warning(
            "Not all specified columns contain filenames for supported media types. Skipping columns: %(columns)s",
            {"columns": rejected},
        )

    return selected
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
    """
    Select the target columns of ``data``.

    Returns:
        tuple: ``(targets, target_column_names, target_column_indices)``.
        When ``hyperparams['use_semantic_types']`` is falsy, ``data`` itself,
        all of its column labels and all of its column indices are returned.
        Otherwise only columns whose metadata carries the ``TrueTarget``
        semantic type are eligible; ``targets`` is an empty list when no
        column is selected.
    """
    if not hyperparams['use_semantic_types']:
        return data, list(data.columns), list(range(len(data.columns)))

    metadata = data.metadata
    required_types = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}

    def can_produce_column(column_index: int) -> bool:
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))

        if not semantic_types:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Usable only when every required semantic type is present.
        return required_types.issubset(semantic_types)

    # The list of rejected columns is not needed here.
    target_column_indices, _ = base_utils.get_columns_to_use(
        metadata,
        use_columns=hyperparams['use_outputs_columns'],
        exclude_columns=hyperparams['exclude_outputs_columns'],
        can_use_column=can_produce_column,
    )

    targets = data.select_columns(target_column_indices) if target_column_indices else []
    target_column_names = [data.columns[idx] for idx in target_column_indices]
    return targets, target_column_names, target_column_indices
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
    """
    Choose which columns of ``inputs`` take part in fitting.

    Args:
        inputs: Container DataFrame
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        tuple: ``(selected_inputs, column_indices)``. With
        ``use_semantic_types`` disabled this is ``inputs`` unchanged plus the
        index of every column; otherwise it is the ``iloc`` slice of the
        columns returned by ``base_utils.get_columns_to_use`` plus those
        indices.
    """
    if hyperparams['use_semantic_types']:
        metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(metadata, column_index, hyperparams)

        chosen, _ = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )

        # NOTE: encountered error - when hyperparams['use_columns'] = (2,3) and
        # hyperparams['exclude_columns'] is (1,2), columns_to_produce is still [2].
        # NOTE(review): presumably "exclude_columns" is ignored by
        # get_columns_to_use whenever "use_columns" is set - confirm against d3m.
        return inputs.iloc[:, chosen], chosen

    return inputs, list(range(len(inputs.columns)))
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
    """
    Select columns to fit.

    Column selection is index-based only: it is driven by the
    ``use_columns`` and ``exclude_columns`` hyper-parameters.

    Args:
        inputs: Container DataFrame
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        tuple: ``(selected_inputs, column_indices)``. When
        ``hyperparams['use_semantic_types']`` is falsy, ``inputs`` is returned
        unchanged together with the index of every column. Otherwise the
        columns returned by ``base_utils.get_columns_to_use`` are sliced out
        of ``inputs`` with ``iloc``.
    """
    if not hyperparams['use_semantic_types']:
        return inputs, list(range(len(inputs.columns)))

    inputs_metadata = inputs.metadata

    def can_produce_column(column_index: int) -> bool:
        return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

    # Only the usable columns matter here; the rejected ones are discarded.
    columns_to_produce, _ = base_utils.get_columns_to_use(
        inputs_metadata,
        use_columns=hyperparams['use_columns'],
        exclude_columns=hyperparams['exclude_columns'],
        can_use_column=can_produce_column,
    )
    return inputs.iloc[:, columns_to_produce], columns_to_produce
def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
    """
    Return the indices of the columns to operate on.

    Originally from d3m.primitives.schema_discovery.profiler.Common.
    """

    def can_use_column(column_index: int) -> bool:
        if not self.hyperparams['overwrite']:
            return self._can_use_column(inputs_metadata, column_index)
        # With "overwrite" enabled, detection runs on every column.
        return True

    selected, rejected = base_utils.get_columns_to_use(
        inputs_metadata,
        self.hyperparams['use_columns'],
        self.hyperparams['exclude_columns'],
        can_use_column,
    )

    # Ending up with no columns is fine here;
    # "base_utils.combine_columns" will throw an error if it cannot work with this.
    explicitly_requested = self.hyperparams['use_columns']
    if explicitly_requested and rejected:
        self.logger.warning(
            "Not all specified columns can parsed. Skipping columns: %(columns)s",
            {'columns': rejected},
        )

    return selected
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
    """
    Select the columns to fit, minus the columns flagged for dropping.

    Args:
        inputs: Container DataFrame
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        tuple: ``(selected_inputs, columns_to_produce, columns_to_drop)``
        where ``columns_to_produce`` no longer contains any index listed in
        ``columns_to_drop`` and ``selected_inputs`` is the matching ``iloc``
        slice of ``inputs``. ``columns_to_drop`` is returned exactly as
        ``cls._get_columns_to_drop`` produced it.
    """
    if not hyperparams['use_semantic_types']:
        columns_to_produce = list(range(len(inputs.columns)))
    else:
        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, _ = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )

    columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams)

    # Filter in a single pass instead of calling list.remove() per column:
    # that was quadratic and raised ValueError whenever a dropped index was
    # not part of the current selection.
    dropped = set(columns_to_drop)
    columns_to_produce = [col for col in columns_to_produce if col not in dropped]

    return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop
def _get_columns(cls, inputs_metadata: metadata_base.DataMetadata, hyperparams: hyperparams.Hyperparams) -> typing.Sequence[int]:
    """
    Return the indices of the columns to use.

    Selection is delegated to ``d3m_utils.get_columns_to_use`` with the
    ``use_columns`` / ``exclude_columns`` hyper-parameters and
    ``cls._can_use_column`` as the per-column predicate. The columns that
    were rejected are discarded without any warning.
    """
    selected, _ = d3m_utils.get_columns_to_use(
        inputs_metadata,
        hyperparams['use_columns'],
        hyperparams['exclude_columns'],
        lambda column_index: cls._can_use_column(inputs_metadata, column_index),
    )
    return selected
def _get_columns_to_fit(cls, inputs: Input, hyperparams: UEncHyperparameter):
    """
    Pick the columns of ``inputs`` that should be fitted.

    Returns:
        tuple: ``(selected_inputs, column_indices)``. With
        ``use_semantic_types`` disabled, ``inputs`` is returned unchanged
        along with the index of every column; otherwise the columns returned
        by ``utils.get_columns_to_use`` are sliced out with ``iloc``.
    """
    if hyperparams['use_semantic_types']:
        metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(metadata, column_index, hyperparams)

        # Rejected columns (second element) are not used by this method.
        chosen, _ = utils.get_columns_to_use(
            metadata=metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )
        return inputs.iloc[:, chosen], chosen

    return inputs, list(range(len(inputs.columns)))
def get_columns_of_type(df, semantic_types):
    """
    Return the columns of ``df`` whose metadata matches ``semantic_types``.

    Matching column indices come from
    ``df.metadata.list_columns_with_semantic_types``; the result is built
    with ``df.select_columns``.

    Raises:
        ValueError: if no column is selected.
    """
    matching = df.metadata.list_columns_with_semantic_types(semantic_types)

    # Positional arguments are: metadata, use_columns, exclude_columns and the
    # per-column predicate. Empty lists are passed where a primitive would
    # normally pass hyperparams['use_columns'] / hyperparams['exclude_columns'].
    columns_to_use, _ = base_utils.get_columns_to_use(
        df.metadata,
        [],
        [],
        lambda column_index: column_index in matching,
    )

    if columns_to_use:
        return df.select_columns(columns_to_use)

    raise ValueError(
        "Input data has no columns matching semantic types: {semantic_types}".format(
            semantic_types=semantic_types,
        )
    )
def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]:
    """
    Return the indices of the outputs columns to use.

    Selection is delegated to ``base_utils.get_columns_to_use`` with the
    ``use_outputs_columns`` / ``exclude_outputs_columns`` hyper-parameters
    and ``self._can_use_outputs_column`` as the per-column predicate.

    Raises:
        ValueError: if no column is selected and the
            ``error_on_no_columns`` hyper-parameter is truthy (otherwise a
            warning is logged instead).
    """
    selected, rejected = base_utils.get_columns_to_use(
        outputs_metadata,
        self.hyperparams['use_outputs_columns'],
        self.hyperparams['exclude_outputs_columns'],
        lambda column_index: self._can_use_outputs_column(outputs_metadata, column_index),
    )

    if not selected:
        if not self.hyperparams['error_on_no_columns']:
            self.logger.warning("No outputs columns.")
        else:
            raise ValueError("No outputs columns.")

    # Warn only for an explicit request that was partially satisfied.
    explicitly_requested = self.hyperparams['use_outputs_columns']
    if explicitly_requested and selected and rejected:
        self.logger.warning(
            "Not all specified outputs columns can be used. Skipping columns: %(columns)s",
            {'columns': rejected},
        )

    return selected
def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
    """
    Return the indices of the columns to operate on.

    Selection is delegated to ``base_utils.get_columns_to_use`` with the
    ``use_columns`` / ``exclude_columns`` hyper-parameters and
    ``self._can_use_column`` as the per-column predicate. A warning is logged
    when ``use_columns`` is set and some columns were rejected.
    """
    selected, rejected = base_utils.get_columns_to_use(
        inputs_metadata,
        self.hyperparams['use_columns'],
        self.hyperparams['exclude_columns'],
        lambda column_index: self._can_use_column(inputs_metadata, column_index),
    )

    # Ending up with no columns is fine here;
    # "base_utils.combine_columns" will throw an error if it cannot work with this.
    explicitly_requested = self.hyperparams['use_columns']
    if explicitly_requested and rejected:
        self.logger.warning(
            "Not all specified columns can parsed. Skipping columns: %(columns)s",
            {'columns': rejected},
        )

    return selected
def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
    """
    Return the indices of the columns to operate on.

    The per-column predicate accepts every column, so the outcome depends
    only on how ``base_utils.get_columns_to_use`` applies the ``use_columns``
    / ``exclude_columns`` hyper-parameters. A warning is logged when
    ``use_columns`` is set and some columns were rejected.
    """
    selected, rejected = base_utils.get_columns_to_use(
        inputs_metadata,
        self.hyperparams["use_columns"],
        self.hyperparams["exclude_columns"],
        lambda column_index: True,
    )

    explicitly_requested = self.hyperparams["use_columns"]
    if explicitly_requested and rejected:
        self.logger.warning(
            "Not all specified columns can parsed. Skipping columns: %(columns)s",
            {"columns": rejected},
        )

    return selected