def _produce_column_metadata( self, inputs_metadata: metadata_base.DataMetadata, column_index: int, read_files: typing.Sequence[typing.Any], ) -> metadata_base.DataMetadata:
    """
    Build metadata for one produced column from the metadata of the input
    filename column at ``column_index``.

    Filename-specific metadata is cleared, the configured file structural
    type is set, one of the configured file semantic types is ensured, and
    per-row metadata is copied from every read file that is a container type.
    """
    metadata = inputs_metadata.select_columns([column_index])

    # Overwrite the structural type and clear metadata useful only for filename columns.
    metadata = metadata.update_column(0, {
        'structural_type': self._file_structural_type,
        'location_base_uris': metadata_base.NO_VALUE,
        'media_types': metadata_base.NO_VALUE,
    })

    # The column does not hold filenames anymore.
    metadata = metadata.remove_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/FileName',
    )

    # At least one of the configured file semantic types should be present;
    # when none is, add the first configured one.
    existing_types = metadata.query_column(0).get('semantic_types', [])
    if set(existing_types).isdisjoint(self._file_semantic_types):
        metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0])

    # Copy metadata per row, but only from values which are container types.
    for row, value in enumerate(read_files):
        if isinstance(value, types.Container):
            metadata = value.metadata.copy_to(metadata, (), (row, 0))

    return metadata.compact(['name', 'structural_type', 'media_types', 'location_base_uris', 'semantic_types'])
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    """
    Convert the selected columns of ``inputs`` to numeric values and update
    their structural-type metadata accordingly.

    Columns are selected via ``self._get_columns``. Each selected column is
    passed through ``pd.to_numeric`` and, when its semantic types include
    ``http://schema.org/Integer`` or ``http://schema.org/Float``, its
    ``structural_type`` metadata is set to ``int`` or ``float`` respectively.
    When the ``drop_non_numeric_columns`` hyperparam is set, all remaining
    ``object``-dtype columns are dropped from both data and metadata.

    :param inputs: Input dataframe.
    :param timeout: Unused.
    :param iterations: Unused.
    :return: A ``CallResult`` wrapping the converted dataframe.
    """
    columns_to_use = self._get_columns(inputs.metadata, self.hyperparams)
    _logger.debug(f'converting columns: {columns_to_use}')
    _logger.debug(f'converting columns: {inputs.iloc[:, columns_to_use].columns}')

    output = inputs.copy()
    for col in columns_to_use:
        output.iloc[:, col] = pd.to_numeric(output.iloc[:, col])
        column_metadata = output.metadata.query((metadata_base.ALL_ELEMENTS, col))
        # Bug fix: default to an empty tuple instead of None so the membership
        # tests below do not raise TypeError when a column has no semantic types.
        semantic_types = column_metadata.get('semantic_types', ())
        if 'http://schema.org/Integer' in semantic_types:
            output.metadata = output.metadata.update((metadata_base.ALL_ELEMENTS, col), {'structural_type': int})
        elif 'http://schema.org/Float' in semantic_types:
            output.metadata = output.metadata.update((metadata_base.ALL_ELEMENTS, col), {'structural_type': float})

        # What to do with missing values?
        # has_missing_value = pd.isnull(output.iloc[:, col]).sum() > 0

    if self.hyperparams['drop_non_numeric_columns']:
        # Hoist the repeated np.where over the two debug calls.
        non_numeric_indices = list(np.where(output.dtypes == object)[0])
        _logger.debug(f'dropping columns: {non_numeric_indices}')
        _logger.debug(f'dropping columns: {output.iloc[:, non_numeric_indices].columns}')
        # np.where returns int64 instead of int, D3M metadata checks for int
        numeric_column_indices = [int(x) for x in np.where(output.dtypes != object)[0]]
        output = output.iloc[:, numeric_column_indices]
        output.metadata = DataMetadata.select_columns(output.metadata, numeric_column_indices)

    return base.CallResult(output)
def combine_columns_metadata(
    inputs: metadata_base.DataMetadata, column_indices: typing.Sequence[int], columns_list: typing.Sequence[metadata_base.DataMetadata], *,
    return_result: str, add_index_columns: bool,
) -> metadata_base.DataMetadata:
    """
    Analogous to ``combine_columns`` but operates only on metadata.

    :param inputs: Metadata of the original input columns.
    :param column_indices: Indices (into ``inputs``) of columns to replace when
        ``return_result`` is ``replace``.
    :param columns_list: Metadata of the produced columns to combine in.
    :param return_result: One of ``append``, ``replace``, or ``new``.
    :param add_index_columns: When ``return_result`` is ``new``, whether to prepend
        the input's index columns if the output has none.
    :raises ValueError: When ``return_result`` is ``new`` and no columns were produced.
    :raises exceptions.InvalidArgumentValueError: When ``return_result`` is not a known value.
    """

    if return_result == 'append':
        # Append all produced columns after the input columns.
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            # Nothing to replace; behave like "append".
            return combine_columns_metadata( inputs, column_indices, columns_list, return_result='append', add_index_columns=add_index_columns)

        outputs = inputs
        # Running count of how many produced columns have been consumed so far,
        # which also serves as the offset into "column_indices".
        columns_replaced = 0
        for columns in columns_list:
            columns_length = columns.query_field( (metadata_base.ALL_ELEMENTS, ), 'dimension')['length']
            if columns_replaced < len(column_indices):
                # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                # listed in the slice will be replaced and others appended after the last replaced column.
                outputs = outputs.replace_columns( columns, column_indices[columns_replaced:columns_replaced + columns_length])
            else:
                # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                # is non-empty and that the last item of "column_indices" points to the last column we replaced
                # for those listed in "column_indices". We replaced more columns though, so we have to add the
                # difference, and then add 1 to insert after the last column.
                outputs = outputs.insert_columns( columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1)
            columns_replaced += columns_length

        # Fewer columns were produced than were slated for replacement:
        # remove the leftover (never-replaced) input columns.
        if columns_replaced < len(column_indices):
            outputs = outputs.remove_columns( column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        # "new" requires at least one produced column overall.
        if not any( columns_metadata.query_field( (metadata_base.ALL_ELEMENTS, ), 'dimension')['length'] for columns_metadata in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.get_index_columns()
            outputs_index_columns = outputs.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns( inputs_index_columns).append_columns( outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError( "\"return_result\" has an invalid value: {return_result}".format( return_result=return_result))

    return outputs